diff options
-rw-r--r-- | arch/ia64/include/asm/cputime.h | 2 | ||||
-rw-r--r-- | arch/ia64/kernel/time.c | 22 | ||||
-rw-r--r-- | arch/powerpc/include/asm/cputime.h | 2 | ||||
-rw-r--r-- | arch/powerpc/kernel/time.c | 20 | ||||
-rw-r--r-- | arch/s390/include/asm/cputime.h | 1 | ||||
-rw-r--r-- | arch/s390/kernel/vtime.c | 13 | ||||
-rw-r--r-- | arch/s390/kvm/kvm-s390.c | 4 | ||||
-rw-r--r-- | fs/proc/array.c | 4 | ||||
-rw-r--r-- | include/linux/hardirq.h | 15 | ||||
-rw-r--r-- | include/linux/kernel_stat.h | 17 | ||||
-rw-r--r-- | include/linux/kvm_host.h | 12 | ||||
-rw-r--r-- | include/linux/sched.h | 49 | ||||
-rw-r--r-- | include/linux/vtime.h | 48 | ||||
-rw-r--r-- | kernel/exit.c | 4 | ||||
-rw-r--r-- | kernel/fork.c | 2 | ||||
-rw-r--r-- | kernel/posix-cpu-timers.c | 24 | ||||
-rw-r--r-- | kernel/sched/auto_group.c | 4 | ||||
-rw-r--r-- | kernel/sched/auto_group.h | 5 | ||||
-rw-r--r-- | kernel/sched/core.c | 11 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 131 | ||||
-rw-r--r-- | kernel/sched/debug.c | 36 | ||||
-rw-r--r-- | kernel/sched/fair.c | 914 | ||||
-rw-r--r-- | kernel/sched/features.h | 5 | ||||
-rw-r--r-- | kernel/sched/sched.h | 60 | ||||
-rw-r--r-- | kernel/softirq.c | 6 | ||||
-rw-r--r-- | kernel/sys.c | 6 |
26 files changed, 1082 insertions, 335 deletions
diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h index 3deac956d32..7fcf7f08ab0 100644 --- a/arch/ia64/include/asm/cputime.h +++ b/arch/ia64/include/asm/cputime.h @@ -103,5 +103,7 @@ static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val) #define cputime64_to_clock_t(__ct) \ cputime_to_clock_t((__force cputime_t)__ct) +extern void arch_vtime_task_switch(struct task_struct *tsk); + #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ #endif /* __IA64_CPUTIME_H */ diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index f6388216080..b1995efbfd2 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -83,7 +83,7 @@ static struct clocksource *itc_clocksource; extern cputime_t cycle_to_cputime(u64 cyc); -static void vtime_account_user(struct task_struct *tsk) +void vtime_account_user(struct task_struct *tsk) { cputime_t delta_utime; struct thread_info *ti = task_thread_info(tsk); @@ -100,18 +100,11 @@ static void vtime_account_user(struct task_struct *tsk) * accumulated times to the current process, and to prepare accounting on * the next process. */ -void vtime_task_switch(struct task_struct *prev) +void arch_vtime_task_switch(struct task_struct *prev) { struct thread_info *pi = task_thread_info(prev); struct thread_info *ni = task_thread_info(current); - if (idle_task(smp_processor_id()) != prev) - vtime_account_system(prev); - else - vtime_account_idle(prev); - - vtime_account_user(prev); - pi->ac_stamp = ni->ac_stamp; ni->ac_stime = ni->ac_utime = 0; } @@ -126,6 +119,8 @@ static cputime_t vtime_delta(struct task_struct *tsk) cputime_t delta_stime; __u64 now; + WARN_ON_ONCE(!irqs_disabled()); + now = ia64_get_itc(); delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp)); @@ -147,15 +142,6 @@ void vtime_account_idle(struct task_struct *tsk) account_idle_time(vtime_delta(tsk)); } -/* - * Called from the timer interrupt handler to charge accumulated user time - * to the current process. 
Must be called with interrupts disabled. - */ -void account_process_tick(struct task_struct *p, int user_tick) -{ - vtime_account_user(p); -} - #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ static irqreturn_t diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 487d46ff68a..483733bd06d 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -228,6 +228,8 @@ static inline cputime_t clock_t_to_cputime(const unsigned long clk) #define cputime64_to_clock_t(ct) cputime_to_clock_t((cputime_t)(ct)) +static inline void arch_vtime_task_switch(struct task_struct *tsk) { } + #endif /* __KERNEL__ */ #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ #endif /* __POWERPC_CPUTIME_H */ diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index ce4cb772dc7..b3b14352b05 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -297,6 +297,8 @@ static u64 vtime_delta(struct task_struct *tsk, u64 now, nowscaled, deltascaled; u64 udelta, delta, user_scaled; + WARN_ON_ONCE(!irqs_disabled()); + now = mftb(); nowscaled = read_spurr(now); get_paca()->system_time += now - get_paca()->starttime; @@ -355,15 +357,15 @@ void vtime_account_idle(struct task_struct *tsk) } /* - * Transfer the user and system times accumulated in the paca - * by the exception entry and exit code to the generic process - * user and system time records. + * Transfer the user time accumulated in the paca + * by the exception entry and exit code to the generic + * process user time records. * Must be called with interrupts disabled. - * Assumes that vtime_account() has been called recently - * (i.e. since the last entry from usermode) so that + * Assumes that vtime_account_system/idle() has been called + * recently (i.e. since the last entry from usermode) so that * get_paca()->user_time_scaled is up to date. 
*/ -void account_process_tick(struct task_struct *tsk, int user_tick) +void vtime_account_user(struct task_struct *tsk) { cputime_t utime, utimescaled; @@ -375,12 +377,6 @@ void account_process_tick(struct task_struct *tsk, int user_tick) account_user_time(tsk, utime, utimescaled); } -void vtime_task_switch(struct task_struct *prev) -{ - vtime_account(prev); - account_process_tick(prev, 0); -} - #else /* ! CONFIG_VIRT_CPU_ACCOUNTING */ #define calc_cputime_factors() #endif diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index 023d5ae2448..d2ff41370c0 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -14,6 +14,7 @@ #define __ARCH_HAS_VTIME_ACCOUNT +#define __ARCH_HAS_VTIME_TASK_SWITCH /* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */ diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 79033442789..e84b8b68444 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -112,7 +112,12 @@ void vtime_task_switch(struct task_struct *prev) S390_lowcore.system_timer = ti->system_timer; } -void account_process_tick(struct task_struct *tsk, int user_tick) +/* + * In s390, accounting pending user time also implies + * accounting system time in order to correctly compute + * the stolen time accounting. 
+ */ +void vtime_account_user(struct task_struct *tsk) { if (do_account_vtime(tsk, HARDIRQ_OFFSET)) virt_timer_expire(); @@ -127,6 +132,8 @@ void vtime_account(struct task_struct *tsk) struct thread_info *ti = task_thread_info(tsk); u64 timer, system; + WARN_ON_ONCE(!irqs_disabled()); + timer = S390_lowcore.last_update_timer; S390_lowcore.last_update_timer = get_vtimer(); S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer; @@ -140,6 +147,10 @@ void vtime_account(struct task_struct *tsk) } EXPORT_SYMBOL_GPL(vtime_account); +void vtime_account_system(struct task_struct *tsk) +__attribute__((alias("vtime_account"))); +EXPORT_SYMBOL_GPL(vtime_account_system); + void __kprobes vtime_stop_cpu(void) { struct s390_idle_data *idle = &__get_cpu_var(s390_idle); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ecced9d1898..d91a9556800 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -608,9 +608,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) kvm_s390_deliver_pending_interrupts(vcpu); vcpu->arch.sie_block->icptcode = 0; - local_irq_disable(); kvm_guest_enter(); - local_irq_enable(); VCPU_EVENT(vcpu, 6, "entering sie flags %x", atomic_read(&vcpu->arch.sie_block->cpuflags)); trace_kvm_s390_sie_enter(vcpu, @@ -629,9 +627,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", vcpu->arch.sie_block->icptcode); trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode); - local_irq_disable(); kvm_guest_exit(); - local_irq_enable(); memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16); return rc; diff --git a/fs/proc/array.c b/fs/proc/array.c index c1c207c36ca..d3696708fc1 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -438,7 +438,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt += sig->min_flt; maj_flt += sig->maj_flt; - thread_group_times(task, &utime, &stime); + thread_group_cputime_adjusted(task, &utime, &stime); gtime += 
sig->gtime; } @@ -454,7 +454,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, if (!whole) { min_flt = task->min_flt; maj_flt = task->maj_flt; - task_times(task, &utime, &stime); + task_cputime_adjusted(task, &utime, &stime); gtime = task->gtime; } diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index cab3da3d094..624ef3f45c8 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -4,6 +4,7 @@ #include <linux/preempt.h> #include <linux/lockdep.h> #include <linux/ftrace_irq.h> +#include <linux/vtime.h> #include <asm/hardirq.h> /* @@ -129,16 +130,6 @@ extern void synchronize_irq(unsigned int irq); # define synchronize_irq(irq) barrier() #endif -struct task_struct; - -#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING) -static inline void vtime_account(struct task_struct *tsk) -{ -} -#else -extern void vtime_account(struct task_struct *tsk); -#endif - #if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) static inline void rcu_nmi_enter(void) @@ -162,7 +153,7 @@ extern void rcu_nmi_exit(void); */ #define __irq_enter() \ do { \ - vtime_account(current); \ + vtime_account_irq_enter(current); \ add_preempt_count(HARDIRQ_OFFSET); \ trace_hardirq_enter(); \ } while (0) @@ -178,7 +169,7 @@ extern void irq_enter(void); #define __irq_exit() \ do { \ trace_hardirq_exit(); \ - vtime_account(current); \ + vtime_account_irq_exit(current); \ sub_preempt_count(HARDIRQ_OFFSET); \ } while (0) diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 36d12f0884c..66b70780e91 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -7,6 +7,7 @@ #include <linux/cpumask.h> #include <linux/interrupt.h> #include <linux/sched.h> +#include <linux/vtime.h> #include <asm/irq.h> #include <asm/cputime.h> @@ -126,16 +127,16 @@ extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t) extern void account_steal_time(cputime_t); extern void 
account_idle_time(cputime_t); -extern void account_process_tick(struct task_struct *, int user); -extern void account_steal_ticks(unsigned long ticks); -extern void account_idle_ticks(unsigned long ticks); - #ifdef CONFIG_VIRT_CPU_ACCOUNTING -extern void vtime_task_switch(struct task_struct *prev); -extern void vtime_account_system(struct task_struct *tsk); -extern void vtime_account_idle(struct task_struct *tsk); +static inline void account_process_tick(struct task_struct *tsk, int user) +{ + vtime_account_user(tsk); +} #else -static inline void vtime_task_switch(struct task_struct *prev) { } +extern void account_process_tick(struct task_struct *, int user); #endif +extern void account_steal_ticks(unsigned long ticks); +extern void account_idle_ticks(unsigned long ticks); + #endif /* _LINUX_KERNEL_STAT_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ecc554374e4..d5cddd8dcc5 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -726,7 +726,11 @@ static inline int kvm_deassign_device(struct kvm *kvm, static inline void kvm_guest_enter(void) { BUG_ON(preemptible()); - vtime_account(current); + /* + * This is running in ioctl context so we can avoid + * the call to vtime_account() with its unnecessary idle check. + */ + vtime_account_system_irqsafe(current); current->flags |= PF_VCPU; /* KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode @@ -740,7 +744,11 @@ static inline void kvm_guest_enter(void) static inline void kvm_guest_exit(void) { - vtime_account(current); + /* + * This is running in ioctl context so we can avoid + * the call to vtime_account() with its unnecessary idle check. 
+ */ + vtime_account_system_irqsafe(current); current->flags &= ~PF_VCPU; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 29116b853ec..b96ff1e43ad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -436,13 +436,28 @@ struct cpu_itimer { }; /** + * struct cputime - snapshot of system and user cputime + * @utime: time spent in user mode + * @stime: time spent in system mode + * + * Gathers a generic snapshot of user and system time. + */ +struct cputime { + cputime_t utime; + cputime_t stime; +}; + +/** * struct task_cputime - collected CPU time counts * @utime: time spent in user mode, in &cputime_t units * @stime: time spent in kernel mode, in &cputime_t units * @sum_exec_runtime: total time spent on the CPU, in nanoseconds * - * This structure groups together three kinds of CPU time that are - * tracked for threads and thread groups. Most things considering + * This is an extension of struct cputime that includes the total runtime + * spent by the task from the scheduler point of view. + * + * As a result, this structure groups together three kinds of CPU time + * that are tracked for threads and thread groups. Most things considering * CPU time want to group these counts together and treat all three * of them in parallel. 
*/ @@ -583,7 +598,7 @@ struct signal_struct { cputime_t gtime; cputime_t cgtime; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - cputime_t prev_utime, prev_stime; + struct cputime prev_cputime; #endif unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; @@ -1064,6 +1079,7 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); + void (*migrate_task_rq)(struct task_struct *p, int next_cpu); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); @@ -1098,6 +1114,18 @@ struct load_weight { unsigned long weight, inv_weight; }; +struct sched_avg { + /* + * These sums represent an infinite geometric series and so are bound + * above by 1024/(1-y). Thus we only need a u32 to store them for all + * choices of y < 1-2^(-32)*1024. + */ + u32 runnable_avg_sum, runnable_avg_period; + u64 last_runnable_update; + s64 decay_count; + unsigned long load_avg_contrib; +}; + #ifdef CONFIG_SCHEDSTATS struct sched_statistics { u64 wait_start; @@ -1158,6 +1186,15 @@ struct sched_entity { /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; #endif +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). 
+ */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) + /* Per-entity load-tracking */ + struct sched_avg avg; +#endif }; struct sched_rt_entity { @@ -1321,7 +1358,7 @@ struct task_struct { cputime_t utime, stime, utimescaled, stimescaled; cputime_t gtime; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - cputime_t prev_utime, prev_stime; + struct cputime prev_cputime; #endif unsigned long nvcsw, nivcsw; /* context switch counts */ struct timespec start_time; /* monotonic time */ @@ -1732,8 +1769,8 @@ static inline void put_task_struct(struct task_struct *t) __put_task_struct(t); } -extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); -extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st); +extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); +extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); /* * Per process flags diff --git a/include/linux/vtime.h b/include/linux/vtime.h new file mode 100644 index 00000000000..ae30ab58431 --- /dev/null +++ b/include/linux/vtime.h @@ -0,0 +1,48 @@ +#ifndef _LINUX_KERNEL_VTIME_H +#define _LINUX_KERNEL_VTIME_H + +struct task_struct; + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +extern void vtime_task_switch(struct task_struct *prev); +extern void vtime_account_system(struct task_struct *tsk); +extern void vtime_account_system_irqsafe(struct task_struct *tsk); +extern void vtime_account_idle(struct task_struct *tsk); +extern void vtime_account_user(struct task_struct *tsk); +extern void vtime_account(struct task_struct *tsk); +#else +static inline void vtime_task_switch(struct task_struct *prev) { } +static inline void vtime_account_system(struct task_struct *tsk) { } +static inline void vtime_account_system_irqsafe(struct task_struct *tsk) { } +static inline void vtime_account(struct task_struct *tsk) { } +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +extern void irqtime_account_irq(struct task_struct *tsk); 
+#else +static inline void irqtime_account_irq(struct task_struct *tsk) { } +#endif + +static inline void vtime_account_irq_enter(struct task_struct *tsk) +{ + /* + * Hardirq can interrupt idle task anytime. So we need vtime_account() + * that performs the idle check in CONFIG_VIRT_CPU_ACCOUNTING. + * Softirq can also interrupt idle task directly if it calls + * local_bh_enable(). Such a case probably doesn't exist but we never know. + * Ksoftirqd is not concerned because idle time is flushed on context + * switch. Softirqs at the end of hardirqs are also not a problem because + * the idle time is flushed on hardirq time already. + */ + vtime_account(tsk); + irqtime_account_irq(tsk); +} + +static inline void vtime_account_irq_exit(struct task_struct *tsk) +{ + /* On hard|softirq exit we always account to hard|softirq cputime */ + vtime_account_system(tsk); + irqtime_account_irq(tsk); +} + +#endif /* _LINUX_KERNEL_VTIME_H */ diff --git a/kernel/exit.c b/kernel/exit.c index 346616c0092..618f7ee5600 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1186,11 +1186,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * as other threads in the parent group can be right * here reaping other children at the same time. * - * We use thread_group_times() to get times for the thread + * We use thread_group_cputime_adjusted() to get times for the thread * group, which consolidates times for all threads in the * group including the group leader. 
*/ - thread_group_times(p, &tgutime, &tgstime); + thread_group_cputime_adjusted(p, &tgutime, &tgstime); spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; diff --git a/kernel/fork.c b/kernel/fork.c index c497e57aa65..850dde1e0c8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1224,7 +1224,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - p->prev_utime = p->prev_stime = 0; + p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 125cb67daa2..d73840271dc 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -217,30 +217,6 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, return 0; } -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) -{ - struct signal_struct *sig = tsk->signal; - struct task_struct *t; - - times->utime = sig->utime; - times->stime = sig->stime; - times->sum_exec_runtime = sig->sum_sched_runtime; - - rcu_read_lock(); - /* make sure we can trust tsk->thread_group list */ - if (!likely(pid_alive(tsk))) - goto out; - - t = tsk; - do { - times->utime += t->utime; - times->stime += t->stime; - times->sum_exec_runtime += task_sched_runtime(t); - } while_each_thread(tsk, t); -out: - rcu_read_unlock(); -} - static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) { if (b->utime > a->utime) diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 15f60d01198..0984a21076a 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -143,11 +143,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); + if 
(!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) + goto out; + t = p; do { sched_move_task(t); } while_each_thread(p, t); +out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h index 443232ebbb5..8bd04714281 100644 --- a/kernel/sched/auto_group.h +++ b/kernel/sched/auto_group.h @@ -4,6 +4,11 @@ #include <linux/rwsem.h> struct autogroup { + /* + * reference doesn't mean how many thread attach to this + * autogroup now. It just stands for the number of task + * could use this autogroup. + */ struct kref kref; struct task_group *tg; struct rw_semaphore lock; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 80f80dfca70..f5066a61f97 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -953,6 +953,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { + if (p->sched_class->migrate_task_rq) + p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); } @@ -1525,6 +1527,15 @@ static void __sched_fork(struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) + p->se.avg.runnable_avg_period = 0; + p->se.avg.runnable_avg_sum = 0; +#endif #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 81b763ba58a..293b202fcf7 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -43,7 +43,7 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq); * Called before incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. 
*/ -void vtime_account(struct task_struct *curr) +void irqtime_account_irq(struct task_struct *curr) { unsigned long flags; s64 delta; @@ -73,7 +73,7 @@ void vtime_account(struct task_struct *curr) irq_time_write_end(); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(vtime_account); +EXPORT_SYMBOL_GPL(irqtime_account_irq); static int irqtime_account_hi_update(void) { @@ -288,6 +288,34 @@ static __always_inline bool steal_account_process_tick(void) return false; } +/* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. + */ +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +{ + struct signal_struct *sig = tsk->signal; + struct task_struct *t; + + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + rcu_read_lock(); + /* make sure we can trust tsk->thread_group list */ + if (!likely(pid_alive(tsk))) + goto out; + + t = tsk; + do { + times->utime += t->utime; + times->stime += t->stime; + times->sum_exec_runtime += task_sched_runtime(t); + } while_each_thread(tsk, t); +out: + rcu_read_unlock(); +} + #ifndef CONFIG_VIRT_CPU_ACCOUNTING #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -417,13 +445,13 @@ void account_idle_ticks(unsigned long ticks) * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { *ut = p->utime; *st = p->stime; } -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime; @@ -433,6 +461,29 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) *st = cputime.stime; } +void vtime_account_system_irqsafe(struct task_struct *tsk) +{ + unsigned long flags; + + 
local_irq_save(flags); + vtime_account_system(tsk); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); + +#ifndef __ARCH_HAS_VTIME_TASK_SWITCH +void vtime_task_switch(struct task_struct *prev) +{ + if (is_idle_task(prev)) + vtime_account_idle(prev); + else + vtime_account_system(prev); + + vtime_account_user(prev); + arch_vtime_task_switch(prev); +} +#endif + /* * Archs that account the whole time spent in the idle task * (outside irq) as idle time can rely on this and just implement @@ -444,16 +495,10 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) #ifndef __ARCH_HAS_VTIME_ACCOUNT void vtime_account(struct task_struct *tsk) { - unsigned long flags; - - local_irq_save(flags); - if (in_interrupt() || !is_idle_task(tsk)) vtime_account_system(tsk); else vtime_account_idle(tsk); - - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(vtime_account); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ @@ -478,14 +523,30 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) return (__force cputime_t) temp; } -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +/* + * Adjust tick based cputime random precision against scheduler + * runtime accounting. + */ +static void cputime_adjust(struct task_cputime *curr, + struct cputime *prev, + cputime_t *ut, cputime_t *st) { - cputime_t rtime, utime = p->utime, total = utime + p->stime; + cputime_t rtime, utime, total; + + utime = curr->utime; + total = utime + curr->stime; /* - * Use CFS's precise accounting: + * Tick based cputime accounting depend on random scheduling + * timeslices of a task to be interrupted or not by the timer. + * Depending on these circumstances, the number of these interrupts + * may be over or under-optimistic, matching the real user and system + * cputime with a variable precision. + * + * Fix this by scaling these tick based values against the total + * runtime accounted by the CFS scheduler. 
*/ - rtime = nsecs_to_cputime(p->se.sum_exec_runtime); + rtime = nsecs_to_cputime(curr->sum_exec_runtime); if (total) utime = scale_utime(utime, rtime, total); @@ -493,38 +554,36 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) utime = rtime; /* - * Compare with previous values, to keep monotonicity: + * If the tick based count grows faster than the scheduler one, + * the result of the scaling may go backward. + * Let's enforce monotonicity. */ - p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); + prev->utime = max(prev->utime, utime); + prev->stime = max(prev->stime, rtime - prev->utime); - *ut = p->prev_utime; - *st = p->prev_stime; + *ut = prev->utime; + *st = prev->stime; +} + +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime = { + .utime = p->utime, + .stime = p->stime, + .sum_exec_runtime = p->se.sum_exec_runtime, + }; + + cputime_adjust(&cputime, &p->prev_cputime, ut, st); } /* * Must be called with siglock held. 
*/ -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { - struct signal_struct *sig = p->signal; struct task_cputime cputime; - cputime_t rtime, utime, total; thread_group_cputime(p, &cputime); - - total = cputime.utime + cputime.stime; - rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - - if (total) - utime = scale_utime(cputime.utime, rtime, total); - else - utime = rtime; - - sig->prev_utime = max(sig->prev_utime, utime); - sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); - - *ut = sig->prev_utime; - *st = sig->prev_stime; + cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 6f79596e0ea..2cd3c1b4e58 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec) static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { |