aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/ia64/include/asm/cputime.h2
-rw-r--r--arch/ia64/kernel/time.c22
-rw-r--r--arch/powerpc/include/asm/cputime.h2
-rw-r--r--arch/powerpc/kernel/time.c20
-rw-r--r--arch/s390/include/asm/cputime.h1
-rw-r--r--arch/s390/kernel/vtime.c13
-rw-r--r--arch/s390/kvm/kvm-s390.c4
-rw-r--r--fs/proc/array.c4
-rw-r--r--include/linux/hardirq.h15
-rw-r--r--include/linux/kernel_stat.h17
-rw-r--r--include/linux/kvm_host.h12
-rw-r--r--include/linux/sched.h49
-rw-r--r--include/linux/vtime.h48
-rw-r--r--kernel/exit.c4
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/posix-cpu-timers.c24
-rw-r--r--kernel/sched/auto_group.c4
-rw-r--r--kernel/sched/auto_group.h5
-rw-r--r--kernel/sched/core.c11
-rw-r--r--kernel/sched/cputime.c131
-rw-r--r--kernel/sched/debug.c36
-rw-r--r--kernel/sched/fair.c914
-rw-r--r--kernel/sched/features.h5
-rw-r--r--kernel/sched/sched.h60
-rw-r--r--kernel/softirq.c6
-rw-r--r--kernel/sys.c6
26 files changed, 1082 insertions, 335 deletions
diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h
index 3deac956d32..7fcf7f08ab0 100644
--- a/arch/ia64/include/asm/cputime.h
+++ b/arch/ia64/include/asm/cputime.h
@@ -103,5 +103,7 @@ static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val)
#define cputime64_to_clock_t(__ct) \
cputime_to_clock_t((__force cputime_t)__ct)
+extern void arch_vtime_task_switch(struct task_struct *tsk);
+
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
#endif /* __IA64_CPUTIME_H */
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index f6388216080..b1995efbfd2 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -83,7 +83,7 @@ static struct clocksource *itc_clocksource;
extern cputime_t cycle_to_cputime(u64 cyc);
-static void vtime_account_user(struct task_struct *tsk)
+void vtime_account_user(struct task_struct *tsk)
{
cputime_t delta_utime;
struct thread_info *ti = task_thread_info(tsk);
@@ -100,18 +100,11 @@ static void vtime_account_user(struct task_struct *tsk)
* accumulated times to the current process, and to prepare accounting on
* the next process.
*/
-void vtime_task_switch(struct task_struct *prev)
+void arch_vtime_task_switch(struct task_struct *prev)
{
struct thread_info *pi = task_thread_info(prev);
struct thread_info *ni = task_thread_info(current);
- if (idle_task(smp_processor_id()) != prev)
- vtime_account_system(prev);
- else
- vtime_account_idle(prev);
-
- vtime_account_user(prev);
-
pi->ac_stamp = ni->ac_stamp;
ni->ac_stime = ni->ac_utime = 0;
}
@@ -126,6 +119,8 @@ static cputime_t vtime_delta(struct task_struct *tsk)
cputime_t delta_stime;
__u64 now;
+ WARN_ON_ONCE(!irqs_disabled());
+
now = ia64_get_itc();
delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp));
@@ -147,15 +142,6 @@ void vtime_account_idle(struct task_struct *tsk)
account_idle_time(vtime_delta(tsk));
}
-/*
- * Called from the timer interrupt handler to charge accumulated user time
- * to the current process. Must be called with interrupts disabled.
- */
-void account_process_tick(struct task_struct *p, int user_tick)
-{
- vtime_account_user(p);
-}
-
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
static irqreturn_t
diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index 487d46ff68a..483733bd06d 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -228,6 +228,8 @@ static inline cputime_t clock_t_to_cputime(const unsigned long clk)
#define cputime64_to_clock_t(ct) cputime_to_clock_t((cputime_t)(ct))
+static inline void arch_vtime_task_switch(struct task_struct *tsk) { }
+
#endif /* __KERNEL__ */
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
#endif /* __POWERPC_CPUTIME_H */
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index ce4cb772dc7..b3b14352b05 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -297,6 +297,8 @@ static u64 vtime_delta(struct task_struct *tsk,
u64 now, nowscaled, deltascaled;
u64 udelta, delta, user_scaled;
+ WARN_ON_ONCE(!irqs_disabled());
+
now = mftb();
nowscaled = read_spurr(now);
get_paca()->system_time += now - get_paca()->starttime;
@@ -355,15 +357,15 @@ void vtime_account_idle(struct task_struct *tsk)
}
/*
- * Transfer the user and system times accumulated in the paca
- * by the exception entry and exit code to the generic process
- * user and system time records.
+ * Transfer the user time accumulated in the paca
+ * by the exception entry and exit code to the generic
+ * process user time records.
* Must be called with interrupts disabled.
- * Assumes that vtime_account() has been called recently
- * (i.e. since the last entry from usermode) so that
+ * Assumes that vtime_account_system/idle() has been called
+ * recently (i.e. since the last entry from usermode) so that
* get_paca()->user_time_scaled is up to date.
*/
-void account_process_tick(struct task_struct *tsk, int user_tick)
+void vtime_account_user(struct task_struct *tsk)
{
cputime_t utime, utimescaled;
@@ -375,12 +377,6 @@ void account_process_tick(struct task_struct *tsk, int user_tick)
account_user_time(tsk, utime, utimescaled);
}
-void vtime_task_switch(struct task_struct *prev)
-{
- vtime_account(prev);
- account_process_tick(prev, 0);
-}
-
#else /* ! CONFIG_VIRT_CPU_ACCOUNTING */
#define calc_cputime_factors()
#endif
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index 023d5ae2448..d2ff41370c0 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -14,6 +14,7 @@
#define __ARCH_HAS_VTIME_ACCOUNT
+#define __ARCH_HAS_VTIME_TASK_SWITCH
/* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 79033442789..e84b8b68444 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -112,7 +112,12 @@ void vtime_task_switch(struct task_struct *prev)
S390_lowcore.system_timer = ti->system_timer;
}
-void account_process_tick(struct task_struct *tsk, int user_tick)
+/*
+ * In s390, accounting pending user time also implies
+ * accounting system time in order to correctly compute
+ * the stolen time accounting.
+ */
+void vtime_account_user(struct task_struct *tsk)
{
if (do_account_vtime(tsk, HARDIRQ_OFFSET))
virt_timer_expire();
@@ -127,6 +132,8 @@ void vtime_account(struct task_struct *tsk)
struct thread_info *ti = task_thread_info(tsk);
u64 timer, system;
+ WARN_ON_ONCE(!irqs_disabled());
+
timer = S390_lowcore.last_update_timer;
S390_lowcore.last_update_timer = get_vtimer();
S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer;
@@ -140,6 +147,10 @@ void vtime_account(struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(vtime_account);
+void vtime_account_system(struct task_struct *tsk)
+__attribute__((alias("vtime_account")));
+EXPORT_SYMBOL_GPL(vtime_account_system);
+
void __kprobes vtime_stop_cpu(void)
{
struct s390_idle_data *idle = &__get_cpu_var(s390_idle);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ecced9d1898..d91a9556800 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -608,9 +608,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
kvm_s390_deliver_pending_interrupts(vcpu);
vcpu->arch.sie_block->icptcode = 0;
- local_irq_disable();
kvm_guest_enter();
- local_irq_enable();
VCPU_EVENT(vcpu, 6, "entering sie flags %x",
atomic_read(&vcpu->arch.sie_block->cpuflags));
trace_kvm_s390_sie_enter(vcpu,
@@ -629,9 +627,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
vcpu->arch.sie_block->icptcode);
trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode);
- local_irq_disable();
kvm_guest_exit();
- local_irq_enable();
memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16);
return rc;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index c1c207c36ca..d3696708fc1 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -438,7 +438,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
- thread_group_times(task, &utime, &stime);
+ thread_group_cputime_adjusted(task, &utime, &stime);
gtime += sig->gtime;
}
@@ -454,7 +454,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
if (!whole) {
min_flt = task->min_flt;
maj_flt = task->maj_flt;
- task_times(task, &utime, &stime);
+ task_cputime_adjusted(task, &utime, &stime);
gtime = task->gtime;
}
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index cab3da3d094..624ef3f45c8 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -4,6 +4,7 @@
#include <linux/preempt.h>
#include <linux/lockdep.h>
#include <linux/ftrace_irq.h>
+#include <linux/vtime.h>
#include <asm/hardirq.h>
/*
@@ -129,16 +130,6 @@ extern void synchronize_irq(unsigned int irq);
# define synchronize_irq(irq) barrier()
#endif
-struct task_struct;
-
-#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING)
-static inline void vtime_account(struct task_struct *tsk)
-{
-}
-#else
-extern void vtime_account(struct task_struct *tsk);
-#endif
-
#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
static inline void rcu_nmi_enter(void)
@@ -162,7 +153,7 @@ extern void rcu_nmi_exit(void);
*/
#define __irq_enter() \
do { \
- vtime_account(current); \
+ vtime_account_irq_enter(current); \
add_preempt_count(HARDIRQ_OFFSET); \
trace_hardirq_enter(); \
} while (0)
@@ -178,7 +169,7 @@ extern void irq_enter(void);
#define __irq_exit() \
do { \
trace_hardirq_exit(); \
- vtime_account(current); \
+ vtime_account_irq_exit(current); \
sub_preempt_count(HARDIRQ_OFFSET); \
} while (0)
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 36d12f0884c..66b70780e91 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -7,6 +7,7 @@
#include <linux/cpumask.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
+#include <linux/vtime.h>
#include <asm/irq.h>
#include <asm/cputime.h>
@@ -126,16 +127,16 @@ extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t)
extern void account_steal_time(cputime_t);
extern void account_idle_time(cputime_t);
-extern void account_process_tick(struct task_struct *, int user);
-extern void account_steal_ticks(unsigned long ticks);
-extern void account_idle_ticks(unsigned long ticks);
-
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-extern void vtime_task_switch(struct task_struct *prev);
-extern void vtime_account_system(struct task_struct *tsk);
-extern void vtime_account_idle(struct task_struct *tsk);
+static inline void account_process_tick(struct task_struct *tsk, int user)
+{
+ vtime_account_user(tsk);
+}
#else
-static inline void vtime_task_switch(struct task_struct *prev) { }
+extern void account_process_tick(struct task_struct *, int user);
#endif
+extern void account_steal_ticks(unsigned long ticks);
+extern void account_idle_ticks(unsigned long ticks);
+
#endif /* _LINUX_KERNEL_STAT_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ecc554374e4..d5cddd8dcc5 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -726,7 +726,11 @@ static inline int kvm_deassign_device(struct kvm *kvm,
static inline void kvm_guest_enter(void)
{
BUG_ON(preemptible());
- vtime_account(current);
+ /*
+ * This is running in ioctl context so we can avoid
+ * the call to vtime_account() with its unnecessary idle check.
+ */
+ vtime_account_system_irqsafe(current);
current->flags |= PF_VCPU;
/* KVM does not hold any references to rcu protected data when it
* switches CPU into a guest mode. In fact switching to a guest mode
@@ -740,7 +744,11 @@ static inline void kvm_guest_enter(void)
static inline void kvm_guest_exit(void)
{
- vtime_account(current);
+ /*
+ * This is running in ioctl context so we can avoid
+ * the call to vtime_account() with its unnecessary idle check.
+ */
+ vtime_account_system_irqsafe(current);
current->flags &= ~PF_VCPU;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 29116b853ec..b96ff1e43ad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -436,13 +436,28 @@ struct cpu_itimer {
};
/**
+ * struct cputime - snaphsot of system and user cputime
+ * @utime: time spent in user mode
+ * @stime: time spent in system mode
+ *
+ * Gathers a generic snapshot of user and system time.
+ */
+struct cputime {
+ cputime_t utime;
+ cputime_t stime;
+};
+
+/**
* struct task_cputime - collected CPU time counts
* @utime: time spent in user mode, in &cputime_t units
* @stime: time spent in kernel mode, in &cputime_t units
* @sum_exec_runtime: total time spent on the CPU, in nanoseconds
*
- * This structure groups together three kinds of CPU time that are
- * tracked for threads and thread groups. Most things considering
+ * This is an extension of struct cputime that includes the total runtime
+ * spent by the task from the scheduler point of view.
+ *
+ * As a result, this structure groups together three kinds of CPU time
+ * that are tracked for threads and thread groups. Most things considering
* CPU time want to group these counts together and treat all three
* of them in parallel.
*/
@@ -583,7 +598,7 @@ struct signal_struct {
cputime_t gtime;
cputime_t cgtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
- cputime_t prev_utime, prev_stime;
+ struct cputime prev_cputime;
#endif
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
@@ -1064,6 +1079,7 @@ struct sched_class {
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+ void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
void (*post_schedule) (struct rq *this_rq);
@@ -1098,6 +1114,18 @@ struct load_weight {
unsigned long weight, inv_weight;
};
+struct sched_avg {
+ /*
+ * These sums represent an infinite geometric series and so are bound
+ * above by 1024/(1-y). Thus we only need a u32 to store them for for all
+ * choices of y < 1-2^(-32)*1024.
+ */
+ u32 runnable_avg_sum, runnable_avg_period;
+ u64 last_runnable_update;
+ s64 decay_count;
+ unsigned long load_avg_contrib;
+};
+
#ifdef CONFIG_SCHEDSTATS
struct sched_statistics {
u64 wait_start;
@@ -1158,6 +1186,15 @@ struct sched_entity {
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+ /* Per-entity load-tracking */
+ struct sched_avg avg;
+#endif
};
struct sched_rt_entity {
@@ -1321,7 +1358,7 @@ struct task_struct {
cputime_t utime, stime, utimescaled, stimescaled;
cputime_t gtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
- cputime_t prev_utime, prev_stime;
+ struct cputime prev_cputime;
#endif
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time; /* monotonic time */
@@ -1732,8 +1769,8 @@ static inline void put_task_struct(struct task_struct *t)
__put_task_struct(t);
}
-extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
-extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
+extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
+extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
/*
* Per process flags
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
new file mode 100644
index 00000000000..ae30ab58431
--- /dev/null
+++ b/include/linux/vtime.h
@@ -0,0 +1,48 @@
+#ifndef _LINUX_KERNEL_VTIME_H
+#define _LINUX_KERNEL_VTIME_H
+
+struct task_struct;
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+extern void vtime_task_switch(struct task_struct *prev);
+extern void vtime_account_system(struct task_struct *tsk);
+extern void vtime_account_system_irqsafe(struct task_struct *tsk);
+extern void vtime_account_idle(struct task_struct *tsk);
+extern void vtime_account_user(struct task_struct *tsk);
+extern void vtime_account(struct task_struct *tsk);
+#else
+static inline void vtime_task_switch(struct task_struct *prev) { }
+static inline void vtime_account_system(struct task_struct *tsk) { }
+static inline void vtime_account_system_irqsafe(struct task_struct *tsk) { }
+static inline void vtime_account(struct task_struct *tsk) { }
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+extern void irqtime_account_irq(struct task_struct *tsk);
+#else
+static inline void irqtime_account_irq(struct task_struct *tsk) { }
+#endif
+
+static inline void vtime_account_irq_enter(struct task_struct *tsk)
+{
+ /*
+ * Hardirq can interrupt idle task anytime. So we need vtime_account()
+ * that performs the idle check in CONFIG_VIRT_CPU_ACCOUNTING.
+ * Softirq can also interrupt idle task directly if it calls
+ * local_bh_enable(). Such case probably don't exist but we never know.
+ * Ksoftirqd is not concerned because idle time is flushed on context
+ * switch. Softirqs in the end of hardirqs are also not a problem because
+ * the idle time is flushed on hardirq time already.
+ */
+ vtime_account(tsk);
+ irqtime_account_irq(tsk);
+}
+
+static inline void vtime_account_irq_exit(struct task_struct *tsk)
+{
+ /* On hard|softirq exit we always account to hard|softirq cputime */
+ vtime_account_system(tsk);
+ irqtime_account_irq(tsk);
+}
+
+#endif /* _LINUX_KERNEL_VTIME_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index 346616c0092..618f7ee5600 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1186,11 +1186,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* as other threads in the parent group can be right
* here reaping other children at the same time.
*
- * We use thread_group_times() to get times for the thread
+ * We use thread_group_cputime_adjusted() to get times for the thread
* group, which consolidates times for all threads in the
* group including the group leader.
*/
- thread_group_times(p, &tgutime, &tgstime);
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
spin_lock_irq(&p->real_parent->sighand->siglock);
psig = p->real_parent->signal;
sig = p->signal;
diff --git a/kernel/fork.c b/kernel/fork.c
index c497e57aa65..850dde1e0c8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1224,7 +1224,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
- p->prev_utime = p->prev_stime = 0;
+ p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
#if defined(SPLIT_RSS_COUNTING)
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 125cb67daa2..d73840271dc 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -217,30 +217,6 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
return 0;
}
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
-{
- struct signal_struct *sig = tsk->signal;
- struct task_struct *t;
-
- times->utime = sig->utime;
- times->stime = sig->stime;
- times->sum_exec_runtime = sig->sum_sched_runtime;
-
- rcu_read_lock();
- /* make sure we can trust tsk->thread_group list */
- if (!likely(pid_alive(tsk)))
- goto out;
-
- t = tsk;
- do {
- times->utime += t->utime;
- times->stime += t->stime;
- times->sum_exec_runtime += task_sched_runtime(t);
- } while_each_thread(tsk, t);
-out:
- rcu_read_unlock();
-}
-
static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
{
if (b->utime > a->utime)
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 15f60d01198..0984a21076a 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -143,11 +143,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
p->signal->autogroup = autogroup_kref_get(ag);
+ if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
+ goto out;
+
t = p;
do {
sched_move_task(t);
} while_each_thread(p, t);
+out:
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
}
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 443232ebbb5..8bd04714281 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -4,6 +4,11 @@
#include <linux/rwsem.h>
struct autogroup {
+ /*
+ * reference doesn't mean how many thread attach to this
+ * autogroup now. It just stands for the number of task
+ * could use this autogroup.
+ */
struct kref kref;
struct task_group *tg;
struct rw_semaphore lock;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 80f80dfca70..f5066a61f97 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -953,6 +953,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
trace_sched_migrate_task(p, new_cpu);
if (task_cpu(p) != new_cpu) {
+ if (p->sched_class->migrate_task_rq)
+ p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
}
@@ -1525,6 +1527,15 @@ static void __sched_fork(struct task_struct *p)
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+ p->se.avg.runnable_avg_period = 0;
+ p->se.avg.runnable_avg_sum = 0;
+#endif
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 81b763ba58a..293b202fcf7 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -43,7 +43,7 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);
* Called before incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
*/
-void vtime_account(struct task_struct *curr)
+void irqtime_account_irq(struct task_struct *curr)
{
unsigned long flags;
s64 delta;
@@ -73,7 +73,7 @@ void vtime_account(struct task_struct *curr)
irq_time_write_end();
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(vtime_account);
+EXPORT_SYMBOL_GPL(irqtime_account_irq);
static int irqtime_account_hi_update(void)
{
@@ -288,6 +288,34 @@ static __always_inline bool steal_account_process_tick(void)
return false;
}
+/*
+ * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
+ * tasks (sum on group iteration) belonging to @tsk's group.
+ */
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+{
+ struct signal_struct *sig = tsk->signal;
+ struct task_struct *t;
+
+ times->utime = sig->utime;
+ times->stime = sig->stime;
+ times->sum_exec_runtime = sig->sum_sched_runtime;
+
+ rcu_read_lock();
+ /* make sure we can trust tsk->thread_group list */
+ if (!likely(pid_alive(tsk)))
+ goto out;
+
+ t = tsk;
+ do {
+ times->utime += t->utime;
+ times->stime += t->stime;
+ times->sum_exec_runtime += task_sched_runtime(t);
+ } while_each_thread(tsk, t);
+out:
+ rcu_read_unlock();
+}
+
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -417,13 +445,13 @@ void account_idle_ticks(unsigned long ticks)
* Use precise platform statistics if available:
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
*ut = p->utime;
*st = p->stime;
}
-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
struct task_cputime cputime;
@@ -433,6 +461,29 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
*st = cputime.stime;
}
+void vtime_account_system_irqsafe(struct task_struct *tsk)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ vtime_account_system(tsk);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
+
+#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
+void vtime_task_switch(struct task_struct *prev)
+{
+ if (is_idle_task(prev))
+ vtime_account_idle(prev);
+ else
+ vtime_account_system(prev);
+
+ vtime_account_user(prev);
+ arch_vtime_task_switch(prev);
+}
+#endif
+
/*
* Archs that account the whole time spent in the idle task
* (outside irq) as idle time can rely on this and just implement
@@ -444,16 +495,10 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
#ifndef __ARCH_HAS_VTIME_ACCOUNT
void vtime_account(struct task_struct *tsk)
{
- unsigned long flags;
-
- local_irq_save(flags);
-
if (in_interrupt() || !is_idle_task(tsk))
vtime_account_system(tsk);
else
vtime_account_idle(tsk);
-
- local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(vtime_account);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
@@ -478,14 +523,30 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
return (__force cputime_t) temp;
}
-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+/*
+ * Adjust tick based cputime random precision against scheduler
+ * runtime accounting.
+ */
+static void cputime_adjust(struct task_cputime *curr,
+ struct cputime *prev,
+ cputime_t *ut, cputime_t *st)
{
- cputime_t rtime, utime = p->utime, total = utime + p->stime;
+ cputime_t rtime, utime, total;
+
+ utime = curr->utime;
+ total = utime + curr->stime;
/*
- * Use CFS's precise accounting:
+ * Tick based cputime accounting depend on random scheduling
+ * timeslices of a task to be interrupted or not by the timer.
+ * Depending on these circumstances, the number of these interrupts
+ * may be over or under-optimistic, matching the real user and system
+ * cputime with a variable precision.
+ *
+ * Fix this by scaling these tick based values against the total
+ * runtime accounted by the CFS scheduler.
*/
- rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
+ rtime = nsecs_to_cputime(curr->sum_exec_runtime);
if (total)
utime = scale_utime(utime, rtime, total);
@@ -493,38 +554,36 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
utime = rtime;
/*
- * Compare with previous values, to keep monotonicity:
+ * If the tick based count grows faster than the scheduler one,
+ * the result of the scaling may go backward.
+ * Let's enforce monotonicity.
*/
- p->prev_utime = max(p->prev_utime, utime);
- p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
+ prev->utime = max(prev->utime, utime);
+ prev->stime = max(prev->stime, rtime - prev->utime);
- *ut = p->prev_utime;
- *st = p->prev_stime;
+ *ut = prev->utime;
+ *st = prev->stime;
+}
+
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct task_cputime cputime = {
+ .utime = p->utime,
+ .stime = p->stime,
+ .sum_exec_runtime = p->se.sum_exec_runtime,
+ };
+
+ cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
/*
* Must be called with siglock held.
*/
-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
- struct signal_struct *sig = p->signal;
struct task_cputime cputime;
- cputime_t rtime, utime, total;
thread_group_cputime(p, &cputime);
-
- total = cputime.utime + cputime.stime;
- rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
-
- if (total)
- utime = scale_utime(cputime.utime, rtime, total);
- else
- utime = rtime;
-
- sig->prev_utime = max(sig->prev_utime, utime);
- sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
-
- *ut = sig->prev_utime;
- *st = sig->prev_stime;
+ cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6f79596e0ea..2cd3c1b4e58 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec)
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{