aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched/cputime.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/cputime.c')
-rw-r--r-- kernel/sched/cputime.c 331
1 files changed, 176 insertions, 155 deletions
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ed12cbb135f..72fdf06ef86 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -115,31 +115,15 @@ static int irqtime_account_si_update(void)
static inline void task_group_account_field(struct task_struct *p, int index,
u64 tmp)
{
-#ifdef CONFIG_CGROUP_CPUACCT
- struct kernel_cpustat *kcpustat;
- struct cpuacct *ca;
-#endif
/*
* Since all updates are sure to touch the root cgroup, we
* get ourselves ahead and touch it first. If the root cgroup
* is the only cgroup, then nothing else should be necessary.
*
*/
- __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
-
-#ifdef CONFIG_CGROUP_CPUACCT
- if (unlikely(!cpuacct_subsys.active))
- return;
+ __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
- rcu_read_lock();
- ca = task_ca(p);
- while (ca && (ca != &root_cpuacct)) {
- kcpustat = this_cpu_ptr(ca->cpustat);
- kcpustat->cpustat[index] += tmp;
- ca = parent_ca(ca);
- }
- rcu_read_unlock();
-#endif
+ cpuacct_account_field(p, index, tmp);
}
/*
@@ -158,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
p->utimescaled += cputime_scaled;
account_group_user_time(p, cputime);
- index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+ index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
/* Add user time to cpustat. */
task_group_account_field(p, index, (__force u64) cputime);
@@ -185,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
p->gtime += cputime;
/* Add guest time to cpustat. */
- if (TASK_NICE(p) > 0) {
+ if (task_nice(p) > 0) {
cpustat[CPUTIME_NICE] += (__force u64) cputime;
cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
} else {
@@ -274,16 +258,22 @@ static __always_inline bool steal_account_process_tick(void)
{
#ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) {
- u64 steal, st = 0;
+ u64 steal;
+ cputime_t steal_ct;
steal = paravirt_steal_clock(smp_processor_id());
steal -= this_rq()->prev_steal_time;
- st = steal_ticks(steal);
- this_rq()->prev_steal_time += st * TICK_NSEC;
+ /*
+ * cputime_t may be less precise than nsecs (e.g. if it's
+ * based on jiffies). Let's cast the result to cputime
+ * granularity and account the rest on the next rounds.
+ */
+ steal_ct = nsecs_to_cputime(steal);
+ this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
- account_steal_time(st);
- return st;
+ account_steal_time(steal_ct);
+ return steal_ct;
}
#endif
return false;
@@ -310,7 +300,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
t = tsk;
do {
- task_cputime(tsk, &utime, &stime);
+ task_cputime(t, &utime, &stime);
times->utime += utime;
times->stime += stime;
times->sum_exec_runtime += task_sched_runtime(t);
@@ -342,53 +332,124 @@ out:
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq)
+ struct rq *rq, int ticks)
{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
+ u64 cputime = (__force u64) cputime_one_jiffy;
u64 *cpustat = kcpustat_this_cpu->cpustat;
if (steal_account_process_tick())
return;
+ cputime *= ticks;
+ scaled *= ticks;
+
if (irqtime_account_hi_update()) {
- cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
+ cpustat[CPUTIME_IRQ] += cputime;
} else if (irqtime_account_si_update()) {
- cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
+ cpustat[CPUTIME_SOFTIRQ] += cputime;
} else if (this_cpu_ksoftirqd() == p) {
/*
* ksoftirqd time do not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
* Also, p->stime needs to be updated for ksoftirqd.
*/
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SOFTIRQ);
+ __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
} else if (user_tick) {
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_user_time(p, cputime, scaled);
} else if (p == rq->idle) {
- account_idle_time(cputime_one_jiffy);
+ account_idle_time(cputime);
} else if (p->flags & PF_VCPU) { /* System time or guest time */
- account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_guest_time(p, cputime, scaled);
} else {
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SYSTEM);
+ __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
}
}
static void irqtime_account_idle_ticks(int ticks)
{
- int i;
struct rq *rq = this_rq();
- for (i = 0; i < ticks; i++)
- irqtime_account_process_tick(current, 0, rq);
+ irqtime_account_process_tick(current, 0, rq, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) {}
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq) {}
+ struct rq *rq, int nr_ticks) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+
+#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
+void vtime_common_task_switch(struct task_struct *prev)
+{
+ if (is_idle_task(prev))
+ vtime_account_idle(prev);
+ else
+ vtime_account_system(prev);
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ vtime_account_user(prev);
+#endif
+ arch_vtime_task_switch(prev);
+}
+#endif
+
+/*
+ * Archs that account the whole time spent in the idle task
+ * (outside irq) as idle time can rely on this and just implement
+ * vtime_account_system() and vtime_account_idle(). Archs that
+ * have another meaning of the idle time (s390 only includes the
+ * time spent by the CPU when it's in low power mode) must override
+ * vtime_account().
+ */
+#ifndef __ARCH_HAS_VTIME_ACCOUNT
+void vtime_common_account_irq_enter(struct task_struct *tsk)
+{
+ if (!in_interrupt()) {
+ /*
+ * If we interrupted user, context_tracking_in_user()
+ * is 1 because context tracking doesn't hook
+ * on irq entry/exit. This way we know if
+ * we need to flush user time on kernel entry.
+ */
+ if (context_tracking_in_user()) {
+ vtime_account_user(tsk);
+ return;
+ }
+
+ if (is_idle_task(tsk)) {
+ vtime_account_idle(tsk);
+ return;
+ }
+ }
+ vtime_account_system(tsk);
+}
+EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
+#endif /* __ARCH_HAS_VTIME_ACCOUNT */
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ *ut = p->utime;
+ *st = p->stime;
+}
+
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct task_cputime cputime;
+
+ thread_group_cputime(p, &cputime);
+
+ *ut = cputime.utime;
+ *st = cputime.stime;
+}
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
/*
* Account a single tick of cpu time.
* @p: the process that the cpu time gets accounted to
@@ -403,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
if (sched_clock_irqtime) {
- irqtime_account_process_tick(p, user_tick, rq);
+ irqtime_account_process_tick(p, user_tick, rq, 1);
return;
}
@@ -443,96 +504,49 @@ void account_idle_ticks(unsigned long ticks)
account_idle_time(jiffies_to_cputime(ticks));
}
-#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
/*
- * Use precise platform statistics if available:
+ * Perform (stime * rtime) / total, but avoid multiplication overflow by
+ * losing precision when the numbers are big.
*/
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
{
- *ut = p->utime;
- *st = p->stime;
-}
+ u64 scaled;
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct task_cputime cputime;
+ for (;;) {
+ /* Make sure "rtime" is the bigger of stime/rtime */
+ if (stime > rtime)
+ swap(rtime, stime);
- thread_group_cputime(p, &cputime);
+ /* Make sure 'total' fits in 32 bits */
+ if (total >> 32)
+ goto drop_precision;
- *ut = cputime.utime;
- *st = cputime.stime;
-}
+ /* Does rtime (and thus stime) fit in 32 bits? */
+ if (!(rtime >> 32))
+ break;
-#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_task_switch(struct task_struct *prev)
-{
- if (!vtime_accounting_enabled())
- return;
+ /* Can we just balance rtime/stime rather than dropping bits? */
+ if (stime >> 31)
+ goto drop_precision;
- if (is_idle_task(prev))
- vtime_account_idle(prev);
- else
- vtime_account_system(prev);
+ /* We can grow stime and shrink rtime and try to make them both fit */
+ stime <<= 1;
+ rtime >>= 1;
+ continue;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- vtime_account_user(prev);
-#endif
- arch_vtime_task_switch(prev);
-}
-#endif
-
-/*
- * Archs that account the whole time spent in the idle task
- * (outside irq) as idle time can rely on this and just implement
- * vtime_account_system() and vtime_account_idle(). Archs that
- * have other meaning of the idle time (s390 only includes the
- * time spent by the CPU when it's in low power mode) must override
- * vtime_account().
- */
-#ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_account_irq_enter(struct task_struct *tsk)
-{
- if (!vtime_accounting_enabled())
- return;
-
- if (!in_interrupt()) {
- /*
- * If we interrupted user, context_tracking_in_user()
- * is 1 because the context tracking don't hook
- * on irq entry/exit. This way we know if
- * we need to flush user time on kernel entry.
- */
- if (context_tracking_in_user()) {
- vtime_account_user(tsk);
- return;
- }
-
- if (is_idle_task(tsk)) {
- vtime_account_idle(tsk);
- return;
- }
+drop_precision:
+ /* We drop from rtime, it has more bits than stime */
+ rtime >>= 1;
+ total >>= 1;
}
- vtime_account_system(tsk);
-}
-EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
-#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
-
-static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
-{
- u64 temp = (__force u64) rtime;
- temp *= (__force u64) stime;
-
- if (sizeof(cputime_t) == 4)
- temp = div_u64(temp, (__force u32) total);
- else
- temp = div64_u64(temp, (__force u64) total);
-
- return (__force cputime_t) temp;
+ /*
+ * Make sure gcc understands that this is a 32x32->64 multiply,
+ * followed by a 64/32->64 divide.
+ */
+ scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
+ return (__force cputime_t) scaled;
}
/*
@@ -543,10 +557,7 @@ static void cputime_adjust(struct task_cputime *curr,
struct cputime *prev,
cputime_t *ut, cputime_t *st)
{
- cputime_t rtime, stime, total;
-
- stime = curr->stime;
- total = stime + curr->utime;
+ cputime_t rtime, stime, utime;
/*
* Tick based cputime accounting depend on random scheduling
@@ -560,10 +571,28 @@ static void cputime_adjust(struct task_cputime *curr,
*/
rtime = nsecs_to_cputime(curr->sum_exec_runtime);
- if (total)
- stime = scale_stime(stime, rtime, total);
- else
+ /*
+ * Update userspace visible utime/stime values only if actual execution
+ * time is bigger than already exported. Note that it can happen that we
+ * provided bigger values due to scaling inaccuracy on big numbers.
+ */
+ if (prev->stime + prev->utime >= rtime)
+ goto out;
+
+ stime = curr->stime;
+ utime = curr->utime;
+
+ if (utime == 0) {
stime = rtime;
+ } else if (stime == 0) {
+ utime = rtime;
+ } else {
+ cputime_t total = stime + utime;
+
+ stime = scale_stime((__force u64)stime,
+ (__force u64)rtime, (__force u64)total);
+ utime = rtime - stime;
+ }
/*
* If the tick based count grows faster than the scheduler one,
@@ -571,8 +600,9 @@ static void cputime_adjust(struct task_cputime *curr,
* Let's enforce monotonicity.
*/
prev->stime = max(prev->stime, stime);
- prev->utime = max(prev->utime, rtime - prev->stime);
+ prev->utime = max(prev->utime, utime);
+out:
*ut = prev->utime;
*st = prev->stime;
}
@@ -597,7 +627,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
thread_group_cputime(p, &cputime);
cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
-#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static unsigned long long vtime_delta(struct task_struct *tsk)
@@ -631,23 +661,17 @@ static void __vtime_account_system(struct task_struct *tsk)
void vtime_account_system(struct task_struct *tsk)
{
- if (!vtime_accounting_enabled())
- return;
-
write_seqlock(&tsk->vtime_seqlock);
__vtime_account_system(tsk);
write_sequnlock(&tsk->vtime_seqlock);
}
-void vtime_account_irq_exit(struct task_struct *tsk)
+void vtime_gen_account_irq_exit(struct task_struct *tsk)
{
- if (!vtime_accounting_enabled())
- return;
-
write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
if (context_tracking_in_user())
tsk->vtime_snap_whence = VTIME_USER;
- __vtime_account_system(tsk);
write_sequnlock(&tsk->vtime_seqlock);
}
@@ -655,12 +679,8 @@ void vtime_account_user(struct task_struct *tsk)
{
cputime_t delta_cpu;
- if (!vtime_accounting_enabled())
- return;
-
- delta_cpu = get_vtime_delta(tsk);
-
write_seqlock(&tsk->vtime_seqlock);
+ delta_cpu = get_vtime_delta(tsk);
tsk->vtime_snap_whence = VTIME_SYS;
account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
write_sequnlock(&tsk->vtime_seqlock);
@@ -668,22 +688,27 @@ void vtime_account_user(struct task_struct *tsk)
void vtime_user_enter(struct task_struct *tsk)
{
- if (!vtime_accounting_enabled())
- return;
-
write_seqlock(&tsk->vtime_seqlock);
- tsk->vtime_snap_whence = VTIME_USER;
__vtime_account_system(tsk);
+ tsk->vtime_snap_whence = VTIME_USER;
write_sequnlock(&tsk->vtime_seqlock);
}
void vtime_guest_enter(struct task_struct *tsk)
{
+ /*
+ * The flags must be updated under the lock with
+ * the vtime_snap flush and update.
+ * That enforces the right ordering and update sequence
+ * synchronization against the reader (task_gtime())
+ * that can thus safely catch up with a tickless delta.
+ */
write_seqlock(&tsk->vtime_seqlock);
__vtime_account_system(tsk);
current->flags |= PF_VCPU;
write_sequnlock(&tsk->vtime_seqlock);
}
+EXPORT_SYMBOL_GPL(vtime_guest_enter);
void vtime_guest_exit(struct task_struct *tsk)
{
@@ -692,6 +717,7 @@ void vtime_guest_exit(struct task_struct *tsk)
current->flags &= ~PF_VCPU;
write_sequnlock(&tsk->vtime_seqlock);
}
+EXPORT_SYMBOL_GPL(vtime_guest_exit);
void vtime_account_idle(struct task_struct *tsk)
{
@@ -700,11 +726,6 @@ void vtime_account_idle(struct task_struct *tsk)
account_idle_time(delta_cpu);
}
-bool vtime_accounting_enabled(void)
-{
- return context_tracking_active();
-}
-
void arch_vtime_task_switch(struct task_struct *prev)
{
write_seqlock(&prev->vtime_seqlock);
@@ -713,17 +734,17 @@ void arch_vtime_task_switch(struct task_struct *prev)
write_seqlock(&current->vtime_seqlock);
current->vtime_snap_whence = VTIME_SYS;
- current->vtime_snap = sched_clock();
+ current->vtime_snap = sched_clock_cpu(smp_processor_id());
write_sequnlock(&current->vtime_seqlock);
}
-void vtime_init_idle(struct task_struct *t)
+void vtime_init_idle(struct task_struct *t, int cpu)
{
unsigned long flags;
write_seqlock_irqsave(&t->vtime_seqlock, flags);
t->vtime_snap_whence = VTIME_SYS;
- t->vtime_snap = sched_clock();
+ t->vtime_snap = sched_clock_cpu(cpu);
write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
}