aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/Makefile9
-rw-r--r--kernel/sched/auto_group.c14
-rw-r--r--kernel/sched/clock.c136
-rw-r--r--kernel/sched/completion.c299
-rw-r--r--kernel/sched/core.c5003
-rw-r--r--kernel/sched/cpuacct.c283
-rw-r--r--kernel/sched/cpuacct.h17
-rw-r--r--kernel/sched/cpudeadline.c229
-rw-r--r--kernel/sched/cpudeadline.h33
-rw-r--r--kernel/sched/cpupri.c32
-rw-r--r--kernel/sched/cpupri.h2
-rw-r--r--kernel/sched/cputime.c838
-rw-r--r--kernel/sched/deadline.c1676
-rw-r--r--kernel/sched/debug.c255
-rw-r--r--kernel/sched/fair.c5041
-rw-r--r--kernel/sched/features.h53
-rw-r--r--kernel/sched/idle.c273
-rw-r--r--kernel/sched/idle_task.c13
-rw-r--r--kernel/sched/proc.c591
-rw-r--r--kernel/sched/rt.c554
-rw-r--r--kernel/sched/sched.h725
-rw-r--r--kernel/sched/stats.c80
-rw-r--r--kernel/sched/stats.h90
-rw-r--r--kernel/sched/stop_task.c39
-rw-r--r--kernel/sched/wait.c504
25 files changed, 12251 insertions, 4538 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a7dd35102a..ab32b7b0db5 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,10 +11,11 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
-obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
-obj-$(CONFIG_SMP) += cpupri.o
+obj-y += core.o proc.o clock.o cputime.o
+obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
+obj-y += wait.o completion.o idle.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
-
-
+obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index e8a1f83ee0e..e73efba9830 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref)
ag->tg->rt_se = NULL;
ag->tg->rt_rq = NULL;
#endif
+ sched_offline_group(ag->tg);
sched_destroy_group(ag->tg);
}
@@ -95,6 +96,7 @@ static inline struct autogroup *autogroup_create(void)
#endif
tg->autogroup = ag;
+ sched_online_group(tg, &root_task_group);
return ag;
out_free:
@@ -195,20 +197,20 @@ __setup("noautogroup", setup_autogroup);
#ifdef CONFIG_PROC_FS
-int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
+int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
{
static unsigned long next = INITIAL_JIFFIES;
struct autogroup *ag;
int err;
- if (*nice < -20 || *nice > 19)
+ if (nice < MIN_NICE || nice > MAX_NICE)
return -EINVAL;
- err = security_task_setnice(current, *nice);
+ err = security_task_setnice(current, nice);
if (err)
return err;
- if (*nice < 0 && !can_nice(current, *nice))
+ if (nice < 0 && !can_nice(current, nice))
return -EPERM;
/* this is a heavy operation taking global locks.. */
@@ -219,9 +221,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
ag = autogroup_task_get(p);
down_write(&ag->lock);
- err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
+ err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
if (!err)
- ag->nice = *nice;
+ ag->nice = nice;
up_write(&ag->lock);
autogroup_kref_put(ag);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c685e31492d..3ef6451e972 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -26,9 +26,10 @@
* at 0 on boot (but people really shouldn't rely on that).
*
* cpu_clock(i) -- can be used from any context, including NMI.
- * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
* local_clock() -- is cpu_clock() on the current cpu.
*
+ * sched_clock_cpu(i)
+ *
* How:
*
* The implementation either uses sched_clock() when
@@ -50,15 +51,6 @@
* Furthermore, explicit sleep and wakeup hooks allow us to account for time
* that is otherwise invisible (TSC gets stopped).
*
- *
- * Notes:
- *
- * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things
- * like cpufreq interrupts that can change the base clock (TSC) multiplier
- * and cause funny jumps in time -- although the filtering provided by
- * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it
- * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
- * sched_clock().
*/
#include <linux/spinlock.h>
#include <linux/hardirq.h>
@@ -66,13 +58,16 @@
#include <linux/percpu.h>
#include <linux/ktime.h>
#include <linux/sched.h>
+#include <linux/static_key.h>
+#include <linux/workqueue.h>
+#include <linux/compiler.h>
/*
* Scheduler clock - returns current time in nanosec units.
* This is default implementation.
* Architectures and sub-architectures can override this.
*/
-unsigned long long __attribute__((weak)) sched_clock(void)
+unsigned long long __weak sched_clock(void)
{
return (unsigned long long)(jiffies - INITIAL_JIFFIES)
* (NSEC_PER_SEC / HZ);
@@ -82,7 +77,52 @@ EXPORT_SYMBOL_GPL(sched_clock);
__read_mostly int sched_clock_running;
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-__read_mostly int sched_clock_stable;
+static struct static_key __sched_clock_stable = STATIC_KEY_INIT;
+static int __sched_clock_stable_early;
+
+int sched_clock_stable(void)
+{
+ return static_key_false(&__sched_clock_stable);
+}
+
+static void __set_sched_clock_stable(void)
+{
+ if (!sched_clock_stable())
+ static_key_slow_inc(&__sched_clock_stable);
+}
+
+void set_sched_clock_stable(void)
+{
+ __sched_clock_stable_early = 1;
+
+ smp_mb(); /* matches sched_clock_init() */
+
+ if (!sched_clock_running)
+ return;
+
+ __set_sched_clock_stable();
+}
+
+static void __clear_sched_clock_stable(struct work_struct *work)
+{
+ /* XXX worry about clock continuity */
+ if (sched_clock_stable())
+ static_key_slow_dec(&__sched_clock_stable);
+}
+
+static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
+
+void clear_sched_clock_stable(void)
+{
+ __sched_clock_stable_early = 0;
+
+ smp_mb(); /* matches sched_clock_init() */
+
+ if (!sched_clock_running)
+ return;
+
+ schedule_work(&sched_clock_work);
+}
struct sched_clock_data {
u64 tick_raw;
@@ -116,6 +156,20 @@ void sched_clock_init(void)
}
sched_clock_running = 1;
+
+ /*
+ * Ensure that it is impossible to not do a static_key update.
+ *
+ * Either {set,clear}_sched_clock_stable() must see sched_clock_running
+ * and do the update, or we must see their __sched_clock_stable_early
+ * and do the update, or both.
+ */
+ smp_mb(); /* matches {set,clear}_sched_clock_stable() */
+
+ if (__sched_clock_stable_early)
+ __set_sched_clock_stable();
+ else
+ __clear_sched_clock_stable(NULL);
}
/*
@@ -176,10 +230,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
u64 this_clock, remote_clock;
u64 *ptr, old_val, val;
+#if BITS_PER_LONG != 64
+again:
+ /*
+ * Careful here: The local and the remote clock values need to
+ * be read out atomic as we need to compare the values and
+ * then update either the local or the remote side. So the
+ * cmpxchg64 below only protects one readout.
+ *
+ * We must reread via sched_clock_local() in the retry case on
+ * 32bit as an NMI could use sched_clock_local() via the
+ * tracer and hit between the readout of
+ * the low32bit and the high 32bit portion.
+ */
+ this_clock = sched_clock_local(my_scd);
+ /*
+ * We must enforce atomic readout on 32bit, otherwise the
+ * update on the remote cpu can hit inbetween the readout of
+ * the low32bit and the high 32bit portion.
+ */
+ remote_clock = cmpxchg64(&scd->clock, 0, 0);
+#else
+ /*
+ * On 64bit the read of [my]scd->clock is atomic versus the
+ * update, so we can avoid the above 32bit dance.
+ */
sched_clock_local(my_scd);
again:
this_clock = my_scd->clock;
remote_clock = scd->clock;
+#endif
/*
* Use the opportunity that we have both locks
@@ -216,20 +296,20 @@ u64 sched_clock_cpu(int cpu)
struct sched_clock_data *scd;
u64 clock;
- WARN_ON_ONCE(!irqs_disabled());
-
- if (sched_clock_stable)
+ if (sched_clock_stable())
return sched_clock();
if (unlikely(!sched_clock_running))
return 0ull;
+ preempt_disable_notrace();
scd = cpu_sdc(cpu);
if (cpu != smp_processor_id())
clock = sched_clock_remote(scd);
else
clock = sched_clock_local(scd);
+ preempt_enable_notrace();
return clock;
}
@@ -239,7 +319,7 @@ void sched_clock_tick(void)
struct sched_clock_data *scd;
u64 now, now_gtod;
- if (sched_clock_stable)
+ if (sched_clock_stable())
return;
if (unlikely(!sched_clock_running))
@@ -290,14 +370,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
*/
u64 cpu_clock(int cpu)
{
- u64 clock;
- unsigned long flags;
+ if (!sched_clock_stable())
+ return sched_clock_cpu(cpu);
- local_irq_save(flags);
- clock = sched_clock_cpu(cpu);
- local_irq_restore(flags);
-
- return clock;
+ return sched_clock();
}
/*
@@ -309,14 +385,10 @@ u64 cpu_clock(int cpu)
*/
u64 local_clock(void)
{
- u64 clock;
- unsigned long flags;
-
- local_irq_save(flags);
- clock = sched_clock_cpu(smp_processor_id());
- local_irq_restore(flags);
+ if (!sched_clock_stable())
+ return sched_clock_cpu(raw_smp_processor_id());
- return clock;
+ return sched_clock();
}
#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
@@ -336,12 +408,12 @@ u64 sched_clock_cpu(int cpu)
u64 cpu_clock(int cpu)
{
- return sched_clock_cpu(cpu);
+ return sched_clock();
}
u64 local_clock(void)
{
- return sched_clock_cpu(0);
+ return sched_clock();
}
#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
new file mode 100644
index 00000000000..a63f4dc2790
--- /dev/null
+++ b/kernel/sched/completion.c
@@ -0,0 +1,299 @@
+/*
+ * Generic wait-for-completion handler;
+ *
+ * It differs from semaphores in that their default case is the opposite,
+ * wait_for_completion default blocks whereas semaphore default non-block. The
+ * interface also makes it easy to 'complete' multiple waiting threads,
+ * something which isn't entirely natural for semaphores.
+ *
+ * But more importantly, the primitive documents the usage. Semaphores would
+ * typically be used for exclusion which gives rise to priority inversion.
+ * Waiting for completion is a typically sync point, but not an exclusion point.
+ */
+
+#include <linux/sched.h>
+#include <linux/completion.h>
+
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done++;
+ __wake_up_locked(&x->wait, TASK_NORMAL, 1);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete);
+
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete_all(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done += UINT_MAX/2;
+ __wake_up_locked(&x->wait, TASK_NORMAL, 0);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_all);
+
+static inline long __sched
+do_wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
+{
+ if (!x->done) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ __add_wait_queue_tail_exclusive(&x->wait, &wait);
+ do {
+ if (signal_pending_state(state, current)) {
+ timeout = -ERESTARTSYS;
+ break;
+ }
+ __set_current_state(state);
+ spin_unlock_irq(&x->wait.lock);
+ timeout = action(timeout);
+ spin_lock_irq(&x->wait.lock);
+ } while (!x->done && timeout);
+ __remove_wait_queue(&x->wait, &wait);
+ if (!x->done)
+ return timeout;
+ }
+ x->done--;
+ return timeout ?: 1;
+}
+
+static inline long __sched
+__wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
+{
+ might_sleep();
+
+ spin_lock_irq(&x->wait.lock);
+ timeout = do_wait_for_common(x, action, timeout, state);
+ spin_unlock_irq(&x->wait.lock);
+ return timeout;
+}
+
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, schedule_timeout, timeout, state);
+}
+
+static long __sched
+wait_for_common_io(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, io_schedule_timeout, timeout, state);
+}
+
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
+void __sched wait_for_completion(struct completion *x)
+{
+ wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion);
+
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ *
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_timeout);
+
+/**
+ * wait_for_completion_io: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout. The caller is accounted as waiting
+ * for IO.
+ */
+void __sched wait_for_completion_io(struct completion *x)
+{
+ wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io);
+
+/**
+ * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible. The caller is accounted as waiting for IO.
+ *
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io_timeout);
+
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x: holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_interruptible(struct completion *x)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible);
+
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_interruptible_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_killable(struct completion *x)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_killable);
+
+/**
+ * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be
+ * signaled or for a specified timeout to expire. It can be
+ * interrupted by a kill signal. The timeout is in jiffies.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_killable_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(wait_for_completion_killable_timeout);
+
+/**
+ * try_wait_for_completion - try to decrement a completion without blocking
+ * @x: completion structure
+ *
+ * Return: 0 if a decrement cannot be done without blocking
+ * 1 if a decrement succeeded.
+ *
+ * If a completion is being used as a counting completion,
+ * attempt to decrement the counter without blocking. This
+ * enables us to avoid waiting if the resource the completion
+ * is protecting is not available.
+ */
+bool try_wait_for_completion(struct completion *x)
+{
+ unsigned long flags;
+ int ret = 1;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ if (!x->done)
+ ret = 0;
+ else
+ x->done--;
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(try_wait_for_completion);
+
+/**
+ * completion_done - Test to see if a completion has any waiters
+ * @x: completion structure
+ *
+ * Return: 0 if there are waiters (wait_for_completion() in progress)
+ * 1 if there are no waiters.
+ *
+ */
+bool completion_done(struct completion *x)
+{
+ unsigned long flags;
+ int ret = 1;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ if (!x->done)
+ ret = 0;
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5255c9d2e05..bc1638b3344 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -71,7 +71,11 @@
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
+#include <linux/binfmts.h>
+#include <linux/context_tracking.h>
+#include <linux/compiler.h>
+#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
@@ -80,11 +84,28 @@
#endif
#include "sched.h"
-#include "../workqueue_sched.h"
+#include "../workqueue_internal.h"
+#include "../smpboot.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
+#ifdef smp_mb__before_atomic
+void __smp_mb__before_atomic(void)
+{
+ smp_mb__before_atomic();
+}
+EXPORT_SYMBOL(__smp_mb__before_atomic);
+#endif
+
+#ifdef smp_mb__after_atomic
+void __smp_mb__after_atomic(void)
+{
+ smp_mb__after_atomic();
+}
+EXPORT_SYMBOL(__smp_mb__after_atomic);
+#endif
+
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
unsigned long delta;
@@ -139,9 +160,8 @@ const_debug unsigned int sysctl_sched_features =
#define SCHED_FEAT(name, enabled) \
#name ,
-static __read_mostly char *sched_feat_names[] = {
+static const char * const sched_feat_names[] = {
#include "features.h"
- NULL
};
#undef SCHED_FEAT
@@ -162,13 +182,13 @@ static int sched_feat_show(struct seq_file *m, void *v)
#ifdef HAVE_JUMP_LABEL
-#define jump_label_key__true jump_label_key_enabled
-#define jump_label_key__false jump_label_key_disabled
+#define jump_label_key__true STATIC_KEY_INIT_TRUE
+#define jump_label_key__false STATIC_KEY_INIT_FALSE
#define SCHED_FEAT(name, enabled) \
jump_label_key__##enabled ,
-struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
+struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
#include "features.h"
};
@@ -176,37 +196,24 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
static void sched_feat_disable(int i)
{
- if (jump_label_enabled(&sched_feat_keys[i]))
- jump_label_dec(&sched_feat_keys[i]);
+ if (static_key_enabled(&sched_feat_keys[i]))
+ static_key_slow_dec(&sched_feat_keys[i]);
}
static void sched_feat_enable(int i)
{
- if (!jump_label_enabled(&sched_feat_keys[i]))
- jump_label_inc(&sched_feat_keys[i]);
+ if (!static_key_enabled(&sched_feat_keys[i]))
+ static_key_slow_inc(&sched_feat_keys[i]);
}
#else
static void sched_feat_disable(int i) { };
static void sched_feat_enable(int i) { };
#endif /* HAVE_JUMP_LABEL */
-static ssize_t
-sched_feat_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static int sched_feat_set(char *cmp)
{
- char buf[64];
- char *cmp;
- int neg = 0;
int i;
-
- if (cnt > 63)
- cnt = 63;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
- cmp = strstrip(buf);
+ int neg = 0;
if (strncmp(cmp, "NO_", 3) == 0) {
neg = 1;
@@ -226,6 +233,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
}
}
+ return i;
+}
+
+static ssize_t
+sched_feat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ char *cmp;
+ int i;
+
+ if (cnt > 63)
+ cnt = 63;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+ cmp = strstrip(buf);
+
+ i = sched_feat_set(cmp);
if (i == __SCHED_FEAT_NR)
return -EINVAL;
@@ -285,8 +313,6 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
-
-
/*
* __task_rq_lock - lock the rq @p resides on.
*/
@@ -359,13 +385,6 @@ static struct rq *this_rq_lock(void)
#ifdef CONFIG_SCHED_HRTICK
/*
* Use HR-timers to deliver accurate preemption points.
- *
- * Its all a bit involved since we cannot program an hrt while holding the
- * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
- * reschedule event.
- *
- * When we get rescheduled we reprogram the hrtick_timer outside of the
- * rq->lock.
*/
static void hrtick_clear(struct rq *rq)
@@ -393,6 +412,15 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
}
#ifdef CONFIG_SMP
+
+static int __hrtick_restart(struct rq *rq)
+{
+ struct hrtimer *timer = &rq->hrtick_timer;
+ ktime_t time = hrtimer_get_softexpires(timer);
+
+ return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+}
+
/*
* called from hardirq (IPI) context
*/
@@ -401,7 +429,7 @@ static void __hrtick_start(void *arg)
struct rq *rq = arg;
raw_spin_lock(&rq->lock);
- hrtimer_restart(&rq->hrtick_timer);
+ __hrtick_restart(rq);
rq->hrtick_csd_pending = 0;
raw_spin_unlock(&rq->lock);
}
@@ -419,9 +447,9 @@ void hrtick_start(struct rq *rq, u64 delay)
hrtimer_set_expires(timer, time);
if (rq == this_rq()) {
- hrtimer_restart(timer);
+ __hrtick_restart(rq);
} else if (!rq->hrtick_csd_pending) {
- __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
+ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
rq->hrtick_csd_pending = 1;
}
}
@@ -494,37 +522,98 @@ static inline void init_hrtick(void)
#endif /* CONFIG_SCHED_HRTICK */
/*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, val) \
+({ typeof(*(ptr)) __old, __val = *(ptr); \
+ for (;;) { \
+ __old = cmpxchg((ptr), __val, __val | (val)); \
+ if (__old == __val) \
+ break; \
+ __val = __old; \
+ } \
+ __old; \
+})
+
+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+
+/*
+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
+ *
+ * If this returns true, then the idle task promises to call
+ * sched_ttwu_pending() and reschedule soon.
+ */
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+
+ for (;;) {
+ if (!(val & _TIF_POLLING_NRFLAG))
+ return false;
+ if (val & _TIF_NEED_RESCHED)
+ return true;
+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
+ if (old == val)
+ break;
+ val = old;
+ }
+ return true;
+}
+
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ set_tsk_need_resched(p);
+ return true;
+}
+
+#ifdef CONFIG_SMP
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ return false;
+}
+#endif
+#endif
+
+/*
* resched_task - mark a task 'to be rescheduled now'.
*
* On UP this means the setting of the need_resched flag, on SMP it
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
-#ifdef CONFIG_SMP
-
-#ifndef tsk_is_polling
-#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
-#endif
-
void resched_task(struct task_struct *p)
{
int cpu;
- assert_raw_spin_locked(&task_rq(p)->lock);
+ lockdep_assert_held(&task_rq(p)->lock);
if (test_tsk_need_resched(p))
return;
- set_tsk_need_resched(p);
-
cpu = task_cpu(p);
- if (cpu == smp_processor_id())
+
+ if (cpu == smp_processor_id()) {
+ set_tsk_need_resched(p);
+ set_preempt_need_resched();
return;
+ }
- /* NEED_RESCHED must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(p))
+ if (set_nr_and_not_polling(p))
smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
}
void resched_cpu(int cpu)
@@ -538,7 +627,8 @@ void resched_cpu(int cpu)
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_SMP
+#ifdef CONFIG_NO_HZ_COMMON
/*
* In the semi idle case, use the nearest busy cpu for migrating timers
* from an idle cpu. This is good for power-savings.
@@ -547,12 +637,15 @@ void resched_cpu(int cpu)
* selecting an idle cpu will add more delays to the timers than intended
* (as that cpu's timer base may not be uptodate wrt jiffies etc).
*/
-int get_nohz_timer_target(void)
+int get_nohz_timer_target(int pinned)
{
int cpu = smp_processor_id();
int i;
struct sched_domain *sd;
+ if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+ return cpu;
+
rcu_read_lock();
for_each_domain(cpu, sd) {
for_each_cpu(i, sched_domain_span(sd)) {
@@ -576,56 +669,87 @@ unlock:
* account when the CPU goes back to idle and evaluates the timer
* wheel for the next timer event.
*/
-void wake_up_idle_cpu(int cpu)
+static void wake_up_idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
if (cpu == smp_processor_id())
return;
- /*
- * This is safe, as this function is called with the timer
- * wheel base lock of (cpu) held. When the CPU is on the way
- * to idle and has not yet set rq->curr to idle then it will
- * be serialized on the timer wheel base lock and take the new
- * timer into account automatically.
- */
- if (rq->curr != rq->idle)
- return;
+ if (set_nr_and_not_polling(rq->idle))
+ smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+}
- /*
- * We can set TIF_RESCHED on the idle task of the other CPU
- * lockless. The worst case is that the other CPU runs the
- * idle task through an additional NOOP schedule()
- */
- set_tsk_need_resched(rq->idle);
+static bool wake_up_full_nohz_cpu(int cpu)
+{
+ if (tick_nohz_full_cpu(cpu)) {
+ if (cpu != smp_processor_id() ||
+ tick_nohz_tick_stopped())
+ smp_send_reschedule(cpu);
+ return true;
+ }
- /* NEED_RESCHED must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(rq->idle))
- smp_send_reschedule(cpu);
+ return false;
+}
+
+void wake_up_nohz_cpu(int cpu)
+{
+ if (!wake_up_full_nohz_cpu(cpu))
+ wake_up_idle_cpu(cpu);
}
static inline bool got_nohz_idle_kick(void)
{
int cpu = smp_processor_id();
- return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+
+ if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
+ return false;
+
+ if (idle_cpu(cpu) && !need_resched())
+ return true;
+
+ /*
+ * We can't run Idle Load Balance on this CPU for this time so we
+ * cancel it and clear NOHZ_BALANCE_KICK
+ */
+ clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+ return false;
}
-#else /* CONFIG_NO_HZ */
+#else /* CONFIG_NO_HZ_COMMON */
static inline bool got_nohz_idle_kick(void)
{
return false;
}
-#endif /* CONFIG_NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
+
+#ifdef CONFIG_NO_HZ_FULL
+bool sched_can_stop_tick(void)
+{
+ struct rq *rq;
+
+ rq = this_rq();
+
+ /* Make sure rq->nr_running update is visible after the IPI */
+ smp_rmb();
+
+ /* More than one running task need preemption */
+ if (rq->nr_running > 1)
+ return false;
+
+ return true;
+}
+#endif /* CONFIG_NO_HZ_FULL */
void sched_avg_update(struct rq *rq)
{
s64 period = sched_avg_period();
- while ((s64)(rq->clock - rq->age_stamp) > period) {
+ while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
/*
* Inline assembly required to prevent the compiler
* optimising this loop into a divmod call.
@@ -637,12 +761,6 @@ void sched_avg_update(struct rq *rq)
}
}
-#else /* !CONFIG_SMP */
-void resched_task(struct task_struct *p)
-{
- assert_raw_spin_locked(&task_rq(p)->lock);
- set_tsk_need_resched(p);
-}
#endif /* CONFIG_SMP */
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -690,8 +808,6 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif
-void update_cpu_load(struct rq *this_rq);
-
static void set_load_weight(struct task_struct *p)
{
int prio = p->static_prio - MAX_RT_PRIO;
@@ -713,14 +829,14 @@ static void set_load_weight(struct task_struct *p)
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
- sched_info_queued(p);
+ sched_info_queued(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
}
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
- sched_info_dequeued(p);
+ sched_info_dequeued(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -740,126 +856,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-
-/*
- * There are no locks covering percpu hardirq/softirq time.
- * They are only modified in account_system_vtime, on corresponding CPU
- * with interrupts disabled. So, writes are safe.
- * They are read and saved off onto struct rq in update_rq_clock().
- * This may result in other CPU reading this CPU's irq time and can
- * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value with a side effect of accounting a slice of irq time to wrong
- * task when irq is in progress while we read rq->clock. That is a worthy
- * compromise in place of having locks on each irq in account_system_time.
- */
-static DEFINE_PER_CPU(u64, cpu_hardirq_time);
-static DEFINE_PER_CPU(u64, cpu_softirq_time);
-
-static DEFINE_PER_CPU(u64, irq_start_time);
-static int sched_clock_irqtime;
-
-void enable_sched_clock_irqtime(void)
-{
- sched_clock_irqtime = 1;
-}
-
-void disable_sched_clock_irqtime(void)
-{
- sched_clock_irqtime = 0;
-}
-
-#ifndef CONFIG_64BIT
-static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
-
-static inline void irq_time_write_begin(void)
-{
- __this_cpu_inc(irq_time_seq.sequence);
- smp_wmb();
-}
-
-static inline void irq_time_write_end(void)
-{
- smp_wmb();
- __this_cpu_inc(irq_time_seq.sequence);
-}
-
-static inline u64 irq_time_read(int cpu)
-{
- u64 irq_time;
- unsigned seq;
-
- do {
- seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
- irq_time = per_cpu(cpu_softirq_time, cpu) +
- per_cpu(cpu_hardirq_time, cpu);
- } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
-
- return irq_time;
-}
-#else /* CONFIG_64BIT */
-static inline void irq_time_write_begin(void)
-{
-}
-
-static inline void irq_time_write_end(void)
-{
-}
-
-static inline u64 irq_time_read(int cpu)
-{
- return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
-}
-#endif /* CONFIG_64BIT */
-
-/*
- * Called before incrementing preempt_count on {soft,}irq_enter
- * and before decrementing preempt_count on {soft,}irq_exit.
- */
-void account_system_vtime(struct task_struct *curr)
-{
- unsigned long flags;
- s64 delta;
- int cpu;
-
- if (!sched_clock_irqtime)
- return;
-
- local_irq_save(flags);
-
- cpu = smp_processor_id();
- delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
- __this_cpu_add(irq_start_time, delta);
-
- irq_time_write_begin();
- /*
- * We do not account for softirq time from ksoftirqd here.
- * We want to continue accounting softirq time to ksoftirqd thread
- * in that case, so as not to confuse scheduler with a special task
- * that do not consume any time, but still wants to run.
- */
- if (hardirq_count())
- __this_cpu_add(cpu_hardirq_time, delta);
- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
- __this_cpu_add(cpu_softirq_time, delta);
-
- irq_time_write_end();
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(account_system_vtime);
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#ifdef CONFIG_PARAVIRT
-static inline u64 steal_ticks(u64 steal)
-{
- if (unlikely(steal > NSEC_PER_SEC))
- return div_u64(steal, TICK_NSEC);
-
- return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
-}
-#endif
-
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
@@ -894,20 +890,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- if (static_branch((&paravirt_steal_rq_enabled))) {
- u64 st;
-
+ if (static_key_false((&paravirt_steal_rq_enabled))) {
steal = paravirt_steal_clock(cpu_of(rq));
steal -= rq->prev_steal_time_rq;
if (unlikely(steal > delta))
steal = delta;
- st = steal_ticks(steal);
- steal = st * TICK_NSEC;
-
rq->prev_steal_time_rq += steal;
-
delta -= steal;
}
#endif
@@ -915,48 +905,11 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->clock_task += delta;
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
- if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
sched_rt_avg_update(rq, irq_delta + steal);
#endif
}
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-static int irqtime_account_hi_update(void)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- unsigned long flags;
- u64 latest_ns;
- int ret = 0;
-
- local_irq_save(flags);
- latest_ns = this_cpu_read(cpu_hardirq_time);
- if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
- ret = 1;
- local_irq_restore(flags);
- return ret;
-}
-
-static int irqtime_account_si_update(void)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- unsigned long flags;
- u64 latest_ns;
- int ret = 0;
-
- local_irq_save(flags);
- latest_ns = this_cpu_read(cpu_softirq_time);
- if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
- ret = 1;
- local_irq_restore(flags);
- return ret;
-}
-
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#define sched_clock_irqtime (0)
-
-#endif
-
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -1006,7 +959,9 @@ static inline int normal_prio(struct task_struct *p)
{
int prio;
- if (task_has_rt_policy(p))
+ if (task_has_dl_policy(p))
+ prio = MAX_DL_PRIO-1;
+ else if (task_has_rt_policy(p))
prio = MAX_RT_PRIO-1 - p->rt_priority;
else
prio = __normal_prio(p);
@@ -1036,6 +991,8 @@ static int effective_prio(struct task_struct *p)
/**
* task_curr - is this task currently executing on a CPU?
* @p: the task in question.
+ *
+ * Return: 1 if the task is currently executing. 0 otherwise.
*/
inline int task_curr(const struct task_struct *p)
{
@@ -1050,7 +1007,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
if (prev_class->switched_from)
prev_class->switched_from(rq, p);
p->sched_class->switched_to(rq, p);
- } else if (oldprio != p->prio)
+ } else if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
}
@@ -1088,7 +1045,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* ttwu() will sort out the placement.
*/
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
- !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+ !(task_preempt_count(p) & PREEMPT_ACTIVE));
#ifdef CONFIG_LOCKDEP
/*
@@ -1096,7 +1053,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
*
* sched_move_task() holds both and thus holding either pins the cgroup,
- * see set_task_rq().
+ * see task_group().
*
* Furthermore, all task_rq users should acquire both locks, see
* task_rq_lock().
@@ -1109,6 +1066,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
trace_sched_migrate_task(p, new_cpu);
if (task_cpu(p) != new_cpu) {
+ if (p->sched_class->migrate_task_rq)
+ p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
}
@@ -1116,6 +1075,108 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
__set_task_cpu(p, new_cpu);
}
+static void __migrate_swap_task(struct task_struct *p, int cpu)
+{
+ if (p->on_rq) {
+ struct rq *src_rq, *dst_rq;
+
+ src_rq = task_rq(p);
+ dst_rq = cpu_rq(cpu);
+
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, cpu);
+ activate_task(dst_rq, p, 0);
+ check_preempt_curr(dst_rq, p, 0);
+ } else {
+ /*
+ * Task isn't running anymore; make it appear like we migrated
+ * it before it went to sleep. This means on wakeup we make the
+ * previous cpu our targer instead of where it really is.
+ */
+ p->wake_cpu = cpu;
+ }
+}
+
+struct migration_swap_arg {
+ struct task_struct *src_task, *dst_task;
+ int src_cpu, dst_cpu;
+};
+
+static int migrate_swap_stop(void *data)
+{
+ struct migration_swap_arg *arg = data;
+ struct rq *src_rq, *dst_rq;
+ int ret = -EAGAIN;
+
+ src_rq = cpu_rq(arg->src_cpu);
+ dst_rq = cpu_rq(arg->dst_cpu);
+
+ double_raw_lock(&arg->src_task->pi_lock,
+ &arg->dst_task->pi_lock);
+ double_rq_lock(src_rq, dst_rq);
+ if (task_cpu(arg->dst_task) != arg->dst_cpu)
+ goto unlock;
+
+ if (task_cpu(arg->src_task) != arg->src_cpu)
+ goto unlock;
+
+ if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
+ goto unlock;
+
+ if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
+ goto unlock;
+
+ __migrate_swap_task(arg->src_task, arg->dst_cpu);
+ __migrate_swap_task(arg->dst_task, arg->src_cpu);
+
+ ret = 0;
+
+unlock:
+ double_rq_unlock(src_rq, dst_rq);
+ raw_spin_unlock(&arg->dst_task->pi_lock);
+ raw_spin_unlock(&arg->src_task->pi_lock);
+
+ return ret;
+}
+
+/*
+ * Cross migrate two tasks
+ */
+int migrate_swap(struct task_struct *cur, struct task_struct *p)
+{
+ struct migration_swap_arg arg;
+ int ret = -EINVAL;
+
+ arg = (struct migration_swap_arg){
+ .src_task = cur,
+ .src_cpu = task_cpu(cur),
+ .dst_task = p,
+ .dst_cpu = task_cpu(p),
+ };
+
+ if (arg.src_cpu == arg.dst_cpu)
+ goto out;
+
+ /*
+ * These three tests are all lockless; this is OK since all of them
+ * will be re-checked with proper locks held further down the line.
+ */
+ if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
+ goto out;
+
+ if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
+ goto out;
+
+ if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
+ goto out;
+
+ trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
+ ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
+
+out:
+ return ret;
+}
+
struct migration_arg {
struct task_struct *task;
int dest_cpu;
@@ -1263,29 +1324,69 @@ EXPORT_SYMBOL_GPL(kick_process);
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
+ int nid = cpu_to_node(cpu);
+ const struct cpumask *nodemask = NULL;
+ enum { cpuset, possible, fail } state = cpuset;
int dest_cpu;
- const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
-
- /* Look for allowed, online CPU in same node. */
- for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
- if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- return dest_cpu;
- /* Any allowed, online CPU? */
- dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
- if (dest_cpu < nr_cpu_ids)
- return dest_cpu;
-
- /* No more Mr. Nice Guy. */
- dest_cpu = cpuset_cpus_allowed_fallback(p);
/*
- * Don't tell them about moving exiting tasks or
- * kernel threads (both mm NULL), since they never
- * leave kernel.
+ * If the node that the cpu is on has been offlined, cpu_to_node()
+ * will return -1. There is no cpu on the node, and we should
+ * select the cpu on the other node.
*/
- if (p->mm && printk_ratelimit()) {
- printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
- task_pid_nr(p), p->comm, cpu);
+ if (nid != -1) {
+ nodemask = cpumask_of_node(nid);
+
+ /* Look for allowed, online CPU in same node. */
+ for_each_cpu(dest_cpu, nodemask) {
+ if (!cpu_online(dest_cpu))
+ continue;
+ if (!cpu_active(dest_cpu))
+ continue;
+ if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ return dest_cpu;
+ }
+ }
+
+ for (;;) {
+ /* Any allowed, online CPU? */
+ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
+ if (!cpu_online(dest_cpu))
+ continue;
+ if (!cpu_active(dest_cpu))
+ continue;
+ goto out;
+ }
+
+ switch (state) {
+ case cpuset:
+ /* No more Mr. Nice Guy. */
+ cpuset_cpus_allowed_fallback(p);
+ state = possible;
+ break;
+
+ case possible:
+ do_set_cpus_allowed(p, cpu_possible_mask);
+ state = fail;
+ break;
+
+ case fail:
+ BUG();
+ break;
+ }
+ }
+
+out:
+ if (state != cpuset) {
+ /*
+ * Don't tell them about moving exiting tasks or
+ * kernel threads (both mm NULL), since they never
+ * leave kernel.
+ */
+ if (p->mm && printk_ratelimit()) {
+ printk_deferred("process %d (%s) no longer affine to cpu%d\n",
+ task_pid_nr(p), p->comm, cpu);
+ }
}
return dest_cpu;
@@ -1295,9 +1396,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
*/
static inline
-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
- int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -1379,8 +1480,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
- trace_sched_wakeup(p, true);
check_preempt_curr(rq, p, wake_flags);
+ trace_sched_wakeup(p, true);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
@@ -1388,13 +1489,14 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
p->sched_class->task_woken(rq, p);
if (rq->idle_stamp) {
- u64 delta = rq->clock - rq->idle_stamp;
- u64 max = 2*sysctl_sched_migration_cost;
+ u64 delta = rq_clock(rq) - rq->idle_stamp;
+ u64 max = 2*rq->max_idle_balance_cost;
- if (delta > max)
+ update_avg(&rq->avg_idle, delta);
+
+ if (rq->avg_idle > max)
rq->avg_idle = max;
- else
- update_avg(&rq->avg_idle, delta);
+
rq->idle_stamp = 0;
}
#endif
@@ -1425,6 +1527,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
rq = __task_rq_lock(p);
if (p->on_rq) {
+ /* check_preempt_curr() may use rq clock */
+ update_rq_clock(rq);
ttwu_do_wakeup(rq, p, wake_flags);
ret = 1;
}
@@ -1434,13 +1538,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
struct task_struct *p;
+ unsigned long flags;
- raw_spin_lock(&rq->lock);
+ if (!llist)
+ return;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
while (llist) {
p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1448,12 +1556,21 @@ static void sched_ttwu_pending(void)
ttwu_do_activate(rq, p, 0);
}
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
void scheduler_ipi(void)
{
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
+ /*
+ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+ * TIF_NEED_RESCHED remotely (for the first time) will also send
+ * this IPI.
+ */
+ preempt_fold_need_resched();
+
+ if (llist_empty(&this_rq()->wake_list)
+ && !tick_nohz_full_cpu(smp_processor_id())
+ && !got_nohz_idle_kick())
return;
/*
@@ -1470,12 +1587,13 @@ void scheduler_ipi(void)
* somewhat pessimize the simple resched case.
*/
irq_enter();
+ tick_nohz_full_check();
sched_ttwu_pending();
/*
* Check if someone kicked us for doing the nohz idle load balance.
*/
- if (unlikely(got_nohz_idle_kick() && !need_resched())) {
+ if (unlikely(got_nohz_idle_kick())) {
this_rq()->idle_balance = 1;
raise_softirq_irqoff(SCHED_SOFTIRQ);
}
@@ -1484,30 +1602,17 @@ void scheduler_ipi(void)
static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
- smp_send_reschedule(cpu);
-}
-
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
-{
- struct rq *rq;
- int ret = 0;
+ struct rq *rq = cpu_rq(cpu);
- rq = __task_rq_lock(p);
- if (p->on_cpu) {
- ttwu_activate(rq, p, ENQUEUE_WAKEUP);
- ttwu_do_wakeup(rq, p, wake_flags);
- ret = 1;
+ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
+ if (!set_nr_if_polling(rq->idle))
+ smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
}
- __task_rq_unlock(rq);
-
- return ret;
-
}
-#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
-static inline int ttwu_share_cache(int this_cpu, int that_cpu)
+bool cpus_share_cache(int this_cpu, int that_cpu)
{
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
@@ -1518,7 +1623,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
struct rq *rq = cpu_rq(cpu);
#if defined(CONFIG_SMP)
- if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
+ if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
sched_clock_cpu(cpu); /* sync clocks x-cpu */
ttwu_queue_remote(p, cpu);
return;
@@ -1542,7 +1647,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
* the simpler "current->state = TASK_RUNNING" to mark yourself
* runnable without the overhead of this.
*
- * Returns %true if @p was woken up, %false if it was already running
+ * Return: %true if @p was woken up, %false if it was already running.
* or @state didn't match @p's state.
*/
static int
@@ -1551,7 +1656,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
unsigned long flags;
int cpu, success = 0;
- smp_wmb();
+ /*
+ * If we are going to wake up a thread waiting for CONDITION we
+ * need to ensure that CONDITION=1 done by the caller can not be
+ * reordered with p->state check below. This pairs with mb() in
+ * set_current_state() the waiting thread does.
+ */
+ smp_mb__before_spinlock();
raw_spin_lock_irqsave(&p->pi_lock, flags);
if (!(p->state & state))
goto out;
@@ -1567,21 +1678,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* If the owning (remote) cpu is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
*/
- while (p->on_cpu) {
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- /*
- * In case the architecture enables interrupts in
- * context_switch(), we cannot busy wait, since that
- * would lead to deadlocks when an interrupt hits and
- * tries to wake up @prev. So bail and do a complete
- * remote wakeup.
- */
- if (ttwu_activate_remote(p, wake_flags))
- goto stat;
-#else
+ while (p->on_cpu)
cpu_relax();
-#endif
- }
/*
* Pairs with the smp_wmb() in finish_lock_switch().
*/
@@ -1593,7 +1691,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (p->sched_class->task_waking)
p->sched_class->task_waking(p);
- cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+ cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
@@ -1621,8 +1719,10 @@ static void try_to_wake_up_local(struct task_struct *p)
{
struct rq *rq = task_rq(p);
- BUG_ON(rq != this_rq());
- BUG_ON(p == current);
+ if (WARN_ON_ONCE(rq != this_rq()) ||
+ WARN_ON_ONCE(p == current))
+ return;
+
lockdep_assert_held(&rq->lock);
if (!raw_spin_trylock(&p->pi_lock)) {
@@ -1648,15 +1748,17 @@ out:
* @p: The process to be woken up.
*
* Attempt to wake up the nominated process and move it to the set of runnable
- * processes. Returns 1 if the process was woken up, 0 if it was already
- * running.
+ * processes.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
*/
int wake_up_process(struct task_struct *p)
{
- return try_to_wake_up(p, TASK_ALL, 0);
+ WARN_ON(task_is_stopped_or_traced(p));
+ return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);
@@ -1671,7 +1773,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
*
* __sched_fork() is basic setup used by init_idle() too:
*/
-static void __sched_fork(struct task_struct *p)
+static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
p->on_rq = 0;
@@ -1687,22 +1789,94 @@ static void __sched_fork(struct task_struct *p)
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
+ RB_CLEAR_NODE(&p->dl.rb_node);
+ hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ p->dl.dl_runtime = p->dl.runtime = 0;
+ p->dl.dl_deadline = p->dl.deadline = 0;
+ p->dl.dl_period = 0;
+ p->dl.flags = 0;
+
INIT_LIST_HEAD(&p->rt.run_list);
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+ if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+ p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ p->mm->numa_scan_seq = 0;
+ }
+
+ if (clone_flags & CLONE_VM)
+ p->numa_preferred_nid = current->numa_preferred_nid;
+ else
+ p->numa_preferred_nid = -1;
+
+ p->node_stamp = 0ULL;
+ p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_work.next = &p->numa_work;
+ p->numa_faults_memory = NULL;
+ p->numa_faults_buffer_memory = NULL;
+ p->last_task_numa_placement = 0;
+ p->last_sum_exec_runtime = 0;
+
+ INIT_LIST_HEAD(&p->numa_entry);
+ p->numa_group = NULL;
+#endif /* CONFIG_NUMA_BALANCING */
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+#ifdef CONFIG_SCHED_DEBUG
+void set_numabalancing_state(bool enabled)
+{
+ if (enabled)
+ sched_feat_set("NUMA");
+ else
+ sched_feat_set("NO_NUMA");
+}
+#else
+__read_mostly bool numabalancing_enabled;
+
+void set_numabalancing_state(bool enabled)
+{
+ numabalancing_enabled = enabled;
}
+#endif /* CONFIG_SCHED_DEBUG */
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_numa_balancing(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = numabalancing_enabled;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ set_numabalancing_state(state);
+ return err;
+}
+#endif
+#endif
/*
* fork()/clone()-time setup:
*/
-void sched_fork(struct task_struct *p)
+int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
unsigned long flags;
int cpu = get_cpu();
- __sched_fork(p);
+ __sched_fork(clone_flags, p);
/*
* We mark the process as running here. This guarantees that
* nobody will actually run it, and a signal or other external
@@ -1719,7 +1893,7 @@ void sched_fork(struct task_struct *p)
* Revert to default priority/policy on fork if requested.
*/
if (unlikely(p->sched_reset_on_fork)) {
- if (task_has_rt_policy(p)) {
+ if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
p->policy = SCHED_NORMAL;
p->static_prio = NICE_TO_PRIO(0);
p->rt_priority = 0;
@@ -1736,8 +1910,14 @@ void sched_fork(struct task_struct *p)
p->sched_reset_on_fork = 0;
}
- if (!rt_prio(p->prio))
+ if (dl_prio(p->prio)) {
+ put_cpu();
+ return -EAGAIN;
+ } else if (rt_prio(p->prio)) {
+ p->sched_class = &rt_sched_class;
+ } else {
p->sched_class = &fair_sched_class;
+ }
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
@@ -1760,18 +1940,128 @@ void sched_fork(struct task_struct *p)
#if defined(CONFIG_SMP)
p->on_cpu = 0;
#endif
-#ifdef CONFIG_PREEMPT_COUNT
- /* Want to start with kernel preemption disabled. */
- task_thread_info(p)->preempt_count = 1;
-#endif
+ init_task_preempt_count(p);
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
+ RB_CLEAR_NODE(&p->pushable_dl_tasks);
#endif
put_cpu();
+ return 0;
+}
+
+unsigned long to_ratio(u64 period, u64 runtime)
+{
+ if (runtime == RUNTIME_INF)
+ return 1ULL << 20;
+
+ /*
+ * Doing this here saves a lot of checks in all
+ * the calling paths, and returning zero seems
+ * safe for them anyway.
+ */
+ if (period == 0)
+ return 0;
+
+ return div64_u64(runtime << 20, period);
+}
+
+#ifdef CONFIG_SMP
+inline struct dl_bw *dl_bw_of(int i)
+{
+ return &cpu_rq(i)->rd->dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+ struct root_domain *rd = cpu_rq(i)->rd;
+ int cpus = 0;
+
+ for_each_cpu_and(i, rd->span, cpu_active_mask)
+ cpus++;
+
+ return cpus;
+}
+#else
+inline struct dl_bw *dl_bw_of(int i)
+{
+ return &cpu_rq(i)->dl.dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+ return 1;
+}
+#endif
+
+static inline
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+{
+ dl_b->total_bw -= tsk_bw;
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+{
+ dl_b->total_bw += tsk_bw;
+}
+
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+ return dl_b->bw != -1 &&
+ dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}
/*
+ * We must be sure that accepting a new task (or allowing changing the
+ * parameters of an existing one) is consistent with the bandwidth
+ * constraints. If yes, this function also accordingly updates the currently
+ * allocated bandwidth to reflect the new situation.
+ *
+ * This function is called while holding p's rq->lock.
+ */
+static int dl_overflow(struct task_struct *p, int policy,
+ const struct sched_attr *attr)
+{
+
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+ u64 period = attr->sched_period ?: attr->sched_deadline;
+ u64 runtime = attr->sched_runtime;
+ u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
+ int cpus, err = -1;
+
+ if (new_bw == p->dl.dl_bw)
+ return 0;
+
+ /*
+ * Either if a task, enters, leave, or stays -deadline but changes
+ * its parameters, we may need to update accordingly the total
+ * allocated bandwidth of the container.
+ */
+ raw_spin_lock(&dl_b->lock);
+ cpus = dl_bw_cpus(task_cpu(p));
+ if (dl_policy(policy) && !task_has_dl_policy(p) &&
+ !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+ __dl_add(dl_b, new_bw);
+ err = 0;
+ } else if (dl_policy(policy) && task_has_dl_policy(p) &&
+ !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+ __dl_clear(dl_b, p->dl.dl_bw);
+ __dl_add(dl_b, new_bw);
+ err = 0;
+ } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
+ __dl_clear(dl_b, p->dl.dl_bw);
+ err = 0;
+ }
+ raw_spin_unlock(&dl_b->lock);
+
+ return err;
+}
+
+extern void init_dl_bw(struct dl_bw *dl_b);
+
+/*
* wake_up_new_task - wake up a newly created task for the first time.
*
* This function will do some initial scheduler statistics housekeeping
@@ -1790,9 +2080,11 @@ void wake_up_new_task(struct task_struct *p)
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
*/
- set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
+ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
+ /* Initialize new task's runnable average */
+ init_task_runnable_average(p);
rq = __task_rq_lock(p);
activate_task(rq, p, 0);
p->on_rq = 1;
@@ -1832,9 +2124,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
struct preempt_notifier *notifier;
- struct hlist_node *node;
- hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
@@ -1843,9 +2134,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
struct preempt_notifier *notifier;
- struct hlist_node *node;
- hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
notifier->ops->sched_out(notifier, next);
}
@@ -1880,12 +2170,12 @@ static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
- sched_info_switch(prev, next);
+ trace_sched_switch(prev, next);
+ sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
- trace_sched_switch(prev, next);
}
/**
@@ -1923,21 +2213,19 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
* Manfred Spraul <manfred@colorfullife.com>
*/
prev_state = prev->state;
+ vtime_task_switch(prev);
finish_arch_switch(prev);
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- local_irq_disable();
-#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
perf_event_task_sched_in(prev, current);
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- local_irq_enable();
-#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
finish_lock_switch(rq, prev);
- trace_sched_stat_sleeptime(current, rq->clock);
+ finish_arch_post_lock_switch();
fire_sched_in_preempt_notifiers(current);
if (mm)
mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
+ if (prev->sched_class->task_dead)
+ prev->sched_class->task_dead(prev);
+
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
@@ -1945,17 +2233,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
kprobe_flush_task(prev);
put_task_struct(prev);
}
+
+ tick_nohz_task_switch(current);
}
#ifdef CONFIG_SMP
-/* assumes rq->lock is held */
-static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
-{
- if (prev->sched_class->pre_schedule)
- prev->sched_class->pre_schedule(rq, prev);
-}
-
/* rq->lock is NOT held, but preemption is disabled */
static inline void post_schedule(struct rq *rq)
{
@@ -1973,10 +2256,6 @@ static inline void post_schedule(struct rq *rq)
#else
-static inline void pre_schedule(struct rq *rq, struct task_struct *p)
-{
-}
-
static inline void post_schedule(struct rq *rq)
{
}
@@ -1987,7 +2266,7 @@ static inline void post_schedule(struct rq *rq)
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
*/
-asmlinkage void schedule_tail(struct task_struct *prev)
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
struct rq *rq = this_rq();
@@ -2050,6 +2329,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif
+ context_tracking_task_switch(prev, next);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
@@ -2063,11 +2343,10 @@ context_switch(struct rq *rq, struct task_struct *prev,
}
/*
- * nr_running, nr_uninterruptible and nr_context_switches:
+ * nr_running and nr_context_switches:
*
* externally visible scheduler statistics: current number of runnable
- * threads, current number of uninterruptible-sleeping threads, total
- * number of context switches performed since bootup.
+ * threads, total number of context switches performed since bootup.
*/
unsigned long nr_running(void)
{
@@ -2079,23 +2358,6 @@ unsigned long nr_running(void)
return sum;
}
-unsigned long nr_uninterruptible(void)
-{
- unsigned long i, sum = 0;
-
- for_each_possible_cpu(i)
- sum += cpu_rq(i)->nr_uninterruptible;
-
- /*
- * Since we read the counters lockless, it might be slightly
- * inaccurate. Do not allow it to go below zero though:
- */
- if (unlikely((long)sum < 0))
- sum = 0;
-
- return sum;
-}
-
unsigned long long nr_context_switches(void)
{
int i;
@@ -2123,385 +2385,6 @@ unsigned long nr_iowait_cpu(int cpu)
return atomic_read(&this->nr_iowait);
}
-unsigned long this_cpu_load(void)
-{
- struct rq *this = this_rq();
- return this->cpu_load[0];
-}
-
-
-/* Variables and functions for calc_load */
-static atomic_long_t calc_load_tasks;
-static unsigned long calc_load_update;
-unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun);
-
-static long calc_load_fold_active(struct rq *this_rq)
-{
- long nr_active, delta = 0;
-
- nr_active = this_rq->nr_running;
- nr_active += (long) this_rq->nr_uninterruptible;
-
- if (nr_active != this_rq->calc_load_active) {
- delta = nr_active - this_rq->calc_load_active;
- this_rq->calc_load_active = nr_active;
- }
-
- return delta;
-}
-
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
- load *= exp;
- load += active * (FIXED_1 - exp);
- load += 1UL << (FSHIFT - 1);
- return load >> FSHIFT;
-}
-
-#ifdef CONFIG_NO_HZ
-/*
- * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
- *
- * When making the ILB scale, we should try to pull this in as well.
- */
-static atomic_long_t calc_load_tasks_idle;
-
-void calc_load_account_idle(struct rq *this_rq)
-{
- long delta;
-
- delta = calc_load_fold_active(this_rq);
- if (delta)
- atomic_long_add(delta, &calc_load_tasks_idle);
-}
-
-static long calc_load_fold_idle(void)
-{
- long delta = 0;
-
- /*
- * Its got a race, we don't care...
- */
- if (atomic_long_read(&calc_load_tasks_idle))
- delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
-
- return delta;
-}
-
-/**
- * fixed_power_int - compute: x^n, in O(log n) time
- *
- * @x: base of the power
- * @frac_bits: fractional bits of @x
- * @n: power to raise @x to.
- *
- * By exploiting the relation between the definition of the natural power
- * function: x^n := x*x*...*x (x multiplied by itself for n times), and
- * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
- * (where: n_i \elem {0, 1}, the binary vector representing n),
- * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
- * of course trivially computable in O(log_2 n), the length of our binary
- * vector.
- */
-static unsigned long
-fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
-{
- unsigned long result = 1UL << frac_bits;
-
- if (n) for (;;) {
- if (n & 1) {
- result *= x;
- result += 1UL << (frac_bits - 1);
- result >>= frac_bits;
- }
- n >>= 1;
- if (!n)
- break;
- x *= x;
- x += 1UL << (frac_bits - 1);
- x >>= frac_bits;
- }
-
- return result;
-}
-
-/*
- * a1 = a0 * e + a * (1 - e)
- *
- * a2 = a1 * e + a * (1 - e)
- * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
- * = a0 * e^2 + a * (1 - e) * (1 + e)
- *
- * a3 = a2 * e + a * (1 - e)
- * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
- * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
- *
- * ...
- *
- * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
- * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
- * = a0 * e^n + a * (1 - e^n)
- *
- * [1] application of the geometric series:
- *
- * n 1 - x^(n+1)
- * S_n := \Sum x^i = -------------
- * i=0 1 - x
- */
-static unsigned long
-calc_load_n(unsigned long load, unsigned long exp,
- unsigned long active, unsigned int n)
-{
-
- return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
-}
-
-/*
- * NO_HZ can leave us missing all per-cpu ticks calling
- * calc_load_account_active(), but since an idle CPU folds its delta into
- * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
- * in the pending idle delta if our idle period crossed a load cycle boundary.
- *
- * Once we've updated the global active value, we need to apply the exponential
- * weights adjusted to the number of cycles missed.
- */
-static void calc_global_nohz(unsigned long ticks)
-{
- long delta, active, n;
-
- if (time_before(jiffies, calc_load_update))
- return;
-
- /*
- * If we crossed a calc_load_update boundary, make sure to fold
- * any pending idle changes, the respective CPUs might have
- * missed the tick driven calc_load_account_active() update
- * due to NO_HZ.
- */
- delta = calc_load_fold_idle();
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-
- /*
- * If we were idle for multiple load cycles, apply them.
- */
- if (ticks >= LOAD_FREQ) {
- n = ticks / LOAD_FREQ;
-
- active = atomic_long_read(&calc_load_tasks);
- active = active > 0 ? active * FIXED_1 : 0;
-
- avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
- avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
- avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
-
- calc_load_update += n * LOAD_FREQ;
- }
-
- /*
- * Its possible the remainder of the above division also crosses
- * a LOAD_FREQ period, the regular check in calc_global_load()
- * which comes after this will take care of that.
- *
- * Consider us being 11 ticks before a cycle completion, and us
- * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
- * age us 4 cycles, and the test in calc_global_load() will
- * pick up the final one.
- */
-}
-#else
-void calc_load_account_idle(struct rq *this_rq)
-{
-}
-
-static inline long calc_load_fold_idle(void)
-{
- return 0;
-}
-
-static void calc_global_nohz(unsigned long ticks)
-{
-}
-#endif
-
-/**
- * get_avenrun - get the load average array
- * @loads: pointer to dest load array
- * @offset: offset to add
- * @shift: shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
- loads[0] = (avenrun[0] + offset) << shift;
- loads[1] = (avenrun[1] + offset) << shift;
- loads[2] = (avenrun[2] + offset) << shift;
-}
-
-/*
- * calc_load - update the avenrun load estimates 10 ticks after the
- * CPUs have updated calc_load_tasks.
- */
-void calc_global_load(unsigned long ticks)
-{
- long active;
-
- calc_global_nohz(ticks);
-
- if (time_before(jiffies, calc_load_update + 10))
- return;
-
- active = atomic_long_read(&calc_load_tasks);
- active = active > 0 ? active * FIXED_1 : 0;
-
- avenrun[0] = calc_load(avenrun[0], EXP_1, active);
- avenrun[1] = calc_load(avenrun[1], EXP_5, active);
- avenrun[2] = calc_load(avenrun[2], EXP_15, active);
-
- calc_load_update += LOAD_FREQ;
-}
-
-/*
- * Called from update_cpu_load() to periodically update this CPU's
- * active count.
- */
-static void calc_load_account_active(struct rq *this_rq)
-{
- long delta;
-
- if (time_before(jiffies, this_rq->calc_load_update))
- return;
-
- delta = calc_load_fold_active(this_rq);
- delta += calc_load_fold_idle();
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-
- this_rq->calc_load_update += LOAD_FREQ;
-}
-
-/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
- *
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
- *
- * The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
- */
-#define DEGRADE_SHIFT 7
-static const unsigned char
- degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
- degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
- {0, 0, 0, 0, 0, 0, 0, 0},
- {64, 32, 8, 0, 0, 0, 0, 0},
- {96, 72, 40, 12, 1, 0, 0},
- {112, 98, 75, 43, 15, 1, 0},
- {120, 112, 98, 76, 45, 16, 2} };
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
- int j = 0;
-
- if (!missed_updates)
- return load;
-
- if (missed_updates >= degrade_zero_ticks[idx])
- return 0;
-
- if (idx == 1)
- return load >> missed_updates;
-
- while (missed_updates) {
- if (missed_updates % 2)
- load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
- missed_updates >>= 1;
- j++;
- }
- return load;
-}
-
-/*
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
- */
-void update_cpu_load(struct rq *this_rq)
-{
- unsigned long this_load = this_rq->load.weight;
- unsigned long curr_jiffies = jiffies;
- unsigned long pending_updates;
- int i, scale;
-
- this_rq->nr_load_updates++;
-
- /* Avoid repeated calls on same jiffy, when moving in and out of idle */
- if (curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- this_rq->last_load_update_tick = curr_jiffies;
-
- /* Update our load: */
- this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
- for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
- unsigned long old_load, new_load;
-
- /* scale is effectively 1 << i now, and >> i divides by scale */
-
- old_load = this_rq->cpu_load[i];
- old_load = decay_load_missed(old_load, pending_updates - 1, i);
- new_load = this_load;
- /*
- * Round up the averaging division if load is increasing. This
- * prevents us from getting stuck on 9 if the load is 10, for
- * example.
- */
- if (new_load > old_load)
- new_load += scale - 1;
-
- this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
- }
-
- sched_avg_update(this_rq);
-}
-
-static void update_cpu_load_active(struct rq *this_rq)
-{
- update_cpu_load(this_rq);
-
- calc_load_account_active(this_rq);
-}
-
#ifdef CONFIG_SMP
/*
@@ -2515,7 +2398,7 @@ void sched_exec(void)
int dest_cpu;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
if (dest_cpu == smp_processor_id())
goto unlock;
@@ -2550,7 +2433,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
if (task_current(rq, p)) {
update_rq_clock(rq);
- ns = rq->clock_task - p->se.exec_start;
+ ns = rq_clock_task(rq) - p->se.exec_start;
if ((s64)ns < 0)
ns = 0;
}
@@ -2582,404 +2465,26 @@ unsigned long long task_sched_runtime(struct task_struct *p)
struct rq *rq;
u64 ns = 0;
- rq = task_rq_lock(p, &flags);
- ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
- task_rq_unlock(rq, p, &flags);
-
- return ns;
-}
-
-#ifdef CONFIG_CGROUP_CPUACCT
-struct cgroup_subsys cpuacct_subsys;
-struct cpuacct root_cpuacct;
-#endif
-
-static inline void task_group_account_field(struct task_struct *p, int index,
- u64 tmp)
-{
-#ifdef CONFIG_CGROUP_CPUACCT
- struct kernel_cpustat *kcpustat;
- struct cpuacct *ca;
-#endif
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
/*
- * Since all updates are sure to touch the root cgroup, we
- * get ourselves ahead and touch it first. If the root cgroup
- * is the only cgroup, then nothing else should be necessary.
+ * 64-bit doesn't need locks to atomically read a 64bit value.
+ * So we have a optimization chance when the task's delta_exec is 0.
+ * Reading ->on_cpu is racy, but this is ok.
*
+ * If we race with it leaving cpu, we'll take a lock. So we're correct.
+ * If we race with it entering cpu, unaccounted time is 0. This is
+ * indistinguishable from the read occurring a few cycles earlier.
*/
- __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
-
-#ifdef CONFIG_CGROUP_CPUACCT
- if (unlikely(!cpuacct_subsys.active))
- return;
-
- rcu_read_lock();
- ca = task_ca(p);
- while (ca && (ca != &root_cpuacct)) {
- kcpustat = this_cpu_ptr(ca->cpustat);
- kcpustat->cpustat[index] += tmp;
- ca = parent_ca(ca);
- }
- rcu_read_unlock();
+ if (!p->on_cpu)
+ return p->se.sum_exec_runtime;
#endif
-}
-
-
-/*
- * Account user cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-void account_user_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
-{
- int index;
-
- /* Add user time to process. */
- p->utime += cputime;
- p->utimescaled += cputime_scaled;
- account_group_user_time(p, cputime);
-
- index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
-
- /* Add user time to cpustat. */
- task_group_account_field(p, index, (__force u64) cputime);
-
- /* Account for user time used */
- acct_update_integrals(p);
-}
-
-/*
- * Account guest cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in virtual machine since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-static void account_guest_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
-
- /* Add guest time to process. */
- p->utime += cputime;
- p->utimescaled += cputime_scaled;
- account_group_user_time(p, cputime);
- p->gtime += cputime;
-
- /* Add guest time to cpustat. */
- if (TASK_NICE(p) > 0) {
- cpustat[CPUTIME_NICE] += (__force u64) cputime;
- cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
- } else {
- cpustat[CPUTIME_USER] += (__force u64) cputime;
- cpustat[CPUTIME_GUEST] += (__force u64) cputime;
- }
-}
-
-/*
- * Account system cpu time to a process and desired cpustat field
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- * @target_cputime64: pointer to cpustat field that has to be updated
- */
-static inline
-void __account_system_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled, int index)
-{
- /* Add system time to process. */
- p->stime += cputime;
- p->stimescaled += cputime_scaled;
- account_group_system_time(p, cputime);
-
- /* Add system time to cpustat. */
- task_group_account_field(p, index, (__force u64) cputime);
-
- /* Account for system time used */
- acct_update_integrals(p);
-}
-
-/*
- * Account system cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @hardirq_offset: the offset to subtract from hardirq_count()
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-void account_system_time(struct task_struct *p, int hardirq_offset,
- cputime_t cputime, cputime_t cputime_scaled)
-{
- int index;
-
- if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
- account_guest_time(p, cputime, cputime_scaled);
- return;
- }
-
- if (hardirq_count() - hardirq_offset)
- index = CPUTIME_IRQ;
- else if (in_serving_softirq())
- index = CPUTIME_SOFTIRQ;
- else
- index = CPUTIME_SYSTEM;
-
- __account_system_time(p, cputime, cputime_scaled, index);
-}
-
-/*
- * Account for involuntary wait time.
- * @cputime: the cpu time spent in involuntary wait
- */
-void account_steal_time(cputime_t cputime)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
-
- cpustat[CPUTIME_STEAL] += (__force u64) cputime;
-}
-
-/*
- * Account for idle time.
- * @cputime: the cpu time spent in idle wait
- */
-void account_idle_time(cputime_t cputime)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- struct rq *rq = this_rq();
-
- if (atomic_read(&rq->nr_iowait) > 0)
- cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
- else
- cpustat[CPUTIME_IDLE] += (__force u64) cputime;
-}
-
-static __always_inline bool steal_account_process_tick(void)
-{
-#ifdef CONFIG_PARAVIRT
- if (static_branch(&paravirt_steal_enabled)) {
- u64 steal, st = 0;
-
- steal = paravirt_steal_clock(smp_processor_id());
- steal -= this_rq()->prev_steal_time;
-
- st = steal_ticks(steal);
- this_rq()->prev_steal_time += st * TICK_NSEC;
-
- account_steal_time(st);
- return st;
- }
-#endif
- return false;
-}
-
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-/*
- * Account a tick to a process and cpustat
- * @p: the process that the cpu time gets accounted to
- * @user_tick: is the tick from userspace
- * @rq: the pointer to rq
- *
- * Tick demultiplexing follows the order
- * - pending hardirq update
- * - pending softirq update
- * - user_time
- * - idle_time
- * - system time
- * - check for guest_time
- * - else account as system_time
- *
- * Check for hardirq is done both for system and user time as there is
- * no timer going off while we are on hardirq and hence we may never get an
- * opportunity to update it solely in system time.
- * p->stime and friends are only updated on system time and not on irq
- * softirq as those do not count in task exec_runtime any more.
- */
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq)
-{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
- u64 *cpustat = kcpustat_this_cpu->cpustat;
-
- if (steal_account_process_tick())
- return;
-
- if (irqtime_account_hi_update()) {
- cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
- } else if (irqtime_account_si_update()) {
- cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
- } else if (this_cpu_ksoftirqd() == p) {
- /*
- * ksoftirqd time do not get accounted in cpu_softirq_time.
- * So, we have to handle it separately here.
- * Also, p->stime needs to be updated for ksoftirqd.
- */
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SOFTIRQ);
- } else if (user_tick) {
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
- } else if (p == rq->idle) {
- account_idle_time(cputime_one_jiffy);
- } else if (p->flags & PF_VCPU) { /* System time or guest time */
- account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
- } else {
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SYSTEM);
- }
-}
-
-static void irqtime_account_idle_ticks(int ticks)
-{
- int i;
- struct rq *rq = this_rq();
-
- for (i = 0; i < ticks; i++)
- irqtime_account_process_tick(current, 0, rq);
-}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static void irqtime_account_idle_ticks(int ticks) {}
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq) {}
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-/*
- * Account a single tick of cpu time.
- * @p: the process that the cpu time gets accounted to
- * @user_tick: indicates if the tick is a user or a system tick
- */
-void account_process_tick(struct task_struct *p, int user_tick)
-{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
- struct rq *rq = this_rq();
-
- if (sched_clock_irqtime) {
- irqtime_account_process_tick(p, user_tick, rq);
- return;
- }
-
- if (steal_account_process_tick())
- return;
-
- if (user_tick)
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
- else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
- account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
- one_jiffy_scaled);
- else
- account_idle_time(cputime_one_jiffy);
-}
-
-/*
- * Account multiple ticks of steal time.
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
- account_steal_time(jiffies_to_cputime(ticks));
-}
-
-/*
- * Account multiple ticks of idle time.
- * @ticks: number of stolen ticks
- */
-void account_idle_ticks(unsigned long ticks)
-{
-
- if (sched_clock_irqtime) {
- irqtime_account_idle_ticks(ticks);
- return;
- }
-
- account_idle_time(jiffies_to_cputime(ticks));
-}
-
-#endif
-
-/*
- * Use precise platform statistics if available:
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- *ut = p->utime;
- *st = p->stime;
-}
-
-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct task_cputime cputime;
-
- thread_group_cputime(p, &cputime);
-
- *ut = cputime.utime;
- *st = cputime.stime;
-}
-#else
-#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
-#endif
-
-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- cputime_t rtime, utime = p->utime, total = utime + p->stime;
-
- /*
- * Use CFS's precise accounting:
- */
- rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
-
- if (total) {
- u64 temp = (__force u64) rtime;
-
- temp *= (__force u64) utime;
- do_div(temp, (__force u32) total);
- utime = (__force cputime_t) temp;
- } else
- utime = rtime;
-
- /*
- * Compare with previous values, to keep monotonicity:
- */
- p->prev_utime = max(p->prev_utime, utime);
- p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
-
- *ut = p->prev_utime;
- *st = p->prev_stime;
-}
-
-/*
- * Must be called with siglock held.
- */
-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct signal_struct *sig = p->signal;
- struct task_cputime cputime;
- cputime_t rtime, utime, total;
-
- thread_group_cputime(p, &cputime);
-
- total = cputime.utime + cputime.stime;
- rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
-
- if (total) {
- u64 temp = (__force u64) rtime;
-
- temp *= (__force u64) cputime.utime;
- do_div(temp, (__force u32) total);
- utime = (__force cputime_t) temp;
- } else
- utime = rtime;
-
- sig->prev_utime = max(sig->prev_utime, utime);
- sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
+ rq = task_rq_lock(p, &flags);
+ ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+ task_rq_unlock(rq, p, &flags);
- *ut = sig->prev_utime;
- *st = sig->prev_stime;
+ return ns;
}
-#endif
/*
* This function gets called by the timer code, with HZ frequency.
@@ -2995,18 +2500,47 @@ void scheduler_tick(void)
raw_spin_lock(&rq->lock);
update_rq_clock(rq);
- update_cpu_load_active(rq);
curr->sched_class->task_tick(rq, curr, 0);
+ update_cpu_load_active(rq);
raw_spin_unlock(&rq->lock);
perf_event_task_tick();
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
- trigger_load_balance(rq, cpu);
+ trigger_load_balance(rq);
#endif
+ rq_last_tick_reset(rq);
}
+#ifdef CONFIG_NO_HZ_FULL
+/**
+ * scheduler_tick_max_deferment
+ *
+ * Keep at least one tick per second when a single
+ * active task is running because the scheduler doesn't
+ * yet completely support full dynticks environment.
+ *
+ * This makes sure that uptime, CFS vruntime, load
+ * balancing, etc... continue to move forward, even
+ * with a very low granularity.
+ *
+ * Return: Maximum deferment in nanoseconds.
+ */
+u64 scheduler_tick_max_deferment(void)
+{
+ struct rq *rq = this_rq();
+ unsigned long next, now = ACCESS_ONCE(jiffies);
+
+ next = rq->last_sched_tick + HZ;
+
+ if (time_before_eq(next, now))
+ return 0;
+
+ return jiffies_to_nsecs(next - now);
+}
+#endif
+
notrace unsigned long get_parent_ip(unsigned long addr)
{
if (in_lock_functions(addr)) {
@@ -3020,7 +2554,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
-void __kprobes add_preempt_count(int val)
+void preempt_count_add(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -3029,7 +2563,7 @@ void __kprobes add_preempt_count(int val)
if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
return;
#endif
- preempt_count() += val;
+ __preempt_count_add(val);
#ifdef CONFIG_DEBUG_PREEMPT
/*
* Spinlock count overflowing soon?
@@ -3037,12 +2571,18 @@ void __kprobes add_preempt_count(int val)
DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
PREEMPT_MASK - 10);
#endif
- if (preempt_count() == val)
- trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ if (preempt_count() == val) {
+ unsigned long ip = get_parent_ip(CALLER_ADDR1);
+#ifdef CONFIG_DEBUG_PREEMPT
+ current->preempt_disable_ip = ip;
+#endif
+ trace_preempt_off(CALLER_ADDR0, ip);
+ }
}
-EXPORT_SYMBOL(add_preempt_count);
+EXPORT_SYMBOL(preempt_count_add);
+NOKPROBE_SYMBOL(preempt_count_add);
-void __kprobes sub_preempt_count(int val)
+void preempt_count_sub(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -3060,9 +2600,10 @@ void __kprobes sub_preempt_count(int val)
if (preempt_count() == val)
trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
- preempt_count() -= val;
+ __preempt_count_sub(val);
}
-EXPORT_SYMBOL(sub_preempt_count);
+EXPORT_SYMBOL(preempt_count_sub);
+NOKPROBE_SYMBOL(preempt_count_sub);
#endif
@@ -3071,8 +2612,6 @@ EXPORT_SYMBOL(sub_preempt_count);
*/
static noinline void __schedule_bug(struct task_struct *prev)
{
- struct pt_regs *regs = get_irq_regs();
-
if (oops_in_progress)
return;
@@ -3083,11 +2622,15 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
-
- if (regs)
- show_regs(regs);
- else
- dump_stack();
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (in_atomic_preempt_off()) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(current->preempt_disable_ip);
+ pr_cont("\n");
+ }
+#endif
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
/*
@@ -3097,10 +2640,10 @@ static inline void schedule_debug(struct task_struct *prev)
{
/*
* Test if we are atomic. Since do_exit() needs to call into
- * schedule() atomically, we ignore that path for now.
- * Otherwise, whine if we are scheduling when we should not be.
+ * schedule() atomically, we ignore that path. Otherwise whine
+ * if we are scheduling when we should not.
*/
- if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
+ if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
__schedule_bug(prev);
rcu_sleep_check();
@@ -3109,36 +2652,40 @@ static inline void schedule_debug(struct task_struct *prev)
schedstat_inc(this_rq(), sched_count);
}
-static void put_prev_task(struct rq *rq, struct task_struct *prev)
-{
- if (prev->on_rq || rq->skip_clock_update < 0)
- update_rq_clock(rq);
- prev->sched_class->put_prev_task(rq, prev);
-}
-
/*
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
{
- const struct sched_class *class;
+ const struct sched_class *class = &fair_sched_class;
struct task_struct *p;
/*
* Optimization: we know that if all tasks are in
* the fair class we can call that function directly:
*/
- if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq);
- if (likely(p))
- return p;
+ if (likely(prev->sched_class == class &&
+ rq->nr_running == rq->cfs.h_nr_running)) {
+ p = fair_sched_class.pick_next_task(rq, prev);
+ if (unlikely(p == RETRY_TASK))
+ goto again;
+
+ /* assumes fair_sched_class->next == idle_sched_class */
+ if (unlikely(!p))
+ p = idle_sched_class.pick_next_task(rq, prev);
+
+ return p;
}
+again:
for_each_class(class) {
- p = class->pick_next_task(rq);
- if (p)
+ p = class->pick_next_task(rq, prev);
+ if (p) {
+ if (unlikely(p == RETRY_TASK))
+ goto again;
return p;
+ }
}
BUG(); /* the idle class will always have a runnable task */
@@ -3146,6 +2693,40 @@ pick_next_task(struct rq *rq)
/*
* __schedule() is the main scheduler function.
+ *
+ * The main means of driving the scheduler and thus entering this function are:
+ *
+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
+ *
+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
+ * paths. For example, see arch/x86/entry_64.S.
+ *
+ * To drive preemption between tasks, the scheduler sets the flag in timer
+ * interrupt handler scheduler_tick().
+ *
+ * 3. Wakeups don't really cause entry into schedule(). They add a
+ * task to the run-queue and that's it.
+ *
+ * Now, if the new task added to the run-queue preempts the current
+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
+ * called on the nearest possible occasion:
+ *
+ * - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ *
+ * - in syscall or exception context, at the next outmost
+ * preempt_enable(). (this might be as soon as the wake_up()'s
+ * spin_unlock()!)
+ *
+ * - in IRQ context, return from interrupt-handler to
+ * preemptible context
+ *
+ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ * then at the next:
+ *
+ * - cond_resched() call
+ * - explicit schedule() call
+ * - return from syscall or exception to user-space
+ * - return from interrupt-handler to user-space
*/
static void __sched __schedule(void)
{
@@ -3166,6 +2747,12 @@ need_resched:
if (sched_feat(HRTICK))
hrtick_clear(rq);
+ /*
+ * Make sure that signal_pending_state()->signal_pending() below
+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
+ * done by the caller to avoid the race with signal_wake_up().
+ */
+ smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock);
switch_count = &prev->nivcsw;
@@ -3192,14 +2779,12 @@ need_resched:
switch_count = &prev->nvcsw;
}
- pre_schedule(rq, prev);
-
- if (unlikely(!rq->nr_running))
- idle_balance(cpu, rq);
+ if (prev->on_rq || rq->skip_clock_update < 0)
+ update_rq_clock(rq);
- put_prev_task(rq, prev);
- next = pick_next_task(rq);
+ next = pick_next_task(rq, prev);
clear_tsk_need_resched(prev);
+ clear_preempt_need_resched();
rq->skip_clock_update = 0;
if (likely(prev != next)) {
@@ -3221,14 +2806,14 @@ need_resched:
post_schedule(rq);
- preempt_enable_no_resched();
+ sched_preempt_enable_no_resched();
if (need_resched())
goto need_resched;
}
static inline void sched_submit_work(struct task_struct *tsk)
{
- if (!tsk->state)
+ if (!tsk->state || tsk_is_pi_blocked(tsk))
return;
/*
* If we are going to sleep and we have plugged IO queued,
@@ -3238,7 +2823,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
blk_schedule_flush_plug(tsk);
}
-asmlinkage void __sched schedule(void)
+asmlinkage __visible void __sched schedule(void)
{
struct task_struct *tsk = current;
@@ -3247,50 +2832,32 @@ asmlinkage void __sched schedule(void)
}
EXPORT_SYMBOL(schedule);
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+#ifdef CONFIG_CONTEXT_TRACKING
+asmlinkage __visible void __sched schedule_user(void)
{
- if (lock->owner != owner)
- return false;
-
/*
- * Ensure we emit the owner->on_cpu, dereference _after_ checking
- * lock->owner still matches owner, if that fails, owner might
- * point to free()d memory, if it still matches, the rcu_read_lock()
- * ensures the memory stays valid.
+ * If we come here after a random call to set_need_resched(),
+ * or we have been woken up remotely but the IPI has not yet arrived,
+ * we haven't yet exited the RCU idle mode. Do it here manually until
+ * we find a better solution.
*/
- barrier();
-
- return owner->on_cpu;
+ user_exit();
+ schedule();
+ user_enter();
}
+#endif
-/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
+/**
+ * schedule_preempt_disabled - called with preemption disabled
+ *
+ * Returns with preemption disabled. Note: preempt_count must be 1
*/
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+void __sched schedule_preempt_disabled(void)
{
- if (!sched_feat(OWNER_SPIN))
- return 0;
-
- rcu_read_lock();
- while (owner_running(lock, owner)) {
- if (need_resched())
- break;
-
- arch_mutex_cpu_relax();
- }
- rcu_read_unlock();
-
- /*
- * We break out the loop above on need_resched() and when the
- * owner changed, which is a sign for heavy contention. Return
- * success only when lock->owner is NULL.
- */
- return lock->owner == NULL;
+ sched_preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
}
-#endif
#ifdef CONFIG_PREEMPT
/*
@@ -3298,21 +2865,19 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
* off of preempt_enable. Kernel preemptions off return from interrupt
* occur there and call schedule directly.
*/
-asmlinkage void __sched notrace preempt_schedule(void)
+asmlinkage __visible void __sched notrace preempt_schedule(void)
{
- struct thread_info *ti = current_thread_info();
-
/*
* If there is a non-zero preempt_count or interrupts are disabled,
* we do not want to preempt the current task. Just return..
*/
- if (likely(ti->preempt_count || irqs_disabled()))
+ if (likely(!preemptible()))
return;
do {
- add_preempt_count_notrace(PREEMPT_ACTIVE);
+ __preempt_count_add(PREEMPT_ACTIVE);
__schedule();
- sub_preempt_count_notrace(PREEMPT_ACTIVE);
+ __preempt_count_sub(PREEMPT_ACTIVE);
/*
* Check again in case we missed a preemption opportunity
@@ -3321,7 +2886,9 @@ asmlinkage void __sched notrace preempt_schedule(void)
barrier();
} while (need_resched());
}
+NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
+#endif /* CONFIG_PREEMPT */
/*
* this is the entry point to schedule() from kernel preemption
@@ -3329,19 +2896,21 @@ EXPORT_SYMBOL(preempt_schedule);
* Note, that this is called and return with irqs disabled. This will
* protect us against recursive calling from irq.
*/
-asmlinkage void __sched preempt_schedule_irq(void)
+asmlinkage __visible void __sched preempt_schedule_irq(void)
{
- struct thread_info *ti = current_thread_info();
+ enum ctx_state prev_state;
/* Catch callers which need to be fixed */
- BUG_ON(ti->preempt_count || !irqs_disabled());
+ BUG_ON(preempt_count() || !irqs_disabled());
+
+ prev_state = exception_enter();
do {
- add_preempt_count(PREEMPT_ACTIVE);
+ __preempt_count_add(PREEMPT_ACTIVE);
local_irq_enable();
__schedule();
local_irq_disable();
- sub_preempt_count(PREEMPT_ACTIVE);
+ __preempt_count_sub(PREEMPT_ACTIVE);
/*
* Check again in case we missed a preemption opportunity
@@ -3349,9 +2918,9 @@ asmlinkage void __sched preempt_schedule_irq(void)
*/
barrier();
} while (need_resched());
-}
-#endif /* CONFIG_PREEMPT */
+ exception_exit(prev_state);
+}
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
void *key)
@@ -3360,392 +2929,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
}
EXPORT_SYMBOL(default_wake_function);
-/*
- * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
- * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
- * number) then we wake all the non-exclusive tasks and one exclusive task.
- *
- * There are circumstances in which we can try to wake a task which has already
- * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
- * zero in this (rare) case, and we handle it by continuing to scan the queue.
- */
-static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, int wake_flags, void *key)
-{
- wait_queue_t *curr, *next;
-
- list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
- unsigned flags = curr->flags;
-
- if (curr->func(curr, mode, wake_flags, key) &&
- (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
- break;
- }
-}
-
-/**
- * __wake_up - wake up threads blocked on a waitqueue.
- * @q: the waitqueue
- * @mode: which threads
- * @nr_exclusive: how many wake-one or wake-many threads to wake up
- * @key: is directly passed to the wakeup function
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void __wake_up(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, void *key)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, 0, key);
- spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL(__wake_up);
-
-/*
- * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
- */
-void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
-{
- __wake_up_common(q, mode, 1, 0, NULL);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked);
-
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
-{
- __wake_up_common(q, mode, 1, 0, key);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked_key);
-
-/**
- * __wake_up_sync_key - wake up threads blocked on a waitqueue.
- * @q: the waitqueue
- * @mode: which threads
- * @nr_exclusive: how many wake-one or wake-many threads to wake up
- * @key: opaque value to be passed to wakeup targets
- *
- * The sync wakeup differs that the waker knows that it will schedule
- * away soon, so while the target thread will be woken up, it will not
- * be migrated to another CPU - ie. the two threads are 'synchronized'
- * with each other. This can prevent needless bouncing between CPUs.
- *
- * On UP it can prevent extra preemption.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, void *key)
-{
- unsigned long flags;
- int wake_flags = WF_SYNC;
-
- if (unlikely(!q))
- return;
-
- if (unlikely(!nr_exclusive))
- wake_flags = 0;
-
- spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
- spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL_GPL(__wake_up_sync_key);
-
-/*
- * __wake_up_sync - see __wake_up_sync_key()
- */
-void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
-{
- __wake_up_sync_key(q, mode, nr_exclusive, NULL);
-}
-EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
-
-/**
- * complete: - signals a single thread waiting on this completion
- * @x: holds the state of this particular completion
- *
- * This will wake up a single thread waiting on this completion. Threads will be
- * awakened in the same order in which they were queued.
- *
- * See also complete_all(), wait_for_completion() and related routines.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void complete(struct completion *x)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&x->wait.lock, flags);
- x->done++;
- __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
- spin_unlock_irqrestore(&x->wait.lock, flags);
-}
-EXPORT_SYMBOL(complete);
-
-/**
- * complete_all: - signals all threads waiting on this completion
- * @x: holds the state of this particular completion
- *
- * This will wake up all threads waiting on this particular completion event.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void complete_all(struct completion *x)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&x->wait.lock, flags);
- x->done += UINT_MAX/2;
- __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
- spin_unlock_irqrestore(&x->wait.lock, flags);
-}
-EXPORT_SYMBOL(complete_all);
-
-static inline long __sched
-do_wait_for_common(struct completion *x, long timeout, int state)
-{
- if (!x->done) {
- DECLARE_WAITQUEUE(wait, current);
-
- __add_wait_queue_tail_exclusive(&x->wait, &wait);
- do {
- if (signal_pending_state(state, current)) {
- timeout = -ERESTARTSYS;
- break;
- }
- __set_current_state(state);
- spin_unlock_irq(&x->wait.lock);
- timeout = schedule_timeout(timeout);
- spin_lock_irq(&x->wait.lock);
- } while (!x->done && timeout);
- __remove_wait_queue(&x->wait, &wait);
- if (!x->done)
- return timeout;
- }
- x->done--;
- return timeout ?: 1;
-}
-
-static long __sched
-wait_for_common(struct completion *x, long timeout, int state)
-{
- might_sleep();
-
- spin_lock_irq(&x->wait.lock);
- timeout = do_wait_for_common(x, timeout, state);
- spin_unlock_irq(&x->wait.lock);
- return timeout;
-}
-
-/**
- * wait_for_completion: - waits for completion of a task
- * @x: holds the state of this particular completion
- *
- * This waits to be signaled for completion of a specific task. It is NOT
- * interruptible and there is no timeout.
- *
- * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
- * and interrupt capability. Also see complete().
- */
-void __sched wait_for_completion(struct completion *x)
-{
- wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion);
-
-/**
- * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
- * @x: holds the state of this particular completion
- * @timeout: timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be signaled or for a
- * specified timeout to expire. The timeout is in jiffies. It is not
- * interruptible.
- *
- * The return value is 0 if timed out, and positive (at least 1, or number of
- * jiffies left till timeout) if completed.
- */
-unsigned long __sched
-wait_for_completion_timeout(struct completion *x, unsigned long timeout)
-{
- return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion_timeout);
-
-/**
- * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
- * @x: holds the state of this particular completion
- *
- * This waits for completion of a specific task to be signaled. It is
- * interruptible.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if completed.
- */
-int __sched wait_for_completion_interruptible(struct completion *x)
-{
- long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
- if (t == -ERESTARTSYS)
- return t;
- return 0;
-}
-EXPORT_SYMBOL(wait_for_completion_interruptible);
-
-/**
- * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
- * @x: holds the state of this particular completion
- * @timeout: timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be signaled or for a
- * specified timeout to expire. It is interruptible. The timeout is in jiffies.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- * positive (at least 1, or number of jiffies left till timeout) if completed.
- */
-long __sched
-wait_for_completion_interruptible_timeout(struct completion *x,
- unsigned long timeout)
-{
- return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
-
-/**
- * wait_for_completion_killable: - waits for completion of a task (killable)
- * @x: holds the state of this particular completion
- *
- * This waits to be signaled for completion of a specific task. It can be
- * interrupted by a kill signal.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if completed.
- */
-int __sched wait_for_completion_killable(struct completion *x)
-{
- long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
- if (t == -ERESTARTSYS)
- return t;
- return 0;
-}
-EXPORT_SYMBOL(wait_for_completion_killable);
-
-/**
- * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
- * @x: holds the state of this particular completion
- * @timeout: timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be
- * signaled or for a specified timeout to expire. It can be
- * interrupted by a kill signal. The timeout is in jiffies.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- * positive (at least 1, or number of jiffies left till timeout) if completed.
- */
-long __sched
-wait_for_completion_killable_timeout(struct completion *x,
- unsigned long timeout)
-{
- return wait_for_common(x, timeout, TASK_KILLABLE);
-}
-EXPORT_SYMBOL(wait_for_completion_killable_timeout);
-
-/**
- * try_wait_for_completion - try to decrement a completion without blocking
- * @x: completion structure
- *
- * Returns: 0 if a decrement cannot be done without blocking
- * 1 if a decrement succeeded.
- *
- * If a completion is being used as a counting completion,
- * attempt to decrement the counter without blocking. This
- * enables us to avoid waiting if the resource the completion
- * is protecting is not available.
- */
-bool try_wait_for_completion(struct completion *x)
-{
- unsigned long flags;
- int ret = 1;
-
- spin_lock_irqsave(&x->wait.lock, flags);
- if (!x->done)
- ret = 0;
- else
- x->done--;
- spin_unlock_irqrestore(&x->wait.lock, flags);
- return ret;
-}
-EXPORT_SYMBOL(try_wait_for_completion);
-
-/**
- * completion_done - Test to see if a completion has any waiters
- * @x: completion structure
- *
- * Returns: 0 if there are waiters (wait_for_completion() in progress)
- * 1 if there are no waiters.
- *
- */
-bool completion_done(struct completion *x)
-{
- unsigned long flags;
- int ret = 1;
-
- spin_lock_irqsave(&x->wait.lock, flags);
- if (!x->done)
- ret = 0;
- spin_unlock_irqrestore(&x->wait.lock, flags);
- return ret;
-}
-EXPORT_SYMBOL(completion_done);
-
-static long __sched
-sleep_on_common(wait_queue_head_t *q, int state, long timeout)
-{
- unsigned long flags;
- wait_queue_t wait;
-
- init_waitqueue_entry(&wait, current);
-
- __set_current_state(state);
-
- spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue(q, &wait);
- spin_unlock(&q->lock);
- timeout = schedule_timeout(timeout);
- spin_lock_irq(&q->lock);
- __remove_wait_queue(q, &wait);
- spin_unlock_irqrestore(&q->lock, flags);
-
- return timeout;
-}
-
-void __sched interruptible_sleep_on(wait_queue_head_t *q)
-{
- sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(interruptible_sleep_on);
-
-long __sched
-interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
- return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(interruptible_sleep_on_timeout);
-
-void __sched sleep_on(wait_queue_head_t *q)
-{
- sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(sleep_on);
-
-long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
- return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(sleep_on_timeout);
-
#ifdef CONFIG_RT_MUTEXES
/*
@@ -3756,19 +2939,39 @@ EXPORT_SYMBOL(sleep_on_timeout);
* This function changes the 'effective' priority of a task. It does
* not touch ->normal_prio like __setscheduler().
*
- * Used by the rt_mutex code to implement priority inheritance logic.
+ * Used by the rt_mutex code to implement priority inheritance
+ * logic. Call site only calls if the priority of the task changed.
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- int oldprio, on_rq, running;
+ int oldprio, on_rq, running, enqueue_flag = 0;
struct rq *rq;
const struct sched_class *prev_class;
- BUG_ON(prio < 0 || prio > MAX_PRIO);
+ BUG_ON(prio > MAX_PRIO);
rq = __task_rq_lock(p);
+ /*
+ * Idle task boosting is a nono in general. There is one
+ * exception, when PREEMPT_RT and NOHZ is active:
+ *
+ * The idle task calls get_next_timer_interrupt() and holds
+ * the timer wheel base->lock on the CPU and another CPU wants
+ * to access the timer (probably to cancel it). We can safely
+ * ignore the boosting request, as the idle CPU runs this code
+ * with interrupts disabled and will complete the lock
+ * protected section without being interrupted. So there is no
+ * real need to boost.
+ */
+ if (unlikely(p == rq->idle)) {
+ WARN_ON(p != rq->curr);
+ WARN_ON(p->pi_blocked_on);
+ goto out_unlock;
+ }
+
trace_sched_pi_setprio(p, prio);
+ p->pi_top_task = rt_mutex_get_top_task(p);
oldprio = p->prio;
prev_class = p->sched_class;
on_rq = p->on_rq;
@@ -3778,22 +2981,47 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
p->sched_class->put_prev_task(rq, p);
- if (rt_prio(prio))
+ /*
+ * Boosting condition are:
+ * 1. -rt task is running and holds mutex A
+ * --> -dl task blocks on mutex A
+ *
+ * 2. -dl task is running and holds mutex A
+ * --> -dl task blocks on mutex A and could preempt the
+ * running task
+ */
+ if (dl_prio(prio)) {
+ if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
+ dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
+ p->dl.dl_boosted = 1;
+ p->dl.dl_throttled = 0;
+ enqueue_flag = ENQUEUE_REPLENISH;
+ } else
+ p->dl.dl_boosted = 0;
+ p->sched_class = &dl_sched_class;
+ } else if (rt_prio(prio)) {
+ if (dl_prio(oldprio))
+ p->dl.dl_boosted = 0;
+ if (oldprio < prio)
+ enqueue_flag = ENQUEUE_HEAD;
p->sched_class = &rt_sched_class;
- else
+ } else {
+ if (dl_prio(oldprio))
+ p->dl.dl_boosted = 0;
p->sched_class = &fair_sched_class;
+ }
p->prio = prio;
if (running)
p->sched_class->set_curr_task(rq);
if (on_rq)
- enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
+ enqueue_task(rq, p, enqueue_flag);
check_class_changed(rq, p, prev_class, oldprio);
+out_unlock:
__task_rq_unlock(rq);
}
-
#endif
void set_user_nice(struct task_struct *p, long nice)
@@ -3802,7 +3030,7 @@ void set_user_nice(struct task_struct *p, long nice)
unsigned long flags;
struct rq *rq;
- if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
return;
/*
* We have to be careful, if called from sys_setpriority(),
@@ -3813,9 +3041,9 @@ void set_user_nice(struct task_struct *p, long nice)
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
* it wont have any effect on scheduling until the task is
- * SCHED_FIFO/SCHED_RR:
+ * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
*/
- if (task_has_rt_policy(p)) {
+ if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
@@ -3851,7 +3079,7 @@ EXPORT_SYMBOL(set_user_nice);
int can_nice(const struct task_struct *p, const int nice)
{
/* convert nice value [19,-20] to rlimit style value [1,40] */
- int nice_rlim = 20 - nice;
+ int nice_rlim = nice_to_rlimit(nice);
return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
capable(CAP_SYS_NICE));
@@ -3875,17 +3103,10 @@ SYSCALL_DEFINE1(nice, int, increment)
* We don't have to worry. Conceptually one call occurs first
* and we have a single winner.
*/
- if (increment < -40)
- increment = -40;
- if (increment > 40)
- increment = 40;
-
- nice = TASK_NICE(current) + increment;
- if (nice < -20)
- nice = -20;
- if (nice > 19)
- nice = 19;
+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
+ nice = task_nice(current) + increment;
+ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
if (increment < 0 && !can_nice(current, nice))
return -EPERM;
@@ -3903,7 +3124,7 @@ SYSCALL_DEFINE1(nice, int, increment)
* task_prio - return the priority value of a given task.
* @p: the task in question.
*
- * This is the priority value as seen by users in /proc.
+ * Return: The priority value as seen by users in /proc.
* RT tasks are offset by -200. Normal tasks are centered
* around 0, value goes from -16 to +15.
*/
@@ -3913,18 +3134,10 @@ int task_prio(const struct task_struct *p)
}
/**
- * task_nice - return the nice value of a given task.
- * @p: the task in question.
- */
-int task_nice(const struct task_struct *p)
-{
- return TASK_NICE(p);
-}
-EXPORT_SYMBOL(task_nice);
-
-/**
* idle_cpu - is a given cpu idle currently?
* @cpu: the processor in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
*/
int idle_cpu(int cpu)
{
@@ -3947,6 +3160,8 @@ int idle_cpu(int cpu)
/**
* idle_task - return the idle task for a given cpu.
* @cpu: the processor in question.
+ *
+ * Return: The idle task for the cpu @cpu.
*/
struct task_struct *idle_task(int cpu)
{
@@ -3956,26 +3171,134 @@ struct task_struct *idle_task(int cpu)
/**
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
+ *
+ * The task of @pid, if found. %NULL otherwise.
*/
static struct task_struct *find_process_by_pid(pid_t pid)
{
return pid ? find_task_by_vpid(pid) : current;
}
-/* Actually do priority change: must hold rq lock. */
+/*
+ * This function initializes the sched_dl_entity of a newly becoming
+ * SCHED_DEADLINE task.
+ *
+ * Only the static values are considered here, the actual runtime and the
+ * absolute deadline will be properly calculated when the task is enqueued
+ * for the first time with its new policy.
+ */
static void
-__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
+__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ init_dl_task_timer(dl_se);
+ dl_se->dl_runtime = attr->sched_runtime;
+ dl_se->dl_deadline = attr->sched_deadline;
+ dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
+ dl_se->flags = attr->sched_flags;
+ dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ dl_se->dl_throttled = 0;
+ dl_se->dl_new = 1;
+ dl_se->dl_yielded = 0;
+}
+
+static void __setscheduler_params(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ int policy = attr->sched_policy;
+
+ if (policy == -1) /* setparam */
+ policy = p->policy;
+
p->policy = policy;
- p->rt_priority = prio;
+
+ if (dl_policy(policy))
+ __setparam_dl(p, attr);
+ else if (fair_policy(policy))
+ p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+
+ /*
+ * __sched_setscheduler() ensures attr->sched_priority == 0 when
+ * !rt_policy. Always setting this ensures that things like
+ * getparam()/getattr() don't report silly values for !rt tasks.
+ */
+ p->rt_priority = attr->sched_priority;
p->normal_prio = normal_prio(p);
- /* we are holding p->pi_lock already */
- p->prio = rt_mutex_getprio(p);
- if (rt_prio(p->prio))
+ set_load_weight(p);
+}
+
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ __setscheduler_params(p, attr);
+
+ /*
+ * If we get here, there was no pi waiters boosting the
+ * task. It is safe to use the normal prio.
+ */
+ p->prio = normal_prio(p);
+
+ if (dl_prio(p->prio))
+ p->sched_class = &dl_sched_class;
+ else if (rt_prio(p->prio))
p->sched_class = &rt_sched_class;
else
p->sched_class = &fair_sched_class;
- set_load_weight(p);
+}
+
+static void
+__getparam_dl(struct task_struct *p, struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ attr->sched_priority = p->rt_priority;
+ attr->sched_runtime = dl_se->dl_runtime;
+ attr->sched_deadline = dl_se->dl_deadline;
+ attr->sched_period = dl_se->dl_period;
+ attr->sched_flags = dl_se->flags;
+}
+
+/*
+ * This function validates the new parameters of a -deadline task.
+ * We ask for the deadline not being zero, and greater or equal
+ * than the runtime, as well as the period of being zero or
+ * greater than deadline. Furthermore, we have to be sure that
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
+ */
+static bool
+__checkparam_dl(const struct sched_attr *attr)
+{
+ /* deadline != 0 */
+ if (attr->sched_deadline == 0)
+ return false;
+
+ /*
+ * Since we truncate DL_SCALE bits, make sure we're at least
+ * that big.
+ */
+ if (attr->sched_runtime < (1ULL << DL_SCALE))
+ return false;
+
+ /*
+ * Since we use the MSB for wrap-around and sign issues, make
+ * sure it's not set (mind that period can be equal to zero).
+ */
+ if (attr->sched_deadline & (1ULL << 63) ||
+ attr->sched_period & (1ULL << 63))
+ return false;
+
+ /* runtime <= deadline <= period (if period != 0) */
+ if ((attr->sched_period != 0 &&
+ attr->sched_period < attr->sched_deadline) ||
+ attr->sched_deadline < attr->sched_runtime)
+ return false;
+
+ return true;
}
/*
@@ -3988,19 +3311,20 @@ static bool check_same_owner(struct task_struct *p)
rcu_read_lock();
pcred = __task_cred(p);
- if (cred->user->user_ns == pcred->user->user_ns)
- match = (cred->euid == pcred->euid ||
- cred->euid == pcred->uid);
- else
- match = false;
+ match = (uid_eq(cred->euid, pcred->euid) ||
+ uid_eq(cred->euid, pcred->uid));
rcu_read_unlock();
return match;
}
-static int __sched_setscheduler(struct task_struct *p, int policy,
- const struct sched_param *param, bool user)
+static int __sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ bool user)
{
+ int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
+ MAX_RT_PRIO - 1 - attr->sched_priority;
int retval, oldprio, oldpolicy = -1, on_rq, running;
+ int policy = attr->sched_policy;
unsigned long flags;
const struct sched_class *prev_class;
struct rq *rq;
@@ -4014,31 +3338,40 @@ recheck:
reset_on_fork = p->sched_reset_on_fork;
policy = oldpolicy = p->policy;
} else {
- reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
- policy &= ~SCHED_RESET_ON_FORK;
+ reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
- if (policy != SCHED_FIFO && policy != SCHED_RR &&
+ if (policy != SCHED_DEADLINE &&
+ policy != SCHED_FIFO && policy != SCHED_RR &&
policy != SCHED_NORMAL && policy != SCHED_BATCH &&
policy != SCHED_IDLE)
return -EINVAL;
}
+ if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
+ return -EINVAL;
+
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
* SCHED_BATCH and SCHED_IDLE is 0.
*/
- if (param->sched_priority < 0 ||
- (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
- (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
+ if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
+ (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
- if (rt_policy(policy) != (param->sched_priority != 0))
+ if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
+ (rt_policy(policy) != (attr->sched_priority != 0)))
return -EINVAL;
/*
* Allow unprivileged RT tasks to decrease priority:
*/
if (user && !capable(CAP_SYS_NICE)) {
+ if (fair_policy(policy)) {
+ if (attr->sched_nice < task_nice(p) &&
+ !can_nice(p, attr->sched_nice))
+ return -EPERM;
+ }
+
if (rt_policy(policy)) {
unsigned long rlim_rtprio =
task_rlimit(p, RLIMIT_RTPRIO);
@@ -4048,17 +3381,26 @@ recheck:
return -EPERM;
/* can't increase priority */
- if (param->sched_priority > p->rt_priority &&
- param->sched_priority > rlim_rtprio)
+ if (attr->sched_priority > p->rt_priority &&
+ attr->sched_priority > rlim_rtprio)
return -EPERM;
}
+ /*
+ * Can't set/change SCHED_DEADLINE policy at all for now
+ * (safest behavior); in the future we would like to allow
+ * unprivileged DL tasks to increase their relative deadline
+ * or reduce their runtime (both ways reducing utilization)
+ */
+ if (dl_policy(policy))
+ return -EPERM;
+
/*
* Treat SCHED_IDLE as nice 20. Only allow a switch to
* SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
*/
if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
- if (!can_nice(p, TASK_NICE(p)))
+ if (!can_nice(p, task_nice(p)))
return -EPERM;
}
@@ -4095,18 +3437,25 @@ recheck:
}
/*
- * If not changing anything there's no need to proceed further:
+ * If not changing anything there's no need to proceed further,
+ * but store a possible modification of reset_on_fork.
*/
- if (unlikely(policy == p->policy && (!rt_policy(policy) ||
- param->sched_priority == p->rt_priority))) {
-
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ if (unlikely(policy == p->policy)) {
+ if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+ goto change;
+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
+ goto change;
+ if (dl_policy(policy))
+ goto change;
+
+ p->sched_reset_on_fork = reset_on_fork;
+ task_rq_unlock(rq, p, &flags);
return 0;
}
+change:
-#ifdef CONFIG_RT_GROUP_SCHED
if (user) {
+#ifdef CONFIG_RT_GROUP_SCHED
/*
* Do not allow realtime tasks into groups that have no runtime
* assigned.
@@ -4117,8 +3466,24 @@ recheck:
task_rq_unlock(rq, p, &flags);
return -EPERM;
}
- }
#endif
+#ifdef CONFIG_SMP
+ if (dl_bandwidth_enabled() && dl_policy(policy)) {
+ cpumask_t *span = rq->rd->span;
+
+ /*
+ * Don't allow tasks with an affinity mask smaller than
+ * the entire root_domain to become SCHED_DEADLINE. We
+ * will also fail if there's no bandwidth available.
+ */
+ if (!cpumask_subset(span, &p->cpus_allowed) ||
+ rq->rd->dl_bw.bw == 0) {
+ task_rq_unlock(rq, p, &flags);
+ return -EPERM;
+ }
+ }
+#endif
+ }
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
@@ -4126,6 +3491,35 @@ recheck:
task_rq_unlock(rq, p, &flags);
goto recheck;
}
+
+ /*
+ * If setscheduling to SCHED_DEADLINE (or changing the parameters
+ * of a SCHED_DEADLINE task) we need to check if enough bandwidth
+ * is available.
+ */
+ if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
+ task_rq_unlock(rq, p, &flags);
+ return -EBUSY;
+ }
+
+ p->sched_reset_on_fork = reset_on_fork;
+ oldprio = p->prio;
+
+ /*
+ * Special case for priority boosted tasks.
+ *
+ * If the new priority is lower or equal (user space view)
+ * than the current (boosted) priority, we just store the new
+ * normal parameters and do not touch the scheduler class and
+ * the runqueue. This will be done when the task deboost
+ * itself.
+ */
+ if (rt_mutex_check_prio(p, newprio)) {
+ __setscheduler_params(p, attr);
+ task_rq_unlock(rq, p, &flags);
+ return 0;
+ }
+
on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
@@ -4133,16 +3527,18 @@ recheck:
if (running)
p->sched_class->put_prev_task(rq, p);
- p->sched_reset_on_fork = reset_on_fork;
-
- oldprio = p->prio;
prev_class = p->sched_class;
- __setscheduler(rq, p, policy, param->sched_priority);
+ __setscheduler(rq, p, attr);
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq)
- enqueue_task(rq, p, 0);
+ if (on_rq) {
+ /*
+ * We enqueue to tail when the priority of a task is
+ * increased (user space view).
+ */
+ enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+ }
check_class_changed(rq, p, prev_class, oldprio);
task_rq_unlock(rq, p, &flags);
@@ -4152,21 +3548,49 @@ recheck:
return 0;
}
+static int _sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param, bool check)
+{
+ struct sched_attr attr = {
+ .sched_policy = policy,
+ .sched_priority = param->sched_priority,
+ .sched_nice = PRIO_TO_NICE(p->static_prio),
+ };
+
+ /*
+ * Fixup the legacy SCHED_RESET_ON_FORK hack
+ */
+ if (policy & SCHED_RESET_ON_FORK) {
+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ policy &= ~SCHED_RESET_ON_FORK;
+ attr.sched_policy = policy;
+ }
+
+ return __sched_setscheduler(p, &attr, check);
+}
/**
* sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
* @p: the task in question.
* @policy: new policy.
* @param: structure containing the new RT priority.
*
+ * Return: 0 on success. An error code otherwise.
+ *
* NOTE that the task may be already dead.
*/
int sched_setscheduler(struct task_struct *p, int policy,
const struct sched_param *param)
{
- return __sched_setscheduler(p, policy, param, true);
+ return _sched_setscheduler(p, policy, param, true);
}
EXPORT_SYMBOL_GPL(sched_setscheduler);
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
/**
* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
* @p: the task in question.
@@ -4177,11 +3601,13 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
* current context has permission. For example, this is needed in
* stop_machine(): we create temporary high priority worker threads,
* but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
*/
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
const struct sched_param *param)
{
- return __sched_setscheduler(p, policy, param, false);
+ return _sched_setscheduler(p, policy, param, false);
}
static int
@@ -4206,11 +3632,84 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
return retval;
}
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+ struct sched_attr *attr)
+{
+ u32 size;
+ int ret;
+
+ if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+ return -EFAULT;
+
+ /*
+ * zero the full structure, so that a short copy will be nice.
+ */
+ memset(attr, 0, sizeof(*attr));
+
+ ret = get_user(size, &uattr->size);
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE) /* silly large */
+ goto err_size;
+
+ if (!size) /* abi compat */
+ size = SCHED_ATTR_SIZE_VER0;
+
+ if (size < SCHED_ATTR_SIZE_VER0)
+ goto err_size;
+
+ /*
+ * If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. new
+ * user-space does not rely on any kernel feature
+ * extensions we dont know about yet.
+ */
+ if (size > sizeof(*attr)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uattr + sizeof(*attr);
+ end = (void __user *)uattr + size;
+
+ for (; addr < end; addr++) {
+ ret = get_user(val, addr);
+ if (ret)
+ return ret;
+ if (val)
+ goto err_size;
+ }
+ size = sizeof(*attr);
+ }
+
+ ret = copy_from_user(attr, uattr, size);
+ if (ret)
+ return -EFAULT;
+
+ /*
+ * XXX: do we want to be lenient like existing syscalls; or do we want
+ * to be strict and return an error on out-of-bounds values?
+ */
+ attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
+
+ return 0;
+
+err_size:
+ put_user(sizeof(*attr), &uattr->size);
+ return -E2BIG;
+}
+
/**
* sys_sched_setscheduler - set/change the scheduler policy and RT priority
* @pid: the pid in question.
* @policy: new policy.
* @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
struct sched_param __user *, param)
@@ -4226,6 +3725,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
* sys_sched_setparam - set/change the RT priority of a thread
* @pid: the pid in question.
* @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
@@ -4233,8 +3734,44 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
}
/**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, flags)
+{
+ struct sched_attr attr;
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0 || flags)
+ return -EINVAL;
+
+ retval = sched_copy_attr(uattr, &attr);
+ if (retval)
+ return retval;
+
+ if ((int)attr.sched_policy < 0)
+ return -EINVAL;
+
+ rcu_read_lock();
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (p != NULL)
+ retval = sched_setattr(p, &attr);
+ rcu_read_unlock();
+
+ return retval;
+}
+
+/**
* sys_sched_getscheduler - get the policy (scheduling class) of a thread
* @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
*/
SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
{
@@ -4261,10 +3798,13 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
* sys_sched_getparam - get the RT priority of a thread
* @pid: the pid in question.
* @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
*/
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
- struct sched_param lp;
+ struct sched_param lp = { .sched_priority = 0 };
struct task_struct *p;
int retval;
@@ -4281,7 +3821,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
if (retval)
goto out_unlock;
- lp.sched_priority = p->rt_priority;
+ if (task_has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
rcu_read_unlock();
/*
@@ -4296,19 +3837,103 @@ out_unlock:
return retval;
}
+static int sched_read_attr(struct sched_attr __user *uattr,
+ struct sched_attr *attr,
+ unsigned int usize)
+{
+ int ret;
+
+ if (!access_ok(VERIFY_WRITE, uattr, usize))
+ return -EFAULT;
+
+ /*
+ * If we're handed a smaller struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. old
+ * user-space does not get uncomplete information.
+ */
+ if (usize < sizeof(*attr)) {
+ unsigned char *addr;
+ unsigned char *end;
+
+ addr = (void *)attr + usize;
+ end = (void *)attr + sizeof(*attr);
+
+ for (; addr < end; addr++) {
+ if (*addr)
+ return -EFBIG;
+ }
+
+ attr->size = usize;
+ }
+
+ ret = copy_to_user(uattr, attr, attr->size);
+ if (ret)
+ return -EFAULT;
+
+ return 0;
+}
+
+/**
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, size, unsigned int, flags)
+{
+ struct sched_attr attr = {
+ .size = sizeof(struct sched_attr),
+ };
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0 || size > PAGE_SIZE ||
+ size < SCHED_ATTR_SIZE_VER0 || flags)
+ return -EINVAL;
+
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ retval = -ESRCH;
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ attr.sched_policy = p->policy;
+ if (p->sched_reset_on_fork)
+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ if (task_has_dl_policy(p))
+ __getparam_dl(p, &attr);
+ else if (task_has_rt_policy(p))
+ attr.sched_priority = p->rt_priority;
+ else
+ attr.sched_nice = task_nice(p);
+
+ rcu_read_unlock();
+
+ retval = sched_read_attr(uattr, &attr, size);
+ return retval;
+
+out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
cpumask_var_t cpus_allowed, new_mask;
struct task_struct *p;
int retval;
- get_online_cpus();
rcu_read_lock();
p = find_process_by_pid(pid);
if (!p) {
rcu_read_unlock();
- put_online_cpus();
return -ESRCH;
}
@@ -4316,6 +3941,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
get_task_struct(p);
rcu_read_unlock();
+ if (p->flags & PF_NO_SETAFFINITY) {
+ retval = -EINVAL;
+ goto out_put_task;
+ }
if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
retval = -ENOMEM;
goto out_put_task;
@@ -4325,15 +3954,39 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
goto out_free_cpus_allowed;
}
retval = -EPERM;
- if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
- goto out_unlock;
+ if (!check_same_owner(p)) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
+ rcu_read_unlock();
+ }
retval = security_task_setscheduler(p);
if (retval)
goto out_unlock;
+
cpuset_cpus_allowed(p, cpus_allowed);
cpumask_and(new_mask, in_mask, cpus_allowed);
+
+ /*
+ * Since bandwidth control happens on root_domain basis,
+ * if admission test is enabled, we only admit -deadline
+ * tasks allowed to run on all the CPUs in the task's
+ * root_domain.
+ */
+#ifdef CONFIG_SMP
+ if (task_has_dl_policy(p)) {
+ const struct cpumask *span = task_rq(p)->rd->span;
+
+ if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
+ retval = -EBUSY;
+ goto out_unlock;
+ }
+ }
+#endif
again:
retval = set_cpus_allowed_ptr(p, new_mask);
@@ -4355,7 +4008,6 @@ out_free_cpus_allowed:
free_cpumask_var(cpus_allowed);
out_put_task:
put_task_struct(p);
- put_online_cpus();
return retval;
}
@@ -4375,6 +4027,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
* @pid: pid of the process
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to the new cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -4398,7 +4052,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
unsigned long flags;
int retval;
- get_online_cpus();
rcu_read_lock();
retval = -ESRCH;
@@ -4411,12 +4064,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
goto out_unlock;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+ cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
rcu_read_unlock();
- put_online_cpus();
return retval;
}
@@ -4426,6 +4078,8 @@ out_unlock:
* @pid: pid of the process
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -4460,6 +4114,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
*
* This function yields the current CPU to other tasks. If there are no
* other threads running on this CPU then this function will return.
+ *
+ * Return: 0.
*/
SYSCALL_DEFINE0(sched_yield)
{
@@ -4475,23 +4131,18 @@ SYSCALL_DEFINE0(sched_yield)
__release(rq->lock);
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
do_raw_spin_unlock(&rq->lock);
- preempt_enable_no_resched();
+ sched_preempt_enable_no_resched();
schedule();
return 0;
}
-static inline int should_resched(void)
-{
- return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
-}
-
static void __cond_resched(void)
{
- add_preempt_count(PREEMPT_ACTIVE);
+ __preempt_count_add(PREEMPT_ACTIVE);
__schedule();
- sub_preempt_count(PREEMPT_ACTIVE);
+ __preempt_count_sub(PREEMPT_ACTIVE);
}
int __sched _cond_resched(void)
@@ -4549,8 +4200,24 @@ EXPORT_SYMBOL(__cond_resched_softirq);
/**
* yield - yield the current processor to other threads.
*
- * This is a shortcut for kernel-space yielding - it marks the
- * thread runnable and calls sys_sched_yield().
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ *
+ * The scheduler is at all times free to pick the calling task as the most
+ * eligible task to run, if removing the yield() call from your code breaks
+ * it, its already broken.
+ *
+ * Typical broken usage is:
+ *
+ * while (!event)
+ * yield();
+ *
+ * where one assumes that yield() will let 'the other' process run that will
+ * make event true. If the current task is a SCHED_FIFO task that will never
+ * happen. Never use yield() as a progress guarantee!!
+ *
+ * If you want to use yield() to wait for something, use wait_event().
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
+ * If you still want to use yield(), do not!
*/
void __sched yield(void)
{
@@ -4569,34 +4236,46 @@ EXPORT_SYMBOL(yield);
* It's the caller's job to ensure that the target task struct
* can't go away on us before we can do any checks.
*
- * Returns true if we indeed boosted the target task.
+ * Return:
+ * true (>0) if we indeed boosted the target task.
+ * false (0) if we failed to boost the target.
+ * -ESRCH if there's no task to yield to.
*/
-bool __sched yield_to(struct task_struct *p, bool preempt)
+int __sched yield_to(struct task_struct *p, bool preempt)
{
struct task_struct *curr = current;
struct rq *rq, *p_rq;
unsigned long flags;
- bool yielded = 0;
+ int yielded = 0;
local_irq_save(flags);
rq = this_rq();
again:
p_rq = task_rq(p);
+ /*
+ * If we're the only runnable task on the rq and target rq also
+ * has only one task, there's absolutely no point in yielding.
+ */
+ if (rq->nr_running == 1 && p_rq->nr_running == 1) {
+ yielded = -ESRCH;
+ goto out_irq;
+ }
+
double_rq_lock(rq, p_rq);
- while (task_rq(p) != p_rq) {
+ if (task_rq(p) != p_rq) {
double_rq_unlock(rq, p_rq);
goto again;
}
if (!curr->sched_class->yield_to_task)
- goto out;
+ goto out_unlock;
if (curr->sched_class != p->sched_class)
- goto out;
+ goto out_unlock;
if (task_running(p_rq, p) || p->state)
- goto out;
+ goto out_unlock;
yielded = curr->sched_class->yield_to_task(rq, p, preempt);
if (yielded) {
@@ -4607,20 +4286,14 @@ again:
*/
if (preempt && rq != p_rq)
resched_task(p_rq->curr);
- } else {
- /*
- * We might have set it in task_yield_fair(), but are
- * not going to schedule(), so don't want to skip
- * the next update.
- */
- rq->skip_clock_update = 0;
}
-out:
+out_unlock:
double_rq_unlock(rq, p_rq);
+out_irq:
local_irq_restore(flags);
- if (yielded)
+ if (yielded > 0)
schedule();
return yielded;
@@ -4666,8 +4339,9 @@ long __sched io_schedule_timeout(long timeout)
* sys_sched_get_priority_max - return maximum RT priority.
* @policy: scheduling class.
*
- * this syscall returns the maximum rt_priority that can be used
- * by a given scheduling class.
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
*/
SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
{
@@ -4678,6 +4352,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
case SCHED_RR:
ret = MAX_USER_RT_PRIO-1;
break;
+ case SCHED_DEADLINE:
case SCHED_NORMAL:
case SCHED_BATCH:
case SCHED_IDLE:
@@ -4691,8 +4366,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
* sys_sched_get_priority_min - return minimum RT priority.
* @policy: scheduling class.
*
- * this syscall returns the minimum rt_priority that can be used
- * by a given scheduling class.
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
*/
SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
{
@@ -4703,6 +4379,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
case SCHED_RR:
ret = 1;
break;
+ case SCHED_DEADLINE:
case SCHED_NORMAL:
case SCHED_BATCH:
case SCHED_IDLE:
@@ -4718,6 +4395,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
*
* this syscall writes the default timeslice value of a given process
* into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
*/
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
struct timespec __user *, interval)
@@ -4743,7 +4423,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
goto out_unlock;
rq = task_rq_lock(p, &flags);
- time_slice = p->sched_class->get_rr_interval(rq, p);
+ time_slice = 0;
+ if (p->sched_class->get_rr_interval)
+ time_slice = p->sched_class->get_rr_interval(rq, p);
task_rq_unlock(rq, p, &flags);
rcu_read_unlock();
@@ -4761,6 +4443,7 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
+ int ppid;
unsigned state;
state = p->state ? __ffs(p->state) + 1 : 0;
@@ -4780,10 +4463,14 @@ void sched_show_task(struct task_struct *p)
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
+ rcu_read_lock();
+ ppid = task_pid_nr(rcu_dereference(p->real_parent));
+ rcu_read_unlock();
printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
- task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
+ task_pid_nr(p), ppid,
(unsigned long)task_thread_info(p)->flags);
+ print_worker_info(KERN_INFO, p);
show_stack(p, NULL);
}
@@ -4822,7 +4509,7 @@ void show_state_filter(unsigned long state_filter)
debug_show_all_locks();
}
-void __cpuinit init_idle_bootup_task(struct task_struct *idle)
+void init_idle_bootup_task(struct task_struct *idle)
{
idle->sched_class = &idle_sched_class;
}
@@ -4835,14 +4522,14 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle)
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
*/
-void __cpuinit init_idle(struct task_struct *idle, int cpu)
+void init_idle(struct task_struct *idle, int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
raw_spin_lock_irqsave(&rq->lock, flags);
- __sched_fork(idle);
+ __sched_fork(0, idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
@@ -4862,19 +4549,21 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
rcu_read_unlock();
rq->curr = rq->idle = idle;
+ idle->on_rq = 1;
#if defined(CONFIG_SMP)
idle->on_cpu = 1;
#endif
raw_spin_unlock_irqrestore(&rq->lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
- task_thread_info(idle)->preempt_count = 0;
+ init_idle_preempt_count(idle, cpu);
/*
* The idle tasks have their own, simple scheduling class:
*/
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
+ vtime_init_idle(idle, cpu);
#if defined(CONFIG_SMP)
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
@@ -4887,7 +4576,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
p->sched_class->set_cpus_allowed(p, new_mask);
cpumask_copy(&p->cpus_allowed, new_mask);
- p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
}
/*
@@ -4930,11 +4619,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
goto out;
}
- if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
- ret = -EINVAL;
- goto out;
- }
-
do_set_cpus_allowed(p, new_mask);
/* Can the task run on the task's current CPU? If so, we're done */
@@ -5006,6 +4690,54 @@ fail:
return ret;
}
+#ifdef CONFIG_NUMA_BALANCING
+/* Migrate current task p to target_cpu */
+int migrate_task_to(struct task_struct *p, int target_cpu)
+{
+ struct migration_arg arg = { p, target_cpu };
+ int curr_cpu = task_cpu(p);
+
+ if (curr_cpu == target_cpu)
+ return 0;
+
+ if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
+ return -EINVAL;
+
+ /* TODO: This is not properly updating schedstats */
+
+ trace_sched_move_numa(p, curr_cpu, target_cpu);
+ return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
+}
+
+/*
+ * Requeue a task on a given node and accurately track the number of NUMA
+ * tasks on the runqueues
+ */
+void sched_setnuma(struct task_struct *p, int nid)
+{
+ struct rq *rq;
+ unsigned long flags;
+ bool on_rq, running;
+
+ rq = task_rq_lock(p, &flags);
+ on_rq = p->on_rq;
+ running = task_current(rq, p);
+
+ if (on_rq)
+ dequeue_task(rq, p, 0);
+ if (running)
+ p->sched_class->put_prev_task(rq, p);
+
+ p->numa_preferred_nid = nid;
+
+ if (running)
+ p->sched_class->set_curr_task(rq);
+ if (on_rq)
+ enqueue_task(rq, p, 0);
+ task_rq_unlock(rq, p, &flags);
+}
+#endif
+
/*
* migration_cpu_stop - this will be executed by a highprio stopper thread
* and performs thread migration by bumping thread off CPU then
@@ -5037,35 +4769,43 @@ void idle_task_exit(void)
BUG_ON(cpu_online(smp_processor_id()));
- if (mm != &init_mm)
+ if (mm != &init_mm) {
switch_mm(mm, &init_mm, current);
+ finish_arch_post_lock_switch();
+ }
mmdrop(mm);
}
/*
- * While a dead CPU has no uninterruptible tasks queued at this point,
- * it might still have a nonzero ->nr_uninterruptible counter, because
- * for performance reasons the counter is not stricly tracking tasks to
- * their home CPUs. So we just add the counter to another CPU's counter,
- * to keep the global sum constant after CPU-down:
+ * Since this CPU is going 'away' for a while, fold any nr_active delta
+ * we might have. Assumes we're called after migrate_tasks() so that the
+ * nr_active count is stable.
+ *
+ * Also see the comment "Global load-average calculations".
*/
-static void migrate_nr_uninterruptible(struct rq *rq_src)
+static void calc_load_migrate(struct rq *rq)
{
- struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-
- rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
- rq_src->nr_uninterruptible = 0;
+ long delta = calc_load_fold_active(rq);
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
}
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
- */
-static void calc_global_load_remove(struct rq *rq)
+static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
{
- atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
- rq->calc_load_active = 0;
}
+static const struct sched_class fake_sched_class = {
+ .put_prev_task = put_prev_task_fake,
+};
+
+static struct task_struct fake_task = {
+ /*
+ * Avoid pull_{rt,dl}_task()
+ */
+ .prio = MAX_PRIO + 1,
+ .sched_class = &fake_sched_class,
+};
+
/*
* Migrate all tasks from the rq, sleeping tasks will be migrated by
* try_to_wake_up()->select_task_rq().
@@ -5091,8 +4831,12 @@ static void migrate_tasks(unsigned int dead_cpu)
*/
rq->stop = NULL;
- /* Ensure any throttled groups are reachable by pick_next_task */
- unthrottle_offline_cfs_rqs(rq);
+ /*
+ * put_prev_task() and pick_next_task() sched
+ * class method both need to have an up-to-date
+ * value of rq->clock[_task]
+ */
+ update_rq_clock(rq);
for ( ; ; ) {
/*
@@ -5102,7 +4846,7 @@ static void migrate_tasks(unsigned int dead_cpu)
if (rq->nr_running == 1)
break;
- next = pick_next_task(rq);
+ next = pick_next_task(rq, &fake_task);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
@@ -5168,57 +4912,69 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
*tablep = NULL;
}
+static int min_load_idx = 0;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
+
static void
set_table_entry(struct ctl_table *entry,
const char *procname, void *data, int maxlen,
- umode_t mode, proc_handler *proc_handler)
+ umode_t mode, proc_handler *proc_handler,
+ bool load_idx)
{
entry->procname = procname;
entry->data = data;
entry->maxlen = maxlen;
entry->mode = mode;
entry->proc_handler = proc_handler;
+
+ if (load_idx) {
+ entry->extra1 = &min_load_idx;
+ entry->extra2 = &max_load_idx;
+ }
}
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
- struct ctl_table *table = sd_alloc_ctl_entry(13);
+ struct ctl_table *table = sd_alloc_ctl_entry(14);
if (table == NULL)
return NULL;
set_table_entry(&table[0], "min_interval", &sd->min_interval,
- sizeof(long), 0644, proc_doulongvec_minmax);
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[1], "max_interval", &sd->max_interval,
- sizeof(long), 0644, proc_doulongvec_minmax);
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[9], "cache_nice_tries",
&sd->cache_nice_tries,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[10], "flags", &sd->flags,
- sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[11], "name", sd->name,
- CORENAME_MAX_SIZE, 0444, proc_dostring);
- /* &table[12] is terminator */
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[11], "max_newidle_lb_cost",
+ &sd->max_newidle_lb_cost,
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[12], "name", sd->name,
+ CORENAME_MAX_SIZE, 0444, proc_dostring, false);
+ /* &table[13] is terminator */
return table;
}
-static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
{
struct ctl_table *entry, *table;
struct sched_domain *sd;
@@ -5320,7 +5076,7 @@ static void set_rq_offline(struct rq *rq)
* migration_call - callback that gets triggered when a CPU is added.
* Here we can start up the necessary migration thread for the new CPU.
*/
-static int __cpuinit
+static int
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
int cpu = (long)hcpu;
@@ -5356,9 +5112,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
migrate_tasks(cpu);
BUG_ON(rq->nr_running != 1); /* the migration thread */
raw_spin_unlock_irqrestore(&rq->lock, flags);
+ break;
- migrate_nr_uninterruptible(rq);
- calc_global_load_remove(rq);
+ case CPU_DEAD:
+ calc_load_migrate(rq);
break;
#endif
}
@@ -5373,16 +5130,25 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
* happens before everything else. This has to be lower priority than
* the notifier in the perf_event subsystem, though.
*/
-static struct notifier_block __cpuinitdata migration_notifier = {
+static struct notifier_block migration_notifier = {
.notifier_call = migration_call,
.priority = CPU_PRI_MIGRATION,
};
-static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+static void __cpuinit set_cpu_rq_start_time(void)
+{
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ rq->age_stamp = sched_clock_cpu(cpu);
+}
+
+static int sched_cpu_active(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
+ case CPU_STARTING:
+ set_cpu_rq_start_time();
+ return NOTIFY_OK;
case CPU_DOWN_FAILED:
set_cpu_active((long)hcpu, true);
return NOTIFY_OK;
@@ -5391,16 +5157,34 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
}
}
-static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+static int sched_cpu_inactive(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
+ unsigned long flags;
+ long cpu = (long)hcpu;
+
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_PREPARE:
- set_cpu_active((long)hcpu, false);
+ set_cpu_active(cpu, false);
+
+ /* explicitly allow suspend */
+ if (!(action & CPU_TASKS_FROZEN)) {
+ struct dl_bw *dl_b = dl_bw_of(cpu);
+ bool overflow;
+ int cpus;
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, 0);
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ if (overflow)
+ return notifier_from_errno(-EBUSY);
+ }
return NOTIFY_OK;
- default:
- return NOTIFY_DONE;
}
+
+ return NOTIFY_DONE;
}
static int __init migration_init(void)
@@ -5429,15 +5213,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
#ifdef CONFIG_SCHED_DEBUG
-static __read_mostly int sched_domain_debug_enabled;
+static __read_mostly int sched_debug_enabled;
-static int __init sched_domain_debug_setup(char *str)
+static int __init sched_debug_setup(char *str)
{
- sched_domain_debug_enabled = 1;
+ sched_debug_enabled = 1;
return 0;
}
-early_param("sched_debug", sched_domain_debug_setup);
+early_param("sched_debug", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+ return sched_debug_enabled;
+}
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
struct cpumask *groupmask)
@@ -5477,10 +5266,14 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!group->sgp->power) {
+ /*
+ * Even though we initialize ->capacity to something semi-sane,
+ * we leave capacity_orig unset. This allows us to detect if
+ * domain iteration is still funny without causing /0 traps.
+ */
+ if (!group->sgc->capacity_orig) {
printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: domain->cpu_power not "
- "set\n");
+ printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
break;
}
@@ -5490,7 +5283,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+ if (!(sd->flags & SD_OVERLAP) &&
+ cpumask_intersects(groupmask, sched_group_cpus(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: repeated CPUs\n");
break;
@@ -5501,9 +5295,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
- if (group->sgp->power != SCHED_POWER_SCALE) {
- printk(KERN_CONT " (cpu_power = %d)",
- group->sgp->power);
+ if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
+ printk(KERN_CONT " (cpu_capacity = %d)",
+ group->sgc->capacity);
}
group = group->next;
@@ -5524,7 +5318,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
int level = 0;
- if (!sched_domain_debug_enabled)
+ if (!sched_debug_enabled)
return;
if (!sd) {
@@ -5545,6 +5339,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
}
#else /* !CONFIG_SCHED_DEBUG */
# define sched_domain_debug(sd, cpu) do { } while (0)
+static inline bool sched_debug(void)
+{
+ return false;
+}
#endif /* CONFIG_SCHED_DEBUG */
static int sd_degenerate(struct sched_domain *sd)
@@ -5557,8 +5355,9 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
- SD_SHARE_CPUPOWER |
- SD_SHARE_PKG_RESOURCES)) {
+ SD_SHARE_CPUCAPACITY |
+ SD_SHARE_PKG_RESOURCES |
+ SD_SHARE_POWERDOMAIN)) {
if (sd->groups != sd->groups->next)
return 0;
}
@@ -5587,8 +5386,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
- SD_SHARE_CPUPOWER |
- SD_SHARE_PKG_RESOURCES);
+ SD_SHARE_CPUCAPACITY |
+ SD_SHARE_PKG_RESOURCES |
+ SD_PREFER_SIBLING |
+ SD_SHARE_POWERDOMAIN);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@@ -5603,6 +5404,8 @@ static void free_rootdomain(struct rcu_head *rcu)
struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
cpupri_cleanup(&rd->cpupri);
+ cpudl_cleanup(&rd->cpudl);
+ free_cpumask_var(rd->dlo_mask);
free_cpumask_var(rd->rto_mask);
free_cpumask_var(rd->online);
free_cpumask_var(rd->span);
@@ -5625,7 +5428,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
cpumask_clear_cpu(rq->cpu, old_rd->span);
/*
- * If we dont want to free the old_rt yet then
+ * If we dont want to free the old_rd yet then
* set old_rd to NULL to skip the freeing later
* in this function:
*/
@@ -5654,8 +5457,14 @@ static int init_rootdomain(struct root_domain *rd)
goto out;
if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
goto free_span;
- if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
goto free_online;
+ if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ goto free_dlo_mask;
+
+ init_dl_bw(&rd->dl_bw);
+ if (cpudl_init(&rd->cpudl) != 0)
+ goto free_dlo_mask;
if (cpupri_init(&rd->cpupri) != 0)
goto free_rto_mask;
@@ -5663,6 +5472,8 @@ static int init_rootdomain(struct root_domain *rd)
free_rto_mask:
free_cpumask_var(rd->rto_mask);
+free_dlo_mask:
+ free_cpumask_var(rd->dlo_mask);
free_online:
free_cpumask_var(rd->online);
free_span:
@@ -5700,7 +5511,7 @@ static struct root_domain *alloc_rootdomain(void)
return rd;
}
-static void free_sched_groups(struct sched_group *sg, int free_sgp)
+static void free_sched_groups(struct sched_group *sg, int free_sgc)
{
struct sched_group *tmp, *first;
@@ -5711,8 +5522,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp)
do {
tmp = sg->next;
- if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
- kfree(sg->sgp);
+ if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
+ kfree(sg->sgc);
kfree(sg);
sg = tmp;
@@ -5730,7 +5541,7 @@ static void free_sched_domain(struct rcu_head *rcu)
if (sd->flags & SD_OVERLAP) {
free_sched_groups(sd->groups, 1);
} else if (atomic_dec_and_test(&sd->groups->ref)) {
- kfree(sd->groups->sgp);
+ kfree(sd->groups->sgc);
kfree(sd->groups);
}
kfree(sd);
@@ -5754,22 +5565,39 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
*
* Also keep a unique ID per domain (we use the first cpu number in
* the cpumask of the domain), this allows us to quickly tell if
- * two cpus are in the same cache domain, see ttwu_share_cache().
+ * two cpus are in the same cache domain, see cpus_share_cache().
*/
DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain *, sd_busy);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym);
static void update_top_cache_domain(int cpu)
{
struct sched_domain *sd;
+ struct sched_domain *busy_sd = NULL;
int id = cpu;
+ int size = 1;
sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- if (sd)
+ if (sd) {
id = cpumask_first(sched_domain_span(sd));
+ size = cpumask_weight(sched_domain_span(sd));
+ busy_sd = sd->parent; /* sd_busy */
+ }
+ rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+ per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;
+
+ sd = lowest_flag_domain(cpu, SD_NUMA);
+ rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
+
+ sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
+ rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
}
/*
@@ -5792,6 +5620,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
tmp->parent = parent->parent;
if (parent->parent)
parent->parent->child = tmp;
+ /*
+ * Transfer SD_PREFER_SIBLING down in case of a
+ * degenerate parent; the spans match for this
+ * so the property transfers.
+ */
+ if (parent->flags & SD_PREFER_SIBLING)
+ tmp->flags |= SD_PREFER_SIBLING;
destroy_sched_domain(parent, cpu);
} else
tmp = tmp->parent;
@@ -5828,105 +5663,6 @@ static int __init isolated_cpu_setup(char *str)
__setup("isolcpus=", isolated_cpu_setup);
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
- int i, n, val, min_val, best_node = -1;
-
- min_val = INT_MAX;
-
- for (i = 0; i < nr_node_ids; i++) {
- /* Start at @node */
- n = (node + i) % nr_node_ids;
-
- if (!nr_cpus_node(n))
- continue;
-
- /* Skip already used nodes */
- if (node_isset(n, *used_nodes))
- continue;
-
- /* Simple min distance search */
- val = node_distance(node, n);
-
- if (val < min_val) {
- min_val = val;
- best_node = n;
- }
- }
-
- if (best_node != -1)
- node_set(best_node, *used_nodes);
- return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
- nodemask_t used_nodes;
- int i;
-
- cpumask_clear(span);
- nodes_clear(used_nodes);
-
- cpumask_or(span, span, cpumask_of_node(node));
- node_set(node, used_nodes);
-
- for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
- int next_node = find_next_best_node(node, &used_nodes);
- if (next_node < 0)
- break;
- cpumask_or(span, span, cpumask_of_node(next_node));
- }
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
- lockdep_assert_held(&sched_domains_mutex);
-
- sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
- return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
- return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
-static const struct cpumask *cpu_cpu_mask(int cpu)
-{
- return cpumask_of_node(cpu_to_node(cpu));
-}
-
-int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
-
-struct sd_data {
- struct sched_domain **__percpu sd;
- struct sched_group **__percpu sg;
- struct sched_group_power **__percpu sgp;
-};
-
struct s_data {
struct sched_domain ** __percpu sd;
struct root_domain *rd;
@@ -5939,19 +5675,43 @@ enum s_alloc {
sa_none,
};
-struct sched_domain_topology_level;
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth, make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * cpu they're built on, so check that.
+ *
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+ const struct cpumask *span = sched_domain_span(sd);
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *sibling;
+ int i;
-typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
+ for_each_cpu(i, span) {
+ sibling = *per_cpu_ptr(sdd->sd, i);
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+ continue;
-#define SDTL_OVERLAP 0x01
+ cpumask_set_cpu(i, sched_group_mask(sg));
+ }
+}
-struct sched_domain_topology_level {
- sched_domain_init_f init;
- sched_domain_mask_f mask;
- int flags;
- struct sd_data data;
-};
+/*
+ * Return the canonical balance cpu for this group, this is the first cpu
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+ return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
@@ -5971,6 +5731,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
if (cpumask_test_cpu(i, covered))
continue;
+ child = *per_cpu_ptr(sdd->sd, i);
+
+ /* See the comment near build_group_mask(). */
+ if (!cpumask_test_cpu(i, sched_domain_span(child)))
+ continue;
+
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
GFP_KERNEL, cpu_to_node(cpu));
@@ -5978,8 +5744,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
goto fail;
sg_span = sched_group_cpus(sg);
-
- child = *per_cpu_ptr(sdd->sd, i);
if (child->child) {
child = child->child;
cpumask_copy(sg_span, sched_domain_span(child));
@@ -5988,10 +5752,25 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_or(covered, covered, sg_span);
- sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
- atomic_inc(&sg->sgp->ref);
+ sg->sgc = *per_cpu_ptr(sdd->sgc, i);
+ if (atomic_inc_return(&sg->sgc->ref) == 1)
+ build_group_mask(sd, sg);
- if (cpumask_test_cpu(cpu, sg_span))
+ /*
+ * Initialize sgc->capacity such that even if we mess up the
+ * domains and no possible iteration will get us here, we won't
+ * die on a /0 trap.
+ */
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->capacity_orig = sg->sgc->capacity;
+
+ /*
+ * Make sure the first group of this domain contains the
+ * canonical balance cpu. Otherwise the sched_domain iteration
+ * breaks. See update_sg_lb_stats().
+ */
+ if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
+ group_balance_cpu(sg) == cpu)
groups = sg;
if (!first)
@@ -6021,8 +5800,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
if (sg) {
*sg = *per_cpu_ptr(sdd->sg, cpu);
- (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
- atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+ (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+ atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
}
return cpu;
@@ -6031,7 +5810,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
/*
* build_sched_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_power to 0.
+ * and ->cpu_capacity to 0.
*
* Assumes the sched_domain tree is fully constructed
*/
@@ -6047,7 +5826,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
get_group(cpu, sdd, &sd->groups);
atomic_inc(&sd->groups->ref);
- if (cpu != cpumask_first(sched_domain_span(sd)))
+ if (cpu != cpumask_first(span))
return 0;
lockdep_assert_held(&sched_domains_mutex);
@@ -6057,14 +5836,13 @@ build_sched_groups(struct sched_domain *sd, int cpu)
for_each_cpu(i, span) {
struct sched_group *sg;
- int group = get_group(i, sdd, &sg);
- int j;
+ int group, j;
if (cpumask_test_cpu(i, covered))
continue;
- cpumask_clear(sched_group_cpus(sg));
- sg->sgp->power = 0;
+ group = get_group(i, sdd, &sg);
+ cpumask_setall(sched_group_mask(sg));
for_each_cpu(j, span) {
if (get_group(j, sdd, NULL) != group)
@@ -6086,36 +5864,31 @@ build_sched_groups(struct sched_domain *sd, int cpu)
}
/*
- * Initialize sched groups cpu_power.
+ * Initialize sched groups cpu_capacity.
*
- * cpu_power indicates the capacity of sched group, which is used while
+ * cpu_capacity indicates the capacity of sched group, which is used while
* distributing the load between different sched groups in a sched domain.
- * Typically cpu_power for all the groups in a sched domain will be same unless
- * there are asymmetries in the topology. If there are asymmetries, group
- * having more cpu_power will pickup more load compared to the group having
- * less cpu_power.
+ * Typically cpu_capacity for all the groups in a sched domain will be same
+ * unless there are asymmetries in the topology. If there are asymmetries,
+ * group having more cpu_capacity will pickup more load compared to the
+ * group having less cpu_capacity.
*/
-static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
- WARN_ON(!sd || !sg);
+ WARN_ON(!sg);
do {
sg->group_weight = cpumask_weight(sched_group_cpus(sg));
sg = sg->next;
} while (sg != sd->groups);
- if (cpu != group_first_cpu(sg))
+ if (cpu != group_balance_cpu(sg))
return;
- update_group_power(sd, cpu);
- atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
-}
-
-int __weak arch_sd_sibling_asym_packing(void)
-{
- return 0*SD_ASYM_PACKING;
+ update_group_capacity(sd, cpu);
+ atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
}
/*
@@ -6123,48 +5896,13 @@ int __weak arch_sd_sibling_asym_packing(void)
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
-#ifdef CONFIG_SCHED_DEBUG
-# define SD_INIT_NAME(sd, type) sd->name = #type
-#else
-# define SD_INIT_NAME(sd, type) do { } while (0)
-#endif
-
-#define SD_INIT_FUNC(type) \
-static noinline struct sched_domain * \
-sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
-{ \
- struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
- *sd = SD_##type##_INIT; \
- SD_INIT_NAME(sd, type); \
- sd->private = &tl->data; \
- return sd; \
-}
-
-SD_INIT_FUNC(CPU)
-#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
- SD_INIT_FUNC(NODE)
-#endif
-#ifdef CONFIG_SCHED_SMT
- SD_INIT_FUNC(SIBLING)
-#endif
-#ifdef CONFIG_SCHED_MC
- SD_INIT_FUNC(MC)
-#endif
-#ifdef CONFIG_SCHED_BOOK
- SD_INIT_FUNC(BOOK)
-#endif
-
static int default_relax_domain_level = -1;
int sched_domain_level_max;
static int __init setup_relax_domain_level(char *str)
{
- unsigned long val;
-
- val = simple_strtoul(str, NULL, 0);
- if (val < sched_domain_level_max)
- default_relax_domain_level = val;
+ if (kstrtoint(str, 0, &default_relax_domain_level))
+ pr_warn("Unable to set relax_domain_level\n");
return 1;
}
@@ -6241,46 +5979,399 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
*per_cpu_ptr(sdd->sg, cpu) = NULL;
- if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
- *per_cpu_ptr(sdd->sgp, cpu) = NULL;
+ if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
+ *per_cpu_ptr(sdd->sgc, cpu) = NULL;
}
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *cpu_smt_mask(int cpu)
+#ifdef CONFIG_NUMA
+static int sched_domains_numa_levels;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+#endif
+
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * SD_SHARE_CPUCAPACITY - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN - describes shared power domain
+ *
+ * Odd one out:
+ * SD_ASYM_PACKING - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS \
+ (SD_SHARE_CPUCAPACITY | \
+ SD_SHARE_PKG_RESOURCES | \
+ SD_NUMA | \
+ SD_ASYM_PACKING | \
+ SD_SHARE_POWERDOMAIN)
+
+static struct sched_domain *
+sd_init(struct sched_domain_topology_level *tl, int cpu)
{
- return topology_thread_cpumask(cpu);
-}
+ struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+ int sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+ /*
+ * Ugly hack to pass state to sd_numa_mask()...
+ */
+ sched_domains_curr_level = tl->numa_level;
+#endif
+
+ sd_weight = cpumask_weight(tl->mask(cpu));
+
+ if (tl->sd_flags)
+ sd_flags = (*tl->sd_flags)();
+ if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+ "wrong sd_flags in topology description\n"))
+ sd_flags &= ~TOPOLOGY_SD_FLAGS;
+
+ *sd = (struct sched_domain){
+ .min_interval = sd_weight,
+ .max_interval = 2*sd_weight,
+ .busy_factor = 32,
+ .imbalance_pct = 125,
+
+ .cache_nice_tries = 0,
+ .busy_idx = 0,
+ .idle_idx = 0,
+ .newidle_idx = 0,
+ .wake_idx = 0,
+ .forkexec_idx = 0,
+
+ .flags = 1*SD_LOAD_BALANCE
+ | 1*SD_BALANCE_NEWIDLE
+ | 1*SD_BALANCE_EXEC
+ | 1*SD_BALANCE_FORK
+ | 0*SD_BALANCE_WAKE
+ | 1*SD_WAKE_AFFINE
+ | 0*SD_SHARE_CPUCAPACITY
+ | 0*SD_SHARE_PKG_RESOURCES
+ | 0*SD_SERIALIZE
+ | 0*SD_PREFER_SIBLING
+ | 0*SD_NUMA
+ | sd_flags
+ ,
+
+ .last_balance = jiffies,
+ .balance_interval = sd_weight,
+ .smt_gain = 0,
+ .max_newidle_lb_cost = 0,
+ .next_decay_max_lb_cost = jiffies,
+#ifdef CONFIG_SCHED_DEBUG
+ .name = tl->name,
#endif
+ };
+
+ /*
+ * Convert topological properties into behaviour.
+ */
+
+ if (sd->flags & SD_SHARE_CPUCAPACITY) {
+ sd->imbalance_pct = 110;
+ sd->smt_gain = 1178; /* ~15% */
+
+ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ sd->imbalance_pct = 117;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+ } else if (sd->flags & SD_NUMA) {
+ sd->cache_nice_tries = 2;
+ sd->busy_idx = 3;
+ sd->idle_idx = 2;
+
+ sd->flags |= SD_SERIALIZE;
+ if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ sd->flags &= ~(SD_BALANCE_EXEC |
+ SD_BALANCE_FORK |
+ SD_WAKE_AFFINE);
+ }
+
+#endif
+ } else {
+ sd->flags |= SD_PREFER_SIBLING;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+ sd->idle_idx = 1;
+ }
+
+ sd->private = &tl->data;
+
+ return sd;
+}
/*
* Topology list, bottom-up.
*/
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
- { sd_init_SIBLING, cpu_smt_mask, },
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
- { sd_init_MC, cpu_coregroup_mask, },
-#endif
-#ifdef CONFIG_SCHED_BOOK
- { sd_init_BOOK, cpu_book_mask, },
-#endif
- { sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
- { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
- { sd_init_ALLNODES, cpu_allnodes_mask, },
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};
-static struct sched_domain_topology_level *sched_domain_topology = default_topology;
+struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+#define for_each_sd_topology(tl) \
+ for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+ sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+ return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_numa_warn(const char *str)
+{
+ static int done = false;
+ int i,j;
+
+ if (done)
+ return;
+
+ done = true;
+
+ printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+ for (i = 0; i < nr_node_ids; i++) {
+ printk(KERN_WARNING " ");
+ for (j = 0; j < nr_node_ids; j++)
+ printk(KERN_CONT "%02d ", node_distance(i,j));
+ printk(KERN_CONT "\n");
+ }
+ printk(KERN_WARNING "\n");
+}
+
+static bool find_numa_distance(int distance)
+{
+ int i;
+
+ if (distance == node_distance(0, 0))
+ return true;
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ if (sched_domains_numa_distance[i] == distance)
+ return true;
+ }
+
+ return false;
+}
+
+static void sched_init_numa(void)
+{
+ int next_distance, curr_distance = node_distance(0, 0);
+ struct sched_domain_topology_level *tl;
+ int level = 0;
+ int i, j, k;
+
+ sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+ if (!sched_domains_numa_distance)
+ return;
+
+ /*
+ * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+ * unique distances in the node_distance() table.
+ *
+ * Assumes node_distance(0,j) includes all distances in
+ * node_distance(i,j) in order to avoid cubic time.
+ */
+ next_distance = curr_distance;
+ for (i = 0; i < nr_node_ids; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ for (k = 0; k < nr_node_ids; k++) {
+ int distance = node_distance(i, k);
+
+ if (distance > curr_distance &&
+ (distance < next_distance ||
+ next_distance == curr_distance))
+ next_distance = distance;
+
+ /*
+ * While not a strong assumption it would be nice to know
+ * about cases where if node A is connected to B, B is not
+ * equally connected to A.
+ */
+ if (sched_debug() && node_distance(k, i) != distance)
+ sched_numa_warn("Node-distance not symmetric");
+
+ if (sched_debug() && i && !find_numa_distance(distance))
+ sched_numa_warn("Node-0 not representative");
+ }
+ if (next_distance != curr_distance) {
+ sched_domains_numa_distance[level++] = next_distance;
+ sched_domains_numa_levels = level;
+ curr_distance = next_distance;
+ } else break;
+ }
+
+ /*
+ * In case of sched_debug() we verify the above assumption.
+ */
+ if (!sched_debug())
+ break;
+ }
+ /*
+ * 'level' contains the number of unique distances, excluding the
+ * identity distance node_distance(i,i).
+ *
+ * The sched_domains_numa_distance[] array includes the actual distance
+ * numbers.
+ */
+
+ /*
+ * Here, we should temporarily reset sched_domains_numa_levels to 0.
+ * If it fails to allocate memory for array sched_domains_numa_masks[][],
+ * the array will contain less then 'level' members. This could be
+ * dangerous when we use it to iterate array sched_domains_numa_masks[][]
+ * in other functions.
+ *
+ * We reset it to 'level' at the end of this function.
+ */
+ sched_domains_numa_levels = 0;
+
+ sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+ if (!sched_domains_numa_masks)
+ return;
+
+ /*
+ * Now for each level, construct a mask per node which contains all
+ * cpus of nodes that are that many hops away from us.
+ */
+ for (i = 0; i < level; i++) {
+ sched_domains_numa_masks[i] =
+ kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+ if (!sched_domains_numa_masks[i])
+ return;
+
+ for (j = 0; j < nr_node_ids; j++) {
+ struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ if (!mask)
+ return;
+
+ sched_domains_numa_masks[i][j] = mask;
+
+ for (k = 0; k < nr_node_ids; k++) {
+ if (node_distance(j, k) > sched_domains_numa_distance[i])
+ continue;
+
+ cpumask_or(mask, mask, cpumask_of_node(k));
+ }
+ }
+ }
+
+ /* Compute default topology size */
+ for (i = 0; sched_domain_topology[i].mask; i++);
+
+ tl = kzalloc((i + level + 1) *
+ sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+ if (!tl)
+ return;
+
+ /*
+ * Copy the default topology bits..
+ */
+ for (i = 0; sched_domain_topology[i].mask; i++)
+ tl[i] = sched_domain_topology[i];
+
+ /*
+ * .. and append 'j' levels of NUMA goodness.
+ */
+ for (j = 0; j < level; i++, j++) {
+ tl[i] = (struct sched_domain_topology_level){
+ .mask = sd_numa_mask,
+ .sd_flags = cpu_numa_flags,
+ .flags = SDTL_OVERLAP,
+ .numa_level = j,
+ SD_INIT_NAME(NUMA)
+ };
+ }
+
+ sched_domain_topology = tl;
+
+ sched_domains_numa_levels = level;
+}
+
+static void sched_domains_numa_masks_set(int cpu)
+{
+ int i, j;
+ int node = cpu_to_node(cpu);
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ if (node_distance(j, node) <= sched_domains_numa_distance[i])
+ cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+ }
+}
+
+static void sched_domains_numa_masks_clear(int cpu)
+{
+ int i, j;
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++)
+ cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+}
+
+/*
+ * Update sched_domains_numa_masks[level][node] array when new cpus
+ * are onlined.
+ */
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_ONLINE:
+ sched_domains_numa_masks_set(cpu);
+ break;
+
+ case CPU_DEAD:
+ sched_domains_numa_masks_clear(cpu);
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ return 0;
+}
+#endif /* CONFIG_NUMA */
static int __sdt_alloc(const struct cpumask *cpu_map)
{
struct sched_domain_topology_level *tl;
int j;
- for (tl = sched_domain_topology; tl->init; tl++) {
+ for_each_sd_topology(tl) {
struct sd_data *sdd = &tl->data;
sdd->sd = alloc_percpu(struct sched_domain *);
@@ -6291,14 +6382,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sdd->sg)
return -ENOMEM;
- sdd->sgp = alloc_percpu(struct sched_group_power *);
- if (!sdd->sgp)
+ sdd->sgc = alloc_percpu(struct sched_group_capacity *);
+ if (!sdd->sgc)
return -ENOMEM;
for_each_cpu(j, cpu_map) {
struct sched_domain *sd;
struct sched_group *sg;
- struct sched_group_power *sgp;
+ struct sched_group_capacity *sgc;
sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
@@ -6312,14 +6403,16 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sg)
return -ENOMEM;
+ sg->next = sg;
+
*per_cpu_ptr(sdd->sg, j) = sg;
- sgp = kzalloc_node(sizeof(struct sched_group_power),
+ sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
- if (!sgp)
+ if (!sgc)
return -ENOMEM;
- *per_cpu_ptr(sdd->sgp, j) = sgp;
+ *per_cpu_ptr(sdd->sgc, j) = sgc;
}
}
@@ -6331,40 +6424,49 @@ static void __sdt_free(const struct cpumask *cpu_map)
struct sched_domain_topology_level *tl;
int j;
- for (tl = sched_domain_topology; tl->init; tl++) {
+ for_each_sd_topology(tl) {
struct sd_data *sdd = &tl->data;
for_each_cpu(j, cpu_map) {
- struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
- if (sd && (sd->flags & SD_OVERLAP))
- free_sched_groups(sd->groups, 0);
- kfree(*per_cpu_ptr(sdd->sd, j));
- kfree(*per_cpu_ptr(sdd->sg, j));
- kfree(*per_cpu_ptr(sdd->sgp, j));
+ struct sched_domain *sd;
+
+ if (sdd->sd) {
+ sd = *per_cpu_ptr(sdd->sd, j);
+ if (sd && (sd->flags & SD_OVERLAP))
+ free_sched_groups(sd->groups, 0);
+ kfree(*per_cpu_ptr(sdd->sd, j));
+ }
+
+ if (sdd->sg)
+ kfree(*per_cpu_ptr(sdd->sg, j));
+ if (sdd->sgc)
+ kfree(*per_cpu_ptr(sdd->sgc, j));
}
free_percpu(sdd->sd);
+ sdd->sd = NULL;
free_percpu(sdd->sg);
- free_percpu(sdd->sgp);
+ sdd->sg = NULL;
+ free_percpu(sdd->sgc);
+ sdd->sgc = NULL;
}
}
struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
- struct s_data *d, const struct cpumask *cpu_map,
- struct sched_domain_attr *attr, struct sched_domain *child,
- int cpu)
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *child, int cpu)
{
- struct sched_domain *sd = tl->init(tl, cpu);
+ struct sched_domain *sd = sd_init(tl, cpu);
if (!sd)
return child;
- set_domain_attribute(sd, attr);
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
if (child) {
sd->level = child->level + 1;
sched_domain_level_max = max(sched_domain_level_max, sd->level);
child->parent = sd;
+ sd->child = child;
}
- sd->child = child;
+ set_domain_attribute(sd, attr);
return sd;
}
@@ -6376,7 +6478,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
static int build_sched_domains(const struct cpumask *cpu_map,
struct sched_domain_attr *attr)
{
- enum s_alloc alloc_state = sa_none;
+ enum s_alloc alloc_state;
struct sched_domain *sd;
struct s_data d;
int i, ret = -ENOMEM;
@@ -6390,18 +6492,15 @@ static int build_sched_domains(const struct cpumask *cpu_map,
struct sched_domain_topology_level *tl;
sd = NULL;
- for (tl = sched_domain_topology; tl->init; tl++) {
- sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+ for_each_sd_topology(tl) {
+ sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+ if (tl == sched_domain_topology)
+ *per_cpu_ptr(d.sd, i) = sd;
if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
sd->flags |= SD_OVERLAP;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
}
-
- while (sd->child)
- sd = sd->child;
-
- *per_cpu_ptr(d.sd, i) = sd;
}
/* Build the groups for the domains */
@@ -6418,14 +6517,14 @@ static int build_sched_domains(const struct cpumask *cpu_map,
}
}
- /* Calculate CPU power for physical packages and nodes */
+ /* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
if (!cpumask_test_cpu(i, cpu_map))
continue;
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
claim_allocations(i, sd);
- init_sched_groups_power(i, sd);
+ init_sched_groups_capacity(i, sd);
}
}
@@ -6460,7 +6559,7 @@ static cpumask_var_t fallback_doms;
* cpu core maps. It is supposed to return 1 if the topology changed
* or 0 if it stayed the same.
*/
-int __attribute__((weak)) arch_update_cpu_topology(void)
+int __weak arch_update_cpu_topology(void)
{
return 0;
}
@@ -6505,7 +6604,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
if (!doms_cur)
doms_cur = &fallback_doms;
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
- dattr_cur = NULL;
err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();
@@ -6597,8 +6695,9 @@ match1:
;
}
+ n = ndoms_cur;
if (doms_new == NULL) {
- ndoms_cur = 0;
+ n = 0;
doms_new = &fallback_doms;
cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
WARN_ON_ONCE(dattr_new);
@@ -6606,7 +6705,7 @@ match1:
/* Build new domains */
for (i = 0; i < ndoms_new; i++) {
- for (j = 0; j < ndoms_cur && !new_topology; j++) {
+ for (j = 0; j < n && !new_topology; j++) {
if (cpumask_equal(doms_new[i], doms_cur[j])
&& dattrs_equal(dattr_new, i, dattr_cur, j))
goto match2;
@@ -6630,125 +6729,66 @@ match2:
mutex_unlock(&sched_domains_mutex);
}
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static void reinit_sched_domains(void)
-{
- get_online_cpus();
-
- /* Destroy domains first to force the rebuild */
- partition_sched_domains(0, NULL, NULL);
-
- rebuild_sched_domains();
- put_online_cpus();
-}
-
-static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
-{
- unsigned int level = 0;
-
- if (sscanf(buf, "%u", &level) != 1)
- return -EINVAL;
-
- /*
- * level is always be positive so don't check for
- * level < POWERSAVINGS_BALANCE_NONE which is 0
- * What happens on 0 or 1 byte write,
- * need to check for count as well?
- */
-
- if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
- return -EINVAL;
-
- if (smt)
- sched_smt_power_savings = level;
- else
- sched_mc_power_savings = level;
-
- reinit_sched_domains();
-
- return count;
-}
-
-#ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%u\n", sched_mc_power_savings);
-}
-static ssize_t sched_mc_power_savings_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- return sched_power_savings_store(buf, count, 0);
-}
-static DEVICE_ATTR(sched_mc_power_savings, 0644,
- sched_mc_power_savings_show,
- sched_mc_power_savings_store);
-#endif
-
-#ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%u\n", sched_smt_power_savings);
-}
-static ssize_t sched_smt_power_savings_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- return sched_power_savings_store(buf, count, 1);
-}
-static DEVICE_ATTR(sched_smt_power_savings, 0644,
- sched_smt_power_savings_show,
- sched_smt_power_savings_store);
-#endif
-
-int __init sched_create_sysfs_power_savings_entries(struct device *dev)
-{
- int err = 0;
-
-#ifdef CONFIG_SCHED_SMT
- if (smt_capable())
- err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
-#endif
-#ifdef CONFIG_SCHED_MC
- if (!err && mc_capable())
- err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
-#endif
- return err;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
/*
* Update cpusets according to cpu_active mask. If cpusets are
* disabled, cpuset_update_active_cpus() becomes a simple wrapper
* around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
*/
static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
+ case CPU_ONLINE_FROZEN:
+ case CPU_DOWN_FAILED_FROZEN:
+
+ /*
+ * num_cpus_frozen tracks how many CPUs are involved in suspend
+ * resume sequence. As long as this is not the last online
+ * operation in the resume sequence, just build a single sched
+ * domain, ignoring cpusets.
+ */
+ num_cpus_frozen--;
+ if (likely(num_cpus_frozen)) {
+ partition_sched_domains(1, NULL, NULL);
+ break;
+ }
+
+ /*
+ * This is the last CPU online operation. So fall through and
+ * restore the original sched domains by considering the
+ * cpuset configurations.
+ */
+
case CPU_ONLINE:
case CPU_DOWN_FAILED:
- cpuset_update_active_cpus();
- return NOTIFY_OK;
+ cpuset_update_active_cpus(true);
+ break;
default:
return NOTIFY_DONE;
}
+ return NOTIFY_OK;
}
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
case CPU_DOWN_PREPARE:
- cpuset_update_active_cpus();
- return NOTIFY_OK;
+ cpuset_update_active_cpus(false);
+ break;
+ case CPU_DOWN_PREPARE_FROZEN:
+ num_cpus_frozen++;
+ partition_sched_domains(1, NULL, NULL);
+ break;
default:
return NOTIFY_DONE;
}
+ return NOTIFY_OK;
}
void __init sched_init_smp(void)
@@ -6758,21 +6798,24 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
- get_online_cpus();
+ sched_init_numa();
+
+ /*
+ * There's no userspace yet to cause hotplug operations; hence all the
+ * cpu masks are stable and all blatant races in the below code cannot
+ * happen.
+ */
mutex_lock(&sched_domains_mutex);
init_sched_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
- put_online_cpus();
+ hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
- /* RT runtime code needs to handle some hotplug events */
- hotcpu_notifier(update_runtime, 0);
-
init_hrtick();
/* Move init over to a non-isolated CPU */
@@ -6782,6 +6825,7 @@ void __init sched_init_smp(void)
free_cpumask_var(non_isolated_cpus);
init_sched_rt_class();
+ init_sched_dl_class();
}
#else
void __init sched_init_smp(void)
@@ -6800,10 +6844,15 @@ int in_sched_functions(unsigned long addr)
}
#ifdef CONFIG_CGROUP_SCHED
+/*
+ * Default task group.
+ * Every task in system belongs to this group at bootup.
+ */
struct task_group root_task_group;
+LIST_HEAD(task_groups);
#endif
-DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
void __init sched_init(void)
{
@@ -6840,19 +6889,21 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CPUMASK_OFFSTACK
for_each_possible_cpu(i) {
- per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+ per_cpu(load_balance_mask, i) = (void *)ptr;
ptr += cpumask_size();
}
#endif /* CONFIG_CPUMASK_OFFSTACK */
}
+ init_rt_bandwidth(&def_rt_bandwidth,
+ global_rt_period(), global_rt_runtime());
+ init_dl_bandwidth(&def_dl_bandwidth,
+ global_rt_period(), global_rt_runtime());
+
#ifdef CONFIG_SMP
init_defrootdomain();
#endif
- init_rt_bandwidth(&def_rt_bandwidth,
- global_rt_period(), global_rt_runtime());
-
#ifdef CONFIG_RT_GROUP_SCHED
init_rt_bandwidth(&root_task_group.rt_bandwidth,
global_rt_period(), global_rt_runtime());
@@ -6866,12 +6917,6 @@ void __init sched_init(void)
#endif /* CONFIG_CGROUP_SCHED */
-#ifdef CONFIG_CGROUP_CPUACCT
- root_cpuacct.cpustat = &kernel_cpustat;
- root_cpuacct.cpuusage = alloc_percpu(u64);
- /* Too early, not expected to fail */
- BUG_ON(!root_cpuacct.cpuusage);
-#endif
for_each_possible_cpu(i) {
struct rq *rq;
@@ -6882,6 +6927,7 @@ void __init sched_init(void)
rq->calc_load_update = jiffies + LOAD_FREQ;
init_cfs_rq(&rq->cfs);
init_rt_rq(&rq->rt, rq);
+ init_dl_rq(&rq->dl, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -6910,7 +6956,6 @@ void __init sched_init(void)
rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
- INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
@@ -6922,7 +6967,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_power = SCHED_POWER_SCALE;
+ rq->cpu_capacity = SCHED_CAPACITY_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
@@ -6931,10 +6976,17 @@ void __init sched_init(void)
rq->online = 0;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
+ rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+
+ INIT_LIST_HEAD(&rq->cfs_tasks);
+
rq_attach_root(rq, &def_root_domain);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
rq->nohz_flags = 0;
#endif
+#ifdef CONFIG_NO_HZ_FULL
+ rq->last_sched_tick = 0;
+#endif
#endif
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
@@ -6946,10 +6998,6 @@ void __init sched_init(void)
INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif
-#ifdef CONFIG_RT_MUTEXES
- plist_head_init(&init_task.pi_waiters);
-#endif
-
/*
* The boot idle thread does lazy MMU switching as well:
*/
@@ -6976,6 +7024,8 @@ void __init sched_init(void)
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+ idle_thread_set_boot_cpu();
+ set_cpu_rq_start_time();
#endif
init_sched_fair_class();
@@ -6995,7 +7045,8 @@ void __might_sleep(const char *file, int line, int preempt_offset)
static unsigned long prev_jiffy; /* ratelimiting */
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
- if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+ !is_idle_task(current)) ||
system_state != SYSTEM_RUNNING || oops_in_progress)
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
@@ -7013,6 +7064,13 @@ void __might_sleep(const char *file, int line, int preempt_offset)
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (!preempt_count_equals(preempt_offset)) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(current->preempt_disable_ip);
+ pr_cont("\n");
+ }
+#endif
dump_stack();
}
EXPORT_SYMBOL(__might_sleep);
@@ -7022,13 +7080,16 @@ EXPORT_SYMBOL(__might_sleep);
static void normalize_task(struct rq *rq, struct task_struct *p)
{
const struct sched_class *prev_class = p->sched_class;
+ struct sched_attr attr = {
+ .sched_policy = SCHED_NORMAL,
+ };
int old_prio = p->prio;
int on_rq;
on_rq = p->on_rq;
if (on_rq)
dequeue_task(rq, p, 0);
- __setscheduler(rq, p, SCHED_NORMAL, 0);
+ __setscheduler(rq, p, &attr);
if (on_rq) {
enqueue_task(rq, p, 0);
resched_task(rq->curr);
@@ -7058,12 +7119,12 @@ void normalize_rt_tasks(void)
p->se.statistics.block_start = 0;
#endif
- if (!rt_task(p)) {
+ if (!dl_task(p) && !rt_task(p)) {
/*
* Renice negative nice level userspace
* tasks back to 0:
*/
- if (TASK_NICE(p) < 0 && p->mm)
+ if (task_nice(p) < 0 && p->mm)
set_user_nice(p, 0);
continue;
}
@@ -7098,6 +7159,8 @@ void normalize_rt_tasks(void)
* @cpu: the processor in question.
*
* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ *
+ * Return: The current task for @cpu.
*/
struct task_struct *curr_task(int cpu)
{
@@ -7145,7 +7208,6 @@ static void free_sched_group(struct task_group *tg)
struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
- unsigned long flags;
tg = kzalloc(sizeof(*tg), GFP_KERNEL);
if (!tg)
@@ -7157,6 +7219,17 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
+ return tg;
+
+err:
+ free_sched_group(tg);
+ return ERR_PTR(-ENOMEM);
+}
+
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+ unsigned long flags;
+
spin_lock_irqsave(&task_group_lock, flags);
list_add_rcu(&tg->list, &task_groups);
@@ -7166,12 +7239,6 @@ struct task_group *sched_create_group(struct task_group *parent)
INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
-
- return tg;
-
-err:
- free_sched_group(tg);
- return ERR_PTR(-ENOMEM);
}
/* rcu callback to free various structures associated with a task group */
@@ -7184,6 +7251,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
/* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg)
{
+ /* wait for possible concurrent references to cfs_rqs complete */
+ call_rcu(&tg->rcu, free_sched_group_rcu);
+}
+
+void sched_offline_group(struct task_group *tg)
+{
unsigned long flags;
int i;
@@ -7195,9 +7268,6 @@ void sched_destroy_group(struct task_group *tg)
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
-
- /* wait for possible concurrent references to cfs_rqs complete */
- call_rcu(&tg->rcu, free_sched_group_rcu);
}
/* change task's runqueue when it moves between groups.
@@ -7207,6 +7277,7 @@ void sched_destroy_group(struct task_group *tg)
*/
void sched_move_task(struct task_struct *tsk)
{
+ struct task_group *tg;
int on_rq, running;
unsigned long flags;
struct rq *rq;
@@ -7221,6 +7292,12 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
+ tg = container_of(task_css_check(tsk, cpu_cgrp_id,
+ lockdep_is_held(&tsk->sighand->siglock)),
+ struct task_group, css);
+ tg = autogroup_task_group(tsk, tg);
+ tsk->sched_task_group = tg;
+
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
tsk->sched_class->task_move_group(tsk, on_rq);
@@ -7237,16 +7314,6 @@ void sched_move_task(struct task_struct *tsk)
}
#endif /* CONFIG_CGROUP_SCHED */
-#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
-static unsigned long to_ratio(u64 period, u64 runtime)
-{
- if (runtime == RUNTIME_INF)
- return 1ULL << 20;
-
- return div64_u64(runtime << 20, period);
-}
-#endif
-
#ifdef CONFIG_RT_GROUP_SCHED
/*
* Ensure that the real time constraints are schedulable.
@@ -7375,7 +7442,7 @@ unlock:
return err;
}
-int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
{
u64 rt_runtime, rt_period;
@@ -7387,7 +7454,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
-long sched_group_rt_runtime(struct task_group *tg)
+static long sched_group_rt_runtime(struct task_group *tg)
{
u64 rt_runtime_us;
@@ -7399,7 +7466,7 @@ long sched_group_rt_runtime(struct task_group *tg)
return rt_runtime_us;
}
-int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
{
u64 rt_runtime, rt_period;
@@ -7412,7 +7479,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
-long sched_group_rt_period(struct task_group *tg)
+static long sched_group_rt_period(struct task_group *tg)
{
u64 rt_period_us;
@@ -7420,24 +7487,13 @@ long sched_group_rt_period(struct task_group *tg)
do_div(rt_period_us, NSEC_PER_USEC);
return rt_period_us;
}
+#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_RT_GROUP_SCHED
static int sched_rt_global_constraints(void)
{
- u64 runtime, period;
int ret = 0;
- if (sysctl_sched_rt_period <= 0)
- return -EINVAL;
-
- runtime = global_rt_runtime();
- period = global_rt_period();
-
- /*
- * Sanity check on the sysctl variables.
- */
- if (runtime > period && runtime != RUNTIME_INF)
- return -EINVAL;
-
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
ret = __rt_schedulable(NULL, 0, 0);
@@ -7447,7 +7503,7 @@ static int sched_rt_global_constraints(void)
return ret;
}
-int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
/* Don't accept realtime tasks when there is no way for them to run */
if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
@@ -7460,17 +7516,7 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
static int sched_rt_global_constraints(void)
{
unsigned long flags;
- int i;
-
- if (sysctl_sched_rt_period <= 0)
- return -EINVAL;
-
- /*
- * There's always some RT tasks in the root group
- * -- migration, kstopmachine etc..
- */
- if (sysctl_sched_rt_runtime == 0)
- return -EBUSY;
+ int i, ret = 0;
raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
for_each_possible_cpu(i) {
@@ -7482,17 +7528,91 @@ static int sched_rt_global_constraints(void)
}
raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
- return 0;
+ return ret;
}
#endif /* CONFIG_RT_GROUP_SCHED */
+static int sched_dl_global_constraints(void)
+{
+ u64 runtime = global_rt_runtime();
+ u64 period = global_rt_period();
+ u64 new_bw = to_ratio(period, runtime);
+ int cpu, ret = 0;
+ unsigned long flags;
+
+ /*
+ * Here we want to check the bandwidth not being set to some
+ * value smaller than the currently allocated bandwidth in
+ * any of the root_domains.
+ *
+ * FIXME: Cycling on all the CPUs is overdoing, but simpler than
+ * cycling on root_domains... Discussion on different/better
+ * solutions is welcome!
+ */
+ for_each_possible_cpu(cpu) {
+ struct dl_bw *dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ if (new_bw < dl_b->total_bw)
+ ret = -EBUSY;
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static void sched_dl_do_global(void)
+{
+ u64 new_bw = -1;
+ int cpu;
+ unsigned long flags;
+
+ def_dl_bandwidth.dl_period = global_rt_period();
+ def_dl_bandwidth.dl_runtime = global_rt_runtime();
+
+ if (global_rt_runtime() != RUNTIME_INF)
+ new_bw = to_ratio(global_rt_period(), global_rt_runtime());
+
+ /*
+ * FIXME: As above...
+ */
+ for_each_possible_cpu(cpu) {
+ struct dl_bw *dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ dl_b->bw = new_bw;
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ }
+}
+
+static int sched_rt_global_validate(void)
+{
+ if (sysctl_sched_rt_period <= 0)
+ return -EINVAL;
+
+ if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
+ (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void sched_rt_do_global(void)
+{
+ def_rt_bandwidth.rt_runtime = global_rt_runtime();
+ def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+}
+
int sched_rt_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret;
int old_period, old_runtime;
static DEFINE_MUTEX(mutex);
+ int ret;
mutex_lock(&mutex);
old_period = sysctl_sched_rt_period;
@@ -7501,41 +7621,68 @@ int sched_rt_handler(struct ctl_table *table, int write,
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (!ret && write) {
+ ret = sched_rt_global_validate();
+ if (ret)
+ goto undo;
+
ret = sched_rt_global_constraints();
- if (ret) {
- sysctl_sched_rt_period = old_period;
- sysctl_sched_rt_runtime = old_runtime;
- } else {
- def_rt_bandwidth.rt_runtime = global_rt_runtime();
- def_rt_bandwidth.rt_period =
- ns_to_ktime(global_rt_period());
- }
+ if (ret)
+ goto undo;
+
+ ret = sched_dl_global_constraints();
+ if (ret)
+ goto undo;
+
+ sched_rt_do_global();
+ sched_dl_do_global();
+ }
+ if (0) {
+undo:
+ sysctl_sched_rt_period = old_period;
+ sysctl_sched_rt_runtime = old_runtime;
}
mutex_unlock(&mutex);
return ret;
}
+int sched_rr_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ static DEFINE_MUTEX(mutex);
+
+ mutex_lock(&mutex);
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ /* make sure that internally we keep jiffies */
+ /* also, writing zero resets timeslice to default */
+ if (!ret && write) {
+ sched_rr_timeslice = sched_rr_timeslice <= 0 ?
+ RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+ }
+ mutex_unlock(&mutex);
+ return ret;
+}
+
#ifdef CONFIG_CGROUP_SCHED
-/* return corresponding task_group object of a cgroup */
-static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
{
- return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
- struct task_group, css);
+ return css ? container_of(css, struct task_group, css) : NULL;
}
static struct cgroup_subsys_state *
-cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
- struct task_group *tg, *parent;
+ struct task_group *parent = css_tg(parent_css);
+ struct task_group *tg;
- if (!cgrp->parent) {
+ if (!parent) {
/* This is early initialization for the top cgroup */
return &root_task_group.css;
}
- parent = cgroup_tg(cgrp->parent);
tg = sched_create_group(parent);
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
@@ -7543,22 +7690,38 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
return &tg->css;
}
-static void
-cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css->parent);
+
+ if (parent)
+ sched_online_group(tg, parent);
+ return 0;
+}
+
+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *tg = css_tg(css);
sched_destroy_group(tg);
}
-static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+
+ sched_offline_group(tg);
+}
+
+static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
struct task_struct *task;
- cgroup_taskset_for_each(task, cgrp, tset) {
+ cgroup_taskset_for_each(task, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
- if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
+ if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
#else
/* We don't support RT-tasks being in separate groups */
@@ -7569,18 +7732,18 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
return 0;
}
-static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
struct task_struct *task;
- cgroup_taskset_for_each(task, cgrp, tset)
+ cgroup_taskset_for_each(task, tset)
sched_move_task(task);
}
-static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cgrp, struct task_struct *task)
+static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
{
/*
* cgroup_exit() is called in the copy_process() failure path.
@@ -7594,15 +7757,16 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
- u64 shareval)
+static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 shareval)
{
- return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
+ return sched_group_set_shares(css_tg(css), scale_load(shareval));
}
-static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *tg = css_tg(css);
return (u64) scale_load_down(tg->shares);
}
@@ -7646,7 +7810,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
runtime_enabled = quota != RUNTIME_INF;
runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
- account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
+ /*
+ * If we need to toggle cfs_bandwidth_used, off->on must occur
+ * before making related changes, and on->off must occur afterwards
+ */
+ if (runtime_enabled && !runtime_was_enabled)
+ cfs_bandwidth_usage_inc();
raw_spin_lock_irq(&cfs_b->lock);
cfs_b->period = ns_to_ktime(period);
cfs_b->quota = quota;
@@ -7655,8 +7824,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
/* restart the period timer (if active) to handle new period expiry */
if (runtime_enabled && cfs_b->timer_active) {
/* force a reprogram */
- cfs_b->timer_active = 0;
- __start_cfs_bandwidth(cfs_b);
+ __start_cfs_bandwidth(cfs_b, true);
}
raw_spin_unlock_irq(&cfs_b->lock);
@@ -7672,6 +7840,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
unthrottle_cfs_rq(cfs_rq);
raw_spin_unlock_irq(&rq->lock);
}
+ if (runtime_was_enabled && !runtime_enabled)
+ cfs_bandwidth_usage_dec();
out_unlock:
mutex_unlock(&cfs_constraints_mutex);
@@ -7724,26 +7894,28 @@ long tg_get_cfs_period(struct task_group *tg)
return cfs_period_us;
}
-static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
+static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return tg_get_cfs_quota(cgroup_tg(cgrp));
+ return tg_get_cfs_quota(css_tg(css));
}
-static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
- s64 cfs_quota_us)
+static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, s64 cfs_quota_us)
{
- return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
+ return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
}
-static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return tg_get_cfs_period(cgroup_tg(cgrp));
+ return tg_get_cfs_period(css_tg(css));
}
-static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
- u64 cfs_period_us)
+static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 cfs_period_us)
{
- return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+ return tg_set_cfs_period(css_tg(css), cfs_period_us);
}
struct cfs_schedulable_data {
@@ -7824,15 +7996,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
return ret;
}
-static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
- struct cgroup_map_cb *cb)
+static int cpu_stats_show(struct seq_file *sf, void *v)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *tg = css_tg(seq_css(sf));
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
- cb->fill(cb, "nr_periods", cfs_b->nr_periods);
- cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
- cb->fill(cb, "throttled_time", cfs_b->throttled_time);
+ seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
+ seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
+ seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
return 0;
}
@@ -7840,26 +8011,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
-static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
- s64 val)
+static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 val)
{
- return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
+ return sched_group_set_rt_runtime(css_tg(css), val);
}
-static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
+static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return sched_group_rt_runtime(cgroup_tg(cgrp));
+ return sched_group_rt_runtime(css_tg(css));
}
-static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
- u64 rt_period_us)
+static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 rt_period_us)
{
- return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
+ return sched_group_set_rt_period(css_tg(css), rt_period_us);
}
-static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return sched_group_rt_period(cgroup_tg(cgrp));
+ return sched_group_rt_period(css_tg(css));
}
#endif /* CONFIG_RT_GROUP_SCHED */
@@ -7884,7 +8057,7 @@ static struct cftype cpu_files[] = {
},
{
.name = "stat",
- .read_map = cpu_stats_show,
+ .seq_show = cpu_stats_show,
},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -7899,247 +8072,25 @@ static struct cftype cpu_files[] = {
.write_u64 = cpu_rt_period_write_uint,
},
#endif
+ { } /* terminate */
};
-static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
-}
-
-struct cgroup_subsys cpu_cgroup_subsys = {
- .name = "cpu",
- .create = cpu_cgroup_create,
- .destroy = cpu_cgroup_destroy,
+struct cgroup_subsys cpu_cgrp_subsys = {
+ .css_alloc = cpu_cgroup_css_alloc,
+ .css_free = cpu_cgroup_css_free,
+ .css_online = cpu_cgroup_css_online,
+ .css_offline = cpu_cgroup_css_offline,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
- .populate = cpu_cgroup_populate,
- .subsys_id = cpu_cgroup_subsys_id,
+ .base_cftypes = cpu_files,
.early_init = 1,
};
#endif /* CONFIG_CGROUP_SCHED */
-#ifdef CONFIG_CGROUP_CPUACCT
-
-/*
- * CPU accounting code for task groups.
- *
- * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
- * (balbir@in.ibm.com).
- */
-
-/* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_create(
- struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
- struct cpuacct *ca;
-
- if (!cgrp->parent)
- return &root_cpuacct.css;
-
- ca = kzalloc(sizeof(*ca), GFP_KERNEL);
- if (!ca)
- goto out;
-
- ca->cpuusage = alloc_percpu(u64);
- if (!ca->cpuusage)
- goto out_free_ca;
-
- ca->cpustat = alloc_percpu(struct kernel_cpustat);
- if (!ca->cpustat)
- goto out_free_cpuusage;
-
- return &ca->css;
-
-out_free_cpuusage:
- free_percpu(ca->cpuusage);
-out_free_ca:
- kfree(ca);
-out:
- return ERR_PTR(-ENOMEM);
-}
-
-/* destroy an existing cpu accounting group */
-static void
-cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
-
- free_percpu(ca->cpustat);
- free_percpu(ca->cpuusage);
- kfree(ca);
-}
-
-static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
-{
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- u64 data;
-
-#ifndef CONFIG_64BIT
- /*
- * Take rq->lock to make 64-bit read safe on 32-bit platforms.
- */
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
- data = *cpuusage;
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
- data = *cpuusage;
-#endif
-
- return data;
-}
-
-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
-{
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-
-#ifndef CONFIG_64BIT
- /*
- * Take rq->lock to make 64-bit write safe on 32-bit platforms.
- */
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
- *cpuusage = val;
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
- *cpuusage = val;
-#endif
-}
-
-/* return total cpu usage (in nanoseconds) of a group */
-static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
- u64 totalcpuusage = 0;
- int i;
-
- for_each_present_cpu(i)
- totalcpuusage += cpuacct_cpuusage_read(ca, i);
-
- return totalcpuusage;
-}
-
-static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
- u64 reset)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
- int err = 0;
- int i;
-
- if (reset) {
- err = -EINVAL;
- goto out;
- }
-
- for_each_present_cpu(i)
- cpuacct_cpuusage_write(ca, i, 0);
-
-out:
- return err;
-}
-
-static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
- struct seq_file *m)
-{
- struct cpuacct *ca = cgroup_ca(cgroup);
- u64 percpu;
- int i;
-
- for_each_present_cpu(i) {
- percpu = cpuacct_cpuusage_read(ca, i);
- seq_printf(m, "%llu ", (unsigned long long) percpu);
- }
- seq_printf(m, "\n");
- return 0;
-}
-
-static const char *cpuacct_stat_desc[] = {
- [CPUACCT_STAT_USER] = "user",
- [CPUACCT_STAT_SYSTEM] = "system",
-};
-
-static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
- struct cgroup_map_cb *cb)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
- int cpu;
- s64 val = 0;
-
- for_each_online_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_USER];
- val += kcpustat->cpustat[CPUTIME_NICE];
- }
- val = cputime64_to_clock_t(val);
- cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
-
- val = 0;
- for_each_online_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_SYSTEM];
- val += kcpustat->cpustat[CPUTIME_IRQ];
- val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
- }
-
- val = cputime64_to_clock_t(val);
- cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
-
- return 0;
-}
-
-static struct cftype files[] = {
- {
- .name = "usage",
- .read_u64 = cpuusage_read,
- .write_u64 = cpuusage_write,
- },
- {
- .name = "usage_percpu",
- .read_seq_string = cpuacct_percpu_seq_read,
- },
- {
- .name = "stat",
- .read_map = cpuacct_stats_show,
- },
-};
-
-static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+void dump_cpu_task(int cpu)
{
- return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
+ pr_info("Task dump for CPU %d:\n", cpu);
+ sched_show_task(cpu_curr(cpu));
}
-
-/*
- * charge this task's execution time to its accounting group.
- *
- * called with rq->lock held.
- */
-void cpuacct_charge(struct task_struct *tsk, u64 cputime)
-{
- struct cpuacct *ca;
- int cpu;
-
- if (unlikely(!cpuacct_subsys.active))
- return;
-
- cpu = task_cpu(tsk);
-
- rcu_read_lock();
-
- ca = task_ca(tsk);
-
- for (; ca; ca = parent_ca(ca)) {
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- *cpuusage += cputime;
- }
-
- rcu_read_unlock();
-}
-
-struct cgroup_subsys cpuacct_subsys = {
- .name = "cpuacct",
- .create = cpuacct_create,
- .destroy = cpuacct_destroy,
- .populate = cpuacct_populate,
- .subsys_id = cpuacct_subsys_id,
-};
-#endif /* CONFIG_CGROUP_CPUACCT */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
new file mode 100644
index 00000000000..9cf350c94ec
--- /dev/null
+++ b/kernel/sched/cpuacct.c
@@ -0,0 +1,283 @@
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/percpu.h>
+#include <linux/spinlock.h>
+#include <linux/cpumask.h>
+#include <linux/seq_file.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel_stat.h>
+#include <linux/err.h>
+
+#include "sched.h"
+
+/*
+ * CPU accounting code for task groups.
+ *
+ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
+ * (balbir@in.ibm.com).
+ */
+
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+ CPUACCT_STAT_USER, /* ... user mode */
+ CPUACCT_STAT_SYSTEM, /* ... kernel mode */
+
+ CPUACCT_STAT_NSTATS,
+};
+
+/* track cpu usage of a group of tasks and its child groups */
+struct cpuacct {
+ struct cgroup_subsys_state css;
+ /* cpuusage holds pointer to a u64-type object on every cpu */
+ u64 __percpu *cpuusage;
+ struct kernel_cpustat __percpu *cpustat;
+};
+
+static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct cpuacct, css) : NULL;
+}
+
+/* return cpu accounting group to which this task belongs */
+static inline struct cpuacct *task_ca(struct task_struct *tsk)
+{
+ return css_ca(task_css(tsk, cpuacct_cgrp_id));
+}
+
+static inline struct cpuacct *parent_ca(struct cpuacct *ca)
+{
+ return css_ca(ca->css.parent);
+}
+
+static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
+static struct cpuacct root_cpuacct = {
+ .cpustat = &kernel_cpustat,
+ .cpuusage = &root_cpuacct_cpuusage,
+};
+
+/* create a new cpu accounting group */
+static struct cgroup_subsys_state *
+cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cpuacct *ca;
+
+ if (!parent_css)
+ return &root_cpuacct.css;
+
+ ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ if (!ca)
+ goto out;
+
+ ca->cpuusage = alloc_percpu(u64);
+ if (!ca->cpuusage)
+ goto out_free_ca;
+
+ ca->cpustat = alloc_percpu(struct kernel_cpustat);
+ if (!ca->cpustat)
+ goto out_free_cpuusage;
+
+ return &ca->css;
+
+out_free_cpuusage:
+ free_percpu(ca->cpuusage);
+out_free_ca:
+ kfree(ca);
+out:
+ return ERR_PTR(-ENOMEM);
+}
+
+/* destroy an existing cpu accounting group */
+static void cpuacct_css_free(struct cgroup_subsys_state *css)
+{
+ struct cpuacct *ca = css_ca(css);
+
+ free_percpu(ca->cpustat);
+ free_percpu(ca->cpuusage);
+ kfree(ca);
+}
+
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 data;
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ data = *cpuusage;
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+ data = *cpuusage;
+#endif
+
+ return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ *cpuusage = val;
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+ *cpuusage = val;
+#endif
+}
+
+/* return total cpu usage (in nanoseconds) of a group */
+static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct cpuacct *ca = css_ca(css);
+ u64 totalcpuusage = 0;
+ int i;
+
+ for_each_present_cpu(i)
+ totalcpuusage += cpuacct_cpuusage_read(ca, i);
+
+ return totalcpuusage;
+}
+
+static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 reset)
+{
+ struct cpuacct *ca = css_ca(css);
+ int err = 0;
+ int i;
+
+ if (reset) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ for_each_present_cpu(i)
+ cpuacct_cpuusage_write(ca, i, 0);
+
+out:
+ return err;
+}
+
+static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
+{
+ struct cpuacct *ca = css_ca(seq_css(m));
+ u64 percpu;
+ int i;
+
+ for_each_present_cpu(i) {
+ percpu = cpuacct_cpuusage_read(ca, i);
+ seq_printf(m, "%llu ", (unsigned long long) percpu);
+ }
+ seq_printf(m, "\n");
+ return 0;
+}
+
+static const char * const cpuacct_stat_desc[] = {
+ [CPUACCT_STAT_USER] = "user",
+ [CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpuacct_stats_show(struct seq_file *sf, void *v)
+{
+ struct cpuacct *ca = css_ca(seq_css(sf));
+ int cpu;
+ s64 val = 0;
+
+ for_each_online_cpu(cpu) {
+ struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ val += kcpustat->cpustat[CPUTIME_USER];
+ val += kcpustat->cpustat[CPUTIME_NICE];
+ }
+ val = cputime64_to_clock_t(val);
+ seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
+
+ val = 0;
+ for_each_online_cpu(cpu) {
+ struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ val += kcpustat->cpustat[CPUTIME_SYSTEM];
+ val += kcpustat->cpustat[CPUTIME_IRQ];
+ val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+ }
+
+ val = cputime64_to_clock_t(val);
+ seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+
+ return 0;
+}
+
+static struct cftype files[] = {
+ {
+ .name = "usage",
+ .read_u64 = cpuusage_read,
+ .write_u64 = cpuusage_write,
+ },
+ {
+ .name = "usage_percpu",
+ .seq_show = cpuacct_percpu_seq_show,
+ },
+ {
+ .name = "stat",
+ .seq_show = cpuacct_stats_show,
+ },
+ { } /* terminate */
+};
+
+/*
+ * charge this task's execution time to its accounting group.
+ *
+ * called with rq->lock held.
+ */
+void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+ struct cpuacct *ca;
+ int cpu;
+
+ cpu = task_cpu(tsk);
+
+ rcu_read_lock();
+
+ ca = task_ca(tsk);
+
+ while (true) {
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ *cpuusage += cputime;
+
+ ca = parent_ca(ca);
+ if (!ca)
+ break;
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * Add user/system time to cpuacct.
+ *
+ * Note: it's the caller that updates the account of the root cgroup.
+ */
+void cpuacct_account_field(struct task_struct *p, int index, u64 val)
+{
+ struct kernel_cpustat *kcpustat;
+ struct cpuacct *ca;
+
+ rcu_read_lock();
+ ca = task_ca(p);
+ while (ca != &root_cpuacct) {
+ kcpustat = this_cpu_ptr(ca->cpustat);
+ kcpustat->cpustat[index] += val;
+ ca = parent_ca(ca);
+ }
+ rcu_read_unlock();
+}
+
+struct cgroup_subsys cpuacct_cgrp_subsys = {
+ .css_alloc = cpuacct_css_alloc,
+ .css_free = cpuacct_css_free,
+ .base_cftypes = files,
+ .early_init = 1,
+};
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
new file mode 100644
index 00000000000..ed605624a5e
--- /dev/null
+++ b/kernel/sched/cpuacct.h
@@ -0,0 +1,17 @@
+#ifdef CONFIG_CGROUP_CPUACCT
+
+extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
+
+#else
+
+static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+}
+
+static inline void
+cpuacct_account_field(struct task_struct *p, int index, u64 val)
+{
+}
+
+#endif
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
new file mode 100644
index 00000000000..bd95963dae8
--- /dev/null
+++ b/kernel/sched/cpudeadline.c
@@ -0,0 +1,229 @@
+/*
+ * kernel/sched/cpudl.c
+ *
+ * Global CPU deadline management
+ *
+ * Author: Juri Lelli <j.lelli@sssup.it>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include "cpudeadline.h"
+
+static inline int parent(int i)
+{
+ return (i - 1) >> 1;
+}
+
+static inline int left_child(int i)
+{
+ return (i << 1) + 1;
+}
+
+static inline int right_child(int i)
+{
+ return (i << 1) + 2;
+}
+
+static inline int dl_time_before(u64 a, u64 b)
+{
+ return (s64)(a - b) < 0;
+}
+
+static void cpudl_exchange(struct cpudl *cp, int a, int b)
+{
+ int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
+
+ swap(cp->elements[a].cpu, cp->elements[b].cpu);
+ swap(cp->elements[a].dl , cp->elements[b].dl );
+
+ swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
+}
+
+static void cpudl_heapify(struct cpudl *cp, int idx)
+{
+ int l, r, largest;
+
+ /* adapted from lib/prio_heap.c */
+ while(1) {
+ l = left_child(idx);
+ r = right_child(idx);
+ largest = idx;
+
+ if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
+ cp->elements[l].dl))
+ largest = l;
+ if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
+ cp->elements[r].dl))
+ largest = r;
+ if (largest == idx)
+ break;
+
+ /* Push idx down the heap one level and bump one up */
+ cpudl_exchange(cp, largest, idx);
+ idx = largest;
+ }
+}
+
+static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
+{
+ WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
+
+ if (dl_time_before(new_dl, cp->elements[idx].dl)) {
+ cp->elements[idx].dl = new_dl;
+ cpudl_heapify(cp, idx);
+ } else {
+ cp->elements[idx].dl = new_dl;
+ while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
+ cp->elements[idx].dl)) {
+ cpudl_exchange(cp, idx, parent(idx));
+ idx = parent(idx);
+ }
+ }
+}
+
+static inline int cpudl_maximum(struct cpudl *cp)
+{
+ return cp->elements[0].cpu;
+}
+
+/*
+ * cpudl_find - find the best (later-dl) CPU in the system
+ * @cp: the cpudl max-heap context
+ * @p: the task
+ * @later_mask: a mask to fill in with the selected CPUs (or NULL)
+ *
+ * Returns: int - best CPU (heap maximum if suitable)
+ */
+int cpudl_find(struct cpudl *cp, struct task_struct *p,
+ struct cpumask *later_mask)
+{
+ int best_cpu = -1;
+ const struct sched_dl_entity *dl_se = &p->dl;
+
+ if (later_mask && cpumask_and(later_mask, cp->free_cpus,
+ &p->cpus_allowed) && cpumask_and(later_mask,
+ later_mask, cpu_active_mask)) {
+ best_cpu = cpumask_any(later_mask);
+ goto out;
+ } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
+ dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
+ best_cpu = cpudl_maximum(cp);
+ if (later_mask)
+ cpumask_set_cpu(best_cpu, later_mask);
+ }
+
+out:
+ WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
+
+ return best_cpu;
+}
+
+/*
+ * cpudl_set - update the cpudl max-heap
+ * @cp: the cpudl max-heap context
+ * @cpu: the target cpu
+ * @dl: the new earliest deadline for this cpu
+ *
+ * Notes: assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
+{
+ int old_idx, new_cpu;
+ unsigned long flags;
+
+ WARN_ON(!cpu_present(cpu));
+
+ raw_spin_lock_irqsave(&cp->lock, flags);
+ old_idx = cp->elements[cpu].idx;
+ if (!is_valid) {
+ /* remove item */
+ if (old_idx == IDX_INVALID) {
+ /*
+ * Nothing to remove if old_idx was invalid.
+ * This could happen if a rq_offline_dl is
+ * called for a CPU without -dl tasks running.
+ */
+ goto out;
+ }
+ new_cpu = cp->elements[cp->size - 1].cpu;
+ cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
+ cp->elements[old_idx].cpu = new_cpu;
+ cp->size--;
+ cp->elements[new_cpu].idx = old_idx;
+ cp->elements[cpu].idx = IDX_INVALID;
+ while (old_idx > 0 && dl_time_before(
+ cp->elements[parent(old_idx)].dl,
+ cp->elements[old_idx].dl)) {
+ cpudl_exchange(cp, old_idx, parent(old_idx));
+ old_idx = parent(old_idx);
+ }
+ cpumask_set_cpu(cpu, cp->free_cpus);
+ cpudl_heapify(cp, old_idx);
+
+ goto out;
+ }
+
+ if (old_idx == IDX_INVALID) {
+ cp->size++;
+ cp->elements[cp->size - 1].dl = 0;
+ cp->elements[cp->size - 1].cpu = cpu;
+ cp->elements[cpu].idx = cp->size - 1;
+ cpudl_change_key(cp, cp->size - 1, dl);
+ cpumask_clear_cpu(cpu, cp->free_cpus);
+ } else {
+ cpudl_change_key(cp, old_idx, dl);
+ }
+
+out:
+ raw_spin_unlock_irqrestore(&cp->lock, flags);
+}
+
+/*
+ * cpudl_init - initialize the cpudl structure
+ * @cp: the cpudl max-heap context
+ */
+int cpudl_init(struct cpudl *cp)
+{
+ int i;
+
+ memset(cp, 0, sizeof(*cp));
+ raw_spin_lock_init(&cp->lock);
+ cp->size = 0;
+
+ cp->elements = kcalloc(nr_cpu_ids,
+ sizeof(struct cpudl_item),
+ GFP_KERNEL);
+ if (!cp->elements)
+ return -ENOMEM;
+
+ if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
+ kfree(cp->elements);
+ return -ENOMEM;
+ }
+
+ for_each_possible_cpu(i)
+ cp->elements[i].idx = IDX_INVALID;
+
+ cpumask_setall(cp->free_cpus);
+
+ return 0;
+}
+
+/*
+ * cpudl_cleanup - clean up the cpudl structure
+ * @cp: the cpudl max-heap context
+ */
+void cpudl_cleanup(struct cpudl *cp)
+{
+ free_cpumask_var(cp->free_cpus);
+ kfree(cp->elements);
+}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
new file mode 100644
index 00000000000..538c9796ad4
--- /dev/null
+++ b/kernel/sched/cpudeadline.h
@@ -0,0 +1,33 @@
+#ifndef _LINUX_CPUDL_H
+#define _LINUX_CPUDL_H
+
+#include <linux/sched.h>
+
+#define IDX_INVALID -1
+
+struct cpudl_item {
+ u64 dl;
+ int cpu;
+ int idx;
+};
+
+struct cpudl {
+ raw_spinlock_t lock;
+ int size;
+ cpumask_var_t free_cpus;
+ struct cpudl_item *elements;
+};
+
+
+#ifdef CONFIG_SMP
+int cpudl_find(struct cpudl *cp, struct task_struct *p,
+ struct cpumask *later_mask);
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
+int cpudl_init(struct cpudl *cp);
+void cpudl_cleanup(struct cpudl *cp);
+#else
+#define cpudl_set(cp, cpu, dl) do { } while (0)
+#define cpudl_init() do { } while (0)
+#endif /* CONFIG_SMP */
+
+#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index d72586fdf66..981fcd7dc39 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -28,6 +28,9 @@
*/
#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/slab.h>
#include "cpupri.h"
/* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -60,16 +63,15 @@ static int convert_prio(int prio)
* any discrepancies created by racing against the uncertainty of the current
* priority configuration.
*
- * Returns: (int)bool - CPUs were found
+ * Return: (int)bool - CPUs were found
*/
int cpupri_find(struct cpupri *cp, struct task_struct *p,
struct cpumask *lowest_mask)
{
- int idx = 0;
- int task_pri = convert_prio(p->prio);
+ int idx = 0;
+ int task_pri = convert_prio(p->prio);
- if (task_pri >= MAX_RT_PRIO)
- return 0;
+ BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
for (idx = 0; idx < task_pri; idx++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
@@ -137,9 +139,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
*/
void cpupri_set(struct cpupri *cp, int cpu, int newpri)
{
- int *currpri = &cp->cpu_to_pri[cpu];
- int oldpri = *currpri;
- int do_mb = 0;
+ int *currpri = &cp->cpu_to_pri[cpu];
+ int oldpri = *currpri;
+ int do_mb = 0;
newpri = convert_prio(newpri);
@@ -163,7 +165,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
* do a write memory barrier, and then update the count, to
* make sure the vector is visible when count is set.
*/
- smp_mb__before_atomic_inc();
+ smp_mb__before_atomic();
atomic_inc(&(vec)->count);
do_mb = 1;
}
@@ -183,14 +185,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
* the new priority vec.
*/
if (do_mb)
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
/*
* When removing from the vector, we decrement the counter first
* do a memory barrier and then clear the mask.
*/
atomic_dec(&(vec)->count);
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
cpumask_clear_cpu(cpu, vec->mask);
}
@@ -201,7 +203,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
* cpupri_init - initialize the cpupri structure
* @cp: The cpupri context
*
- * Returns: -ENOMEM if memory fails.
+ * Return: -ENOMEM on memory allocation failure.
*/
int cpupri_init(struct cpupri *cp)
{
@@ -217,8 +219,13 @@ int cpupri_init(struct cpupri *cp)
goto cleanup;
}
+ cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
+ if (!cp->cpu_to_pri)
+ goto cleanup;
+
for_each_possible_cpu(i)
cp->cpu_to_pri[i] = CPUPRI_INVALID;
+
return 0;
cleanup:
@@ -235,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp)
{
int i;
+ kfree(cp->cpu_to_pri);
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
free_cpumask_var(cp->pri_to_cpu[i].mask);
}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index f6d75617349..6b033347fdf 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -17,7 +17,7 @@ struct cpupri_vec {
struct cpupri {
struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
- int cpu_to_pri[NR_CPUS];
+ int *cpu_to_pri;
};
#ifdef CONFIG_SMP
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
new file mode 100644
index 00000000000..72fdf06ef86
--- /dev/null
+++ b/kernel/sched/cputime.c
@@ -0,0 +1,838 @@
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/tsacct_kern.h>
+#include <linux/kernel_stat.h>
+#include <linux/static_key.h>
+#include <linux/context_tracking.h>
+#include "sched.h"
+
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in vtime_account, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/vtime_account on this CPU. We would either get old
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
+ */
+DEFINE_PER_CPU(u64, cpu_hardirq_time);
+DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 0;
+}
+
+#ifndef CONFIG_64BIT
+DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+#endif /* CONFIG_64BIT */
+
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
+void irqtime_account_irq(struct task_struct *curr)
+{
+ unsigned long flags;
+ s64 delta;
+ int cpu;
+
+ if (!sched_clock_irqtime)
+ return;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+ __this_cpu_add(irq_start_time, delta);
+
+ irq_time_write_begin();
+ /*
+ * We do not account for softirq time from ksoftirqd here.
+ * We want to continue accounting softirq time to ksoftirqd thread
+ * in that case, so as not to confuse scheduler with a special task
+ * that do not consume any time, but still wants to run.
+ */
+ if (hardirq_count())
+ __this_cpu_add(cpu_hardirq_time, delta);
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ __this_cpu_add(cpu_softirq_time, delta);
+
+ irq_time_write_end();
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(irqtime_account_irq);
+
+static int irqtime_account_hi_update(void)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ unsigned long flags;
+ u64 latest_ns;
+ int ret = 0;
+
+ local_irq_save(flags);
+ latest_ns = this_cpu_read(cpu_hardirq_time);
+ if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
+ ret = 1;
+ local_irq_restore(flags);
+ return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ unsigned long flags;
+ u64 latest_ns;
+ int ret = 0;
+
+ local_irq_save(flags);
+ latest_ns = this_cpu_read(cpu_softirq_time);
+ if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
+ ret = 1;
+ local_irq_restore(flags);
+ return ret;
+}
+
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#define sched_clock_irqtime (0)
+
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
+
+static inline void task_group_account_field(struct task_struct *p, int index,
+ u64 tmp)
+{
+ /*
+ * Since all updates are sure to touch the root cgroup, we
+ * get ourselves ahead and touch it first. If the root cgroup
+ * is the only cgroup, then nothing else should be necessary.
+ *
+ */
+ __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
+
+ cpuacct_account_field(p, index, tmp);
+}
+
+/*
+ * Account user cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in user space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+void account_user_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled)
+{
+ int index;
+
+ /* Add user time to process. */
+ p->utime += cputime;
+ p->utimescaled += cputime_scaled;
+ account_group_user_time(p, cputime);
+
+ index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+
+ /* Add user time to cpustat. */
+ task_group_account_field(p, index, (__force u64) cputime);
+
+ /* Account for user time used */
+ acct_account_cputime(p);
+}
+
+/*
+ * Account guest cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in virtual machine since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+static void account_guest_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ /* Add guest time to process. */
+ p->utime += cputime;
+ p->utimescaled += cputime_scaled;
+ account_group_user_time(p, cputime);
+ p->gtime += cputime;
+
+ /* Add guest time to cpustat. */
+ if (task_nice(p) > 0) {
+ cpustat[CPUTIME_NICE] += (__force u64) cputime;
+ cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
+ } else {
+ cpustat[CPUTIME_USER] += (__force u64) cputime;
+ cpustat[CPUTIME_GUEST] += (__force u64) cputime;
+ }
+}
+
+/*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled, int index)
+{
+ /* Add system time to process. */
+ p->stime += cputime;
+ p->stimescaled += cputime_scaled;
+ account_group_system_time(p, cputime);
+
+ /* Add system time to cpustat. */
+ task_group_account_field(p, index, (__force u64) cputime);
+
+ /* Account for system time used */
+ acct_account_cputime(p);
+}
+
+/*
+ * Account system cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+void account_system_time(struct task_struct *p, int hardirq_offset,
+ cputime_t cputime, cputime_t cputime_scaled)
+{
+ int index;
+
+ if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
+ account_guest_time(p, cputime, cputime_scaled);
+ return;
+ }
+
+ if (hardirq_count() - hardirq_offset)
+ index = CPUTIME_IRQ;
+ else if (in_serving_softirq())
+ index = CPUTIME_SOFTIRQ;
+ else
+ index = CPUTIME_SYSTEM;
+
+ __account_system_time(p, cputime, cputime_scaled, index);
+}
+
+/*
+ * Account for involuntary wait time.
+ * @cputime: the cpu time spent in involuntary wait
+ */
+void account_steal_time(cputime_t cputime)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ cpustat[CPUTIME_STEAL] += (__force u64) cputime;
+}
+
+/*
+ * Account for idle time.
+ * @cputime: the cpu time spent in idle wait
+ */
+void account_idle_time(cputime_t cputime)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ struct rq *rq = this_rq();
+
+ if (atomic_read(&rq->nr_iowait) > 0)
+ cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
+ else
+ cpustat[CPUTIME_IDLE] += (__force u64) cputime;
+}
+
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+ if (static_key_false(&paravirt_steal_enabled)) {
+ u64 steal;
+ cputime_t steal_ct;
+
+ steal = paravirt_steal_clock(smp_processor_id());
+ steal -= this_rq()->prev_steal_time;
+
+ /*
+ * cputime_t may be less precise than nsecs (eg: if it's
+ * based on jiffies). Lets cast the result to cputime
+ * granularity and account the rest on the next rounds.
+ */
+ steal_ct = nsecs_to_cputime(steal);
+ this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
+
+ account_steal_time(steal_ct);
+ return steal_ct;
+ }
+#endif
+ return false;
+}
+
+/*
+ * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
+ * tasks (sum on group iteration) belonging to @tsk's group.
+ */
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+{
+ struct signal_struct *sig = tsk->signal;
+ cputime_t utime, stime;
+ struct task_struct *t;
+
+ times->utime = sig->utime;
+ times->stime = sig->stime;
+ times->sum_exec_runtime = sig->sum_sched_runtime;
+
+ rcu_read_lock();
+ /* make sure we can trust tsk->thread_group list */
+ if (!likely(pid_alive(tsk)))
+ goto out;
+
+ t = tsk;
+ do {
+ task_cputime(t, &utime, &stime);
+ times->utime += utime;
+ times->stime += stime;
+ times->sum_exec_runtime += task_sched_runtime(t);
+ } while_each_thread(tsk, t);
+out:
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ * - check for guest_time
+ * - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq, int ticks)
+{
+ cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
+ u64 cputime = (__force u64) cputime_one_jiffy;
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ if (steal_account_process_tick())
+ return;
+
+ cputime *= ticks;
+ scaled *= ticks;
+
+ if (irqtime_account_hi_update()) {
+ cpustat[CPUTIME_IRQ] += cputime;
+ } else if (irqtime_account_si_update()) {
+ cpustat[CPUTIME_SOFTIRQ] += cputime;
+ } else if (this_cpu_ksoftirqd() == p) {
+ /*
+ * ksoftirqd time do not get accounted in cpu_softirq_time.
+ * So, we have to handle it separately here.
+ * Also, p->stime needs to be updated for ksoftirqd.
+ */
+ __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
+ } else if (user_tick) {
+ account_user_time(p, cputime, scaled);
+ } else if (p == rq->idle) {
+ account_idle_time(cputime);
+ } else if (p->flags & PF_VCPU) { /* System time or guest time */
+ account_guest_time(p, cputime, scaled);
+ } else {
+ __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
+ }
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+ struct rq *rq = this_rq();
+
+ irqtime_account_process_tick(current, 0, rq, ticks);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static inline void irqtime_account_idle_ticks(int ticks) {}
+static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq, int nr_ticks) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+
+#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
+void vtime_common_task_switch(struct task_struct *prev)
+{
+ if (is_idle_task(prev))
+ vtime_account_idle(prev);
+ else
+ vtime_account_system(prev);
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ vtime_account_user(prev);
+#endif
+ arch_vtime_task_switch(prev);
+}
+#endif
+
+/*
+ * Archs that account the whole time spent in the idle task
+ * (outside irq) as idle time can rely on this and just implement
+ * vtime_account_system() and vtime_account_idle(). Archs that
+ * have other meaning of the idle time (s390 only includes the
+ * time spent by the CPU when it's in low power mode) must override
+ * vtime_account().
+ */
+#ifndef __ARCH_HAS_VTIME_ACCOUNT
+void vtime_common_account_irq_enter(struct task_struct *tsk)
+{
+ if (!in_interrupt()) {
+ /*
+ * If we interrupted user, context_tracking_in_user()
+ * is 1 because the context tracking don't hook
+ * on irq entry/exit. This way we know if
+ * we need to flush user time on kernel entry.
+ */
+ if (context_tracking_in_user()) {
+ vtime_account_user(tsk);
+ return;
+ }
+
+ if (is_idle_task(tsk)) {
+ vtime_account_idle(tsk);
+ return;
+ }
+ }
+ vtime_account_system(tsk);
+}
+EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
+#endif /* __ARCH_HAS_VTIME_ACCOUNT */
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ *ut = p->utime;
+ *st = p->stime;
+}
+
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct task_cputime cputime;
+
+ thread_group_cputime(p, &cputime);
+
+ *ut = cputime.utime;
+ *st = cputime.stime;
+}
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+/*
+ * Account a single tick of cpu time.
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: indicates if the tick is a user or a system tick
+ */
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ struct rq *rq = this_rq();
+
+ if (vtime_accounting_enabled())
+ return;
+
+ if (sched_clock_irqtime) {
+ irqtime_account_process_tick(p, user_tick, rq, 1);
+ return;
+ }
+
+ if (steal_account_process_tick())
+ return;
+
+ if (user_tick)
+ account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
+ account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
+ one_jiffy_scaled);
+ else
+ account_idle_time(cputime_one_jiffy);
+}
+
+/*
+ * Account multiple ticks of steal time.
+ * @p: the process from which the cpu time has been stolen
+ * @ticks: number of stolen ticks
+ */
+void account_steal_ticks(unsigned long ticks)
+{
+ account_steal_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Account multiple ticks of idle time.
+ * @ticks: number of stolen ticks
+ */
+void account_idle_ticks(unsigned long ticks)
+{
+
+ if (sched_clock_irqtime) {
+ irqtime_account_idle_ticks(ticks);
+ return;
+ }
+
+ account_idle_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Perform (stime * rtime) / total, but avoid multiplication overflow by
+ * loosing precision when the numbers are big.
+ */
+static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
+{
+ u64 scaled;
+
+ for (;;) {
+ /* Make sure "rtime" is the bigger of stime/rtime */
+ if (stime > rtime)
+ swap(rtime, stime);
+
+ /* Make sure 'total' fits in 32 bits */
+ if (total >> 32)
+ goto drop_precision;
+
+ /* Does rtime (and thus stime) fit in 32 bits? */
+ if (!(rtime >> 32))
+ break;
+
+ /* Can we just balance rtime/stime rather than dropping bits? */
+ if (stime >> 31)
+ goto drop_precision;
+
+ /* We can grow stime and shrink rtime and try to make them both fit */
+ stime <<= 1;
+ rtime >>= 1;
+ continue;
+
+drop_precision:
+ /* We drop from rtime, it has more bits than stime */
+ rtime >>= 1;
+ total >>= 1;
+ }
+
+ /*
+ * Make sure gcc understands that this is a 32x32->64 multiply,
+ * followed by a 64/32->64 divide.
+ */
+ scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
+ return (__force cputime_t) scaled;
+}
+
+/*
+ * Adjust tick based cputime random precision against scheduler
+ * runtime accounting.
+ */
+static void cputime_adjust(struct task_cputime *curr,
+ struct cputime *prev,
+ cputime_t *ut, cputime_t *st)
+{
+ cputime_t rtime, stime, utime;
+
+ /*
+ * Tick based cputime accounting depend on random scheduling
+ * timeslices of a task to be interrupted or not by the timer.
+ * Depending on these circumstances, the number of these interrupts
+ * may be over or under-optimistic, matching the real user and system
+ * cputime with a variable precision.
+ *
+ * Fix this by scaling these tick based values against the total
+ * runtime accounted by the CFS scheduler.
+ */
+ rtime = nsecs_to_cputime(curr->sum_exec_runtime);
+
+ /*
+ * Update userspace visible utime/stime values only if actual execution
+ * time is bigger than already exported. Note that can happen, that we
+ * provided bigger values due to scaling inaccuracy on big numbers.
+ */
+ if (prev->stime + prev->utime >= rtime)
+ goto out;
+
+ stime = curr->stime;
+ utime = curr->utime;
+
+ if (utime == 0) {
+ stime = rtime;
+ } else if (stime == 0) {
+ utime = rtime;
+ } else {
+ cputime_t total = stime + utime;
+
+ stime = scale_stime((__force u64)stime,
+ (__force u64)rtime, (__force u64)total);
+ utime = rtime - stime;
+ }
+
+ /*
+ * If the tick based count grows faster than the scheduler one,
+ * the result of the scaling may go backward.
+ * Let's enforce monotonicity.
+ */
+ prev->stime = max(prev->stime, stime);
+ prev->utime = max(prev->utime, utime);
+
+out:
+ *ut = prev->utime;
+ *st = prev->stime;
+}
+
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct task_cputime cputime = {
+ .sum_exec_runtime = p->se.sum_exec_runtime,
+ };
+
+ task_cputime(p, &cputime.utime, &cputime.stime);
+ cputime_adjust(&cputime, &p->prev_cputime, ut, st);
+}
+
+/*
+ * Must be called with siglock held.
+ */
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct task_cputime cputime;
+
+ thread_group_cputime(p, &cputime);
+ cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
+}
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+static unsigned long long vtime_delta(struct task_struct *tsk)
+{
+ unsigned long long clock;
+
+ clock = local_clock();
+ if (clock < tsk->vtime_snap)
+ return 0;
+
+ return clock - tsk->vtime_snap;
+}
+
+static cputime_t get_vtime_delta(struct task_struct *tsk)
+{
+ unsigned long long delta = vtime_delta(tsk);
+
+ WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
+ tsk->vtime_snap += delta;
+
+ /* CHECKME: always safe to convert nsecs to cputime? */
+ return nsecs_to_cputime(delta);
+}
+
+static void __vtime_account_system(struct task_struct *tsk)
+{
+ cputime_t delta_cpu = get_vtime_delta(tsk);
+
+ account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
+}
+
+void vtime_account_system(struct task_struct *tsk)
+{
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_gen_account_irq_exit(struct task_struct *tsk)
+{
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ if (context_tracking_in_user())
+ tsk->vtime_snap_whence = VTIME_USER;
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_account_user(struct task_struct *tsk)
+{
+ cputime_t delta_cpu;
+
+ write_seqlock(&tsk->vtime_seqlock);
+ delta_cpu = get_vtime_delta(tsk);
+ tsk->vtime_snap_whence = VTIME_SYS;
+ account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_user_enter(struct task_struct *tsk)
+{
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ tsk->vtime_snap_whence = VTIME_USER;
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_guest_enter(struct task_struct *tsk)
+{
+ /*
+ * The flags must be updated under the lock with
+ * the vtime_snap flush and update.
+ * That enforces a right ordering and update sequence
+ * synchronization against the reader (task_gtime())
+ * that can thus safely catch up with a tickless delta.
+ */
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ current->flags |= PF_VCPU;
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+EXPORT_SYMBOL_GPL(vtime_guest_enter);
+
+void vtime_guest_exit(struct task_struct *tsk)
+{
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ current->flags &= ~PF_VCPU;
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+EXPORT_SYMBOL_GPL(vtime_guest_exit);
+
+void vtime_account_idle(struct task_struct *tsk)
+{
+ cputime_t delta_cpu = get_vtime_delta(tsk);
+
+ account_idle_time(delta_cpu);
+}
+
+void arch_vtime_task_switch(struct task_struct *prev)
+{
+ write_seqlock(&prev->vtime_seqlock);
+ prev->vtime_snap_whence = VTIME_SLEEPING;
+ write_sequnlock(&prev->vtime_seqlock);
+
+ write_seqlock(&current->vtime_seqlock);
+ current->vtime_snap_whence = VTIME_SYS;
+ current->vtime_snap = sched_clock_cpu(smp_processor_id());
+ write_sequnlock(&current->vtime_seqlock);
+}
+
+void vtime_init_idle(struct task_struct *t, int cpu)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&t->vtime_seqlock, flags);
+ t->vtime_snap_whence = VTIME_SYS;
+ t->vtime_snap = sched_clock_cpu(cpu);
+ write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
+}
+
+cputime_t task_gtime(struct task_struct *t)
+{
+ unsigned int seq;
+ cputime_t gtime;
+
+ do {
+ seq = read_seqbegin(&t->vtime_seqlock);
+
+ gtime = t->gtime;
+ if (t->flags & PF_VCPU)
+ gtime += vtime_delta(t);
+
+ } while (read_seqretry(&t->vtime_seqlock, seq));
+
+ return gtime;
+}
+
+/*
+ * Fetch cputime raw values from fields of task_struct and
+ * add up the pending nohz execution time since the last
+ * cputime snapshot.
+ */
+static void
+fetch_task_cputime(struct task_struct *t,
+ cputime_t *u_dst, cputime_t *s_dst,
+ cputime_t *u_src, cputime_t *s_src,
+ cputime_t *udelta, cputime_t *sdelta)
+{
+ unsigned int seq;
+ unsigned long long delta;
+
+ do {
+ *udelta = 0;
+ *sdelta = 0;
+
+ seq = read_seqbegin(&t->vtime_seqlock);
+
+ if (u_dst)
+ *u_dst = *u_src;
+ if (s_dst)
+ *s_dst = *s_src;
+
+ /* Task is sleeping, nothing to add */
+ if (t->vtime_snap_whence == VTIME_SLEEPING ||
+ is_idle_task(t))
+ continue;
+
+ delta = vtime_delta(t);
+
+ /*
+ * Task runs either in user or kernel space, add pending nohz time to
+ * the right place.
+ */
+ if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
+ *udelta = delta;
+ } else {
+ if (t->vtime_snap_whence == VTIME_SYS)
+ *sdelta = delta;
+ }
+ } while (read_seqretry(&t->vtime_seqlock, seq));
+}
+
+
+void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
+{
+ cputime_t udelta, sdelta;
+
+ fetch_task_cputime(t, utime, stime, &t->utime,
+ &t->stime, &udelta, &sdelta);
+ if (utime)
+ *utime += udelta;
+ if (stime)
+ *stime += sdelta;
+}
+
+void task_cputime_scaled(struct task_struct *t,
+ cputime_t *utimescaled, cputime_t *stimescaled)
+{
+ cputime_t udelta, sdelta;
+
+ fetch_task_cputime(t, utimescaled, stimescaled,
+ &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
+ if (utimescaled)
+ *utimescaled += cputime_to_scaled(udelta);
+ if (stimescaled)
+ *stimescaled += cputime_to_scaled(sdelta);
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
new file mode 100644
index 00000000000..fc4f98b1258
--- /dev/null
+++ b/kernel/sched/deadline.c
@@ -0,0 +1,1676 @@
+/*
+ * Deadline Scheduling Class (SCHED_DEADLINE)
+ *
+ * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS).
+ *
+ * Tasks that periodically executes their instances for less than their
+ * runtime won't miss any of their deadlines.
+ * Tasks that are not periodic or sporadic or that tries to execute more
+ * than their reserved bandwidth will be slowed down (and may potentially
+ * miss some of their deadlines), and won't affect any other task.
+ *
+ * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>,
+ * Juri Lelli <juri.lelli@gmail.com>,
+ * Michael Trimarchi <michael@amarulasolutions.com>,
+ * Fabio Checconi <fchecconi@gmail.com>
+ */
+#include "sched.h"
+
+#include <linux/slab.h>
+
+struct dl_bandwidth def_dl_bandwidth;
+
+static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
+{
+ return container_of(dl_se, struct task_struct, dl);
+}
+
+static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
+{
+ return container_of(dl_rq, struct rq, dl);
+}
+
+static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->dl;
+}
+
+static inline int on_dl_rq(struct sched_dl_entity *dl_se)
+{
+ return !RB_EMPTY_NODE(&dl_se->rb_node);
+}
+
+static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ return dl_rq->rb_leftmost == &dl_se->rb_node;
+}
+
+void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
+{
+ raw_spin_lock_init(&dl_b->dl_runtime_lock);
+ dl_b->dl_period = period;
+ dl_b->dl_runtime = runtime;
+}
+
+void init_dl_bw(struct dl_bw *dl_b)
+{
+ raw_spin_lock_init(&dl_b->lock);
+ raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
+ if (global_rt_runtime() == RUNTIME_INF)
+ dl_b->bw = -1;
+ else
+ dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
+ raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
+ dl_b->total_bw = 0;
+}
+
+void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
+{
+ dl_rq->rb_root = RB_ROOT;
+
+#ifdef CONFIG_SMP
+ /* zero means no -deadline tasks */
+ dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
+
+ dl_rq->dl_nr_migratory = 0;
+ dl_rq->overloaded = 0;
+ dl_rq->pushable_dl_tasks_root = RB_ROOT;
+#else
+ init_dl_bw(&dl_rq->dl_bw);
+#endif
+}
+
+#ifdef CONFIG_SMP
+
+static inline int dl_overloaded(struct rq *rq)
+{
+ return atomic_read(&rq->rd->dlo_count);
+}
+
+static inline void dl_set_overload(struct rq *rq)
+{
+ if (!rq->online)
+ return;
+
+ cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask);
+ /*
+ * Must be visible before the overload count is
+ * set (as in sched_rt.c).
+ *
+ * Matched by the barrier in pull_dl_task().
+ */
+ smp_wmb();
+ atomic_inc(&rq->rd->dlo_count);
+}
+
+static inline void dl_clear_overload(struct rq *rq)
+{
+ if (!rq->online)
+ return;
+
+ atomic_dec(&rq->rd->dlo_count);
+ cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
+}
+
+static void update_dl_migration(struct dl_rq *dl_rq)
+{
+ if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
+ if (!dl_rq->overloaded) {
+ dl_set_overload(rq_of_dl_rq(dl_rq));
+ dl_rq->overloaded = 1;
+ }
+ } else if (dl_rq->overloaded) {
+ dl_clear_overload(rq_of_dl_rq(dl_rq));
+ dl_rq->overloaded = 0;
+ }
+}
+
+static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+
+ if (p->nr_cpus_allowed > 1)
+ dl_rq->dl_nr_migratory++;
+
+ update_dl_migration(dl_rq);
+}
+
+static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+
+ if (p->nr_cpus_allowed > 1)
+ dl_rq->dl_nr_migratory--;
+
+ update_dl_migration(dl_rq);
+}
+
+/*
+ * The list of pushable -deadline task is not a plist, like in
+ * sched_rt.c, it is an rb-tree with tasks ordered by deadline.
+ */
+static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
+{
+ struct dl_rq *dl_rq = &rq->dl;
+ struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct task_struct *entry;
+ int leftmost = 1;
+
+ BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
+
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct task_struct,
+ pushable_dl_tasks);
+ if (dl_entity_preempt(&p->dl, &entry->dl))
+ link = &parent->rb_left;
+ else {
+ link = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ if (leftmost)
+ dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
+
+ rb_link_node(&p->pushable_dl_tasks, parent, link);
+ rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
+}
+
+static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
+{
+ struct dl_rq *dl_rq = &rq->dl;
+
+ if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
+ return;
+
+ if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) {
+ struct rb_node *next_node;
+
+ next_node = rb_next(&p->pushable_dl_tasks);
+ dl_rq->pushable_dl_tasks_leftmost = next_node;
+ }
+
+ rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
+ RB_CLEAR_NODE(&p->pushable_dl_tasks);
+}
+
+static inline int has_pushable_dl_tasks(struct rq *rq)
+{
+ return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root);
+}
+
+static int push_dl_task(struct rq *rq);
+
+static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
+{
+ return dl_task(prev);
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+ rq->post_schedule = has_pushable_dl_tasks(rq);
+}
+
+#else
+
+static inline
+void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline
+void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline
+void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+}
+
+static inline
+void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+}
+
+static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
+{
+ return false;
+}
+
+static inline int pull_dl_task(struct rq *rq)
+{
+ return 0;
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+}
+#endif /* CONFIG_SMP */
+
+static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
+static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
+static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
+ int flags);
+
+/*
+ * We are being explicitly informed that a new instance is starting,
+ * and this means that:
+ * - the absolute deadline of the entity has to be placed at
+ * current time + relative deadline;
+ * - the runtime of the entity has to be set to the maximum value.
+ *
+ * The capability of specifying such event is useful whenever a -deadline
+ * entity wants to (try to!) synchronize its behaviour with the scheduler's
+ * one, and to (try to!) reconcile itself with its own scheduling
+ * parameters.
+ */
+static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ WARN_ON(!dl_se->dl_new || dl_se->dl_throttled);
+
+ /*
+ * We use the regular wall clock time to set deadlines in the
+ * future; in fact, we must consider execution overheads (time
+ * spent on hardirq context, etc.).
+ */
+ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
+ dl_se->runtime = pi_se->dl_runtime;
+ dl_se->dl_new = 0;
+}
+
+/*
+ * Pure Earliest Deadline First (EDF) scheduling does not deal with the
+ * possibility of a entity lasting more than what it declared, and thus
+ * exhausting its runtime.
+ *
+ * Here we are interested in making runtime overrun possible, but we do
+ * not want a entity which is misbehaving to affect the scheduling of all
+ * other entities.
+ * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS)
+ * is used, in order to confine each entity within its own bandwidth.
+ *
+ * This function deals exactly with that, and ensures that when the runtime
+ * of a entity is replenished, its deadline is also postponed. That ensures
+ * the overrunning entity can't interfere with other entity in the system and
+ * can't make them miss their deadlines. Reasons why this kind of overruns
+ * could happen are, typically, a entity voluntarily trying to overcome its
+ * runtime, or it just underestimated it during sched_setscheduler_ex().
+ */
+static void replenish_dl_entity(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ BUG_ON(pi_se->dl_runtime <= 0);
+
+ /*
+ * This could be the case for a !-dl task that is boosted.
+ * Just go with full inherited parameters.
+ */
+ if (dl_se->dl_deadline == 0) {
+ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
+ dl_se->runtime = pi_se->dl_runtime;
+ }
+
+ /*
+ * We keep moving the deadline away until we get some
+ * available runtime for the entity. This ensures correct
+ * handling of situations where the runtime overrun is
+ * arbitrary large.
+ */
+ while (dl_se->runtime <= 0) {
+ dl_se->deadline += pi_se->dl_period;
+ dl_se->runtime += pi_se->dl_runtime;
+ }
+
+ /*
+ * At this point, the deadline really should be "in
+ * the future" with respect to rq->clock. If it's
+ * not, we are, for some reason, lagging too much!
+ * Anyway, after having warn userspace abut that,
+ * we still try to keep the things running by
+ * resetting the deadline and the budget of the
+ * entity.
+ */
+ if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
+ printk_deferred_once("sched: DL replenish lagged to much\n");
+ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
+ dl_se->runtime = pi_se->dl_runtime;
+ }
+}
+
+/*
+ * Here we check if --at time t-- an entity (which is probably being
+ * [re]activated or, in general, enqueued) can use its remaining runtime
+ * and its current deadline _without_ exceeding the bandwidth it is
+ * assigned (function returns true if it can't). We are in fact applying
+ * one of the CBS rules: when a task wakes up, if the residual runtime
+ * over residual deadline fits within the allocated bandwidth, then we
+ * can keep the current (absolute) deadline and residual budget without
+ * disrupting the schedulability of the system. Otherwise, we should
+ * refill the runtime and set the deadline a period in the future,
+ * because keeping the current (absolute) deadline of the task would
+ * result in breaking guarantees promised to other tasks (refer to
+ * Documentation/scheduler/sched-deadline.txt for more informations).
+ *
+ * This function returns true if:
+ *
+ * runtime / (deadline - t) > dl_runtime / dl_period ,
+ *
+ * IOW we can't recycle current parameters.
+ *
+ * Notice that the bandwidth check is done against the period. For
+ * task with deadline equal to period this is the same of using
+ * dl_deadline instead of dl_period in the equation above.
+ */
+static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se, u64 t)
+{
+ u64 left, right;
+
+ /*
+ * left and right are the two sides of the equation above,
+ * after a bit of shuffling to use multiplications instead
+ * of divisions.
+ *
+ * Note that none of the time values involved in the two
+ * multiplications are absolute: dl_deadline and dl_runtime
+ * are the relative deadline and the maximum runtime of each
+ * instance, runtime is the runtime left for the last instance
+ * and (deadline - t), since t is rq->clock, is the time left
+ * to the (absolute) deadline. Even if overflowing the u64 type
+ * is very unlikely to occur in both cases, here we scale down
+ * as we want to avoid that risk at all. Scaling down by 10
+ * means that we reduce granularity to 1us. We are fine with it,
+ * since this is only a true/false check and, anyway, thinking
+ * of anything below microseconds resolution is actually fiction
+ * (but still we want to give the user that illusion >;).
+ */
+ left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+ right = ((dl_se->deadline - t) >> DL_SCALE) *
+ (pi_se->dl_runtime >> DL_SCALE);
+
+ return dl_time_before(right, left);
+}
+
+/*
+ * When a -deadline entity is queued back on the runqueue, its runtime and
+ * deadline might need updating.
+ *
+ * The policy here is that we update the deadline of the entity only if:
+ * - the current deadline is in the past,
+ * - using the remaining runtime with the current deadline would make
+ * the entity exceed its bandwidth.
+ */
+static void update_dl_entity(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ /*
+ * The arrival of a new instance needs special treatment, i.e.,
+ * the actual scheduling parameters have to be "renewed".
+ */
+ if (dl_se->dl_new) {
+ setup_new_dl_entity(dl_se, pi_se);
+ return;
+ }
+
+ if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
+ dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
+ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
+ dl_se->runtime = pi_se->dl_runtime;
+ }
+}
+
+/*
+ * If the entity depleted all its runtime, and if we want it to sleep
+ * while waiting for some new execution time to become available, we
+ * set the bandwidth enforcement timer to the replenishment instant
+ * and try to activate it.
+ *
+ * Notice that it is important for the caller to know if the timer
+ * actually started or not (i.e., the replenishment instant is in
+ * the future or in the past).
+ */
+static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+ ktime_t now, act;
+ ktime_t soft, hard;
+ unsigned long range;
+ s64 delta;
+
+ if (boosted)
+ return 0;
+ /*
+ * We want the timer to fire at the deadline, but considering
+ * that it is actually coming from rq->clock and not from
+ * hrtimer's time base reading.
+ */
+ act = ns_to_ktime(dl_se->deadline);
+ now = hrtimer_cb_get_time(&dl_se->dl_timer);
+ delta = ktime_to_ns(now) - rq_clock(rq);
+ act = ktime_add_ns(act, delta);
+
+ /*
+ * If the expiry time already passed, e.g., because the value
+ * chosen as the deadline is too small, don't even try to
+ * start the timer in the past!
+ */
+ if (ktime_us_delta(act, now) < 0)
+ return 0;
+
+ hrtimer_set_expires(&dl_se->dl_timer, act);
+
+ soft = hrtimer_get_softexpires(&dl_se->dl_timer);
+ hard = hrtimer_get_expires(&dl_se->dl_timer);
+ range = ktime_to_ns(ktime_sub(hard, soft));
+ __hrtimer_start_range_ns(&dl_se->dl_timer, soft,
+ range, HRTIMER_MODE_ABS, 0);
+
+ return hrtimer_active(&dl_se->dl_timer);
+}
+
+/*
+ * This is the bandwidth enforcement timer callback. If here, we know
+ * a task is not on its dl_rq, since the fact that the timer was running
+ * means the task is throttled and needs a runtime replenishment.
+ *
+ * However, what we actually do depends on the fact the task is active,
+ * (it is on its rq) or has been removed from there by a call to
+ * dequeue_task_dl(). In the former case we must issue the runtime
+ * replenishment and add the task back to the dl_rq; in the latter, we just
+ * do nothing but clearing dl_throttled, so that runtime and deadline
+ * updating (and the queueing back to dl_rq) will be done by the
+ * next call to enqueue_task_dl().
+ */
+static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
+{
+ struct sched_dl_entity *dl_se = container_of(timer,
+ struct sched_dl_entity,
+ dl_timer);
+ struct task_struct *p = dl_task_of(dl_se);
+ struct rq *rq;
+again:
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+
+ if (rq != task_rq(p)) {
+ /* Task was moved, retrying. */
+ raw_spin_unlock(&rq->lock);
+ goto again;
+ }
+
+ /*
+ * We need to take care of a possible races here. In fact, the
+ * task might have changed its scheduling policy to something
+ * different from SCHED_DEADLINE or changed its reservation
+ * parameters (through sched_setattr()).
+ */
+ if (!dl_task(p) || dl_se->dl_new)
+ goto unlock;
+
+ sched_clock_tick();
+ update_rq_clock(rq);
+ dl_se->dl_throttled = 0;
+ dl_se->dl_yielded = 0;
+ if (p->on_rq) {
+ enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
+ if (task_has_dl_policy(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_task(rq->curr);
+#ifdef CONFIG_SMP
+ /*
+ * Queueing this task back might have overloaded rq,
+ * check if we need to kick someone away.
+ */
+ if (has_pushable_dl_tasks(rq))
+ push_dl_task(rq);
+#endif
+ }
+unlock:
+ raw_spin_unlock(&rq->lock);
+
+ return HRTIMER_NORESTART;
+}
+
+void init_dl_task_timer(struct sched_dl_entity *dl_se)
+{
+ struct hrtimer *timer = &dl_se->dl_timer;
+
+ if (hrtimer_active(timer)) {
+ hrtimer_try_to_cancel(timer);
+ return;
+ }
+
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ timer->function = dl_task_timer;
+}
+
+static
+int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
+{
+ int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq));
+ int rorun = dl_se->runtime <= 0;
+
+ if (!rorun && !dmiss)
+ return 0;
+
+ /*
+ * If we are beyond our current deadline and we are still
+ * executing, then we have already used some of the runtime of
+ * the next instance. Thus, if we do not account that, we are
+ * stealing bandwidth from the system at each deadline miss!
+ */
+ if (dmiss) {
+ dl_se->runtime = rorun ? dl_se->runtime : 0;
+ dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
+ }
+
+ return 1;
+}
+
+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
+
+/*
+ * Update the current task's runtime statistics (provided it is still
+ * a -deadline task and has not been removed from the dl_rq).
+ */
+static void update_curr_dl(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct sched_dl_entity *dl_se = &curr->dl;
+ u64 delta_exec;
+
+ if (!dl_task(curr) || !on_dl_rq(dl_se))
+ return;
+
+ /*
+ * Consumed budget is computed considering the time as
+ * observed by schedulable tasks (excluding time spent
+ * in hardirq context, etc.). Deadlines are instead
+ * computed using hard walltime. This seems to be the more
+ * natural solution, but the full ramifications of this
+ * approach need further study.
+ */
+ delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+ if (unlikely((s64)delta_exec <= 0))
+ return;
+
+ schedstat_set(curr->se.statistics.exec_max,
+ max(curr->se.statistics.exec_max, delta_exec));
+
+ curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
+ curr->se.exec_start = rq_clock_task(rq);
+ cpuacct_charge(curr, delta_exec);
+
+ sched_rt_avg_update(rq, delta_exec);
+
+ dl_se->runtime -= delta_exec;
+ if (dl_runtime_exceeded(rq, dl_se)) {
+ __dequeue_task_dl(rq, curr, 0);
+ if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
+ dl_se->dl_throttled = 1;
+ else
+ enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
+
+ if (!is_leftmost(curr, &rq->dl))
+ resched_task(curr);
+ }
+
+ /*
+ * Because -- for now -- we share the rt bandwidth, we need to
+ * account our runtime there too, otherwise actual rt tasks
+ * would be able to exceed the shared quota.
+ *
+ * Account to the root rt group for now.
+ *
+ * The solution we're working towards is having the RT groups scheduled
+ * using deadline servers -- however there's a few nasties to figure
+ * out before that can happen.
+ */
+ if (rt_bandwidth_enabled()) {
+ struct rt_rq *rt_rq = &rq->rt;
+
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ /*
+ * We'll let actual RT tasks worry about the overflow here, we
+ * have our own CBS to keep us inline; only account when RT
+ * bandwidth is relevant.
+ */
+ if (sched_rt_bandwidth_account(rt_rq))
+ rt_rq->rt_time += delta_exec;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+}
+
+#ifdef CONFIG_SMP
+
+static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu);
+
+static inline u64 next_deadline(struct rq *rq)
+{
+ struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu);
+
+ if (next && dl_prio(next->prio))
+ return next->dl.deadline;
+ else
+ return 0;
+}
+
+static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
+{
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ if (dl_rq->earliest_dl.curr == 0 ||
+ dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
+ /*
+ * If the dl_rq had no -deadline tasks, or if the new task
+ * has shorter deadline than the current one on dl_rq, we
+ * know that the previous earliest becomes our next earliest,
+ * as the new task becomes the earliest itself.
+ */
+ dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr;
+ dl_rq->earliest_dl.curr = deadline;
+ cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
+ } else if (dl_rq->earliest_dl.next == 0 ||
+ dl_time_before(deadline, dl_rq->earliest_dl.next)) {
+ /*
+ * On the other hand, if the new -deadline task has a
+ * a later deadline than the earliest one on dl_rq, but
+ * it is earlier than the next (if any), we must
+ * recompute the next-earliest.
+ */
+ dl_rq->earliest_dl.next = next_deadline(rq);
+ }
+}
+
+static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
+{
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ /*
+ * Since we may have removed our earliest (and/or next earliest)
+ * task we must recompute them.
+ */
+ if (!dl_rq->dl_nr_running) {
+ dl_rq->earliest_dl.curr = 0;
+ dl_rq->earliest_dl.next = 0;
+ cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+ } else {
+ struct rb_node *leftmost = dl_rq->rb_leftmost;
+ struct sched_dl_entity *entry;
+
+ entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
+ dl_rq->earliest_dl.curr = entry->deadline;
+ dl_rq->earliest_dl.next = next_deadline(rq);
+ cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
+ }
+}
+
+#else
+
+static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
+static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
+
+#endif /* CONFIG_SMP */
+
+static inline
+void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ int prio = dl_task_of(dl_se)->prio;
+ u64 deadline = dl_se->deadline;
+
+ WARN_ON(!dl_prio(prio));
+ dl_rq->dl_nr_running++;
+ add_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ inc_dl_deadline(dl_rq, deadline);
+ inc_dl_migration(dl_se, dl_rq);
+}
+
+static inline
+void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ int prio = dl_task_of(dl_se)->prio;
+
+ WARN_ON(!dl_prio(prio));
+ WARN_ON(!dl_rq->dl_nr_running);
+ dl_rq->dl_nr_running--;
+ sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ dec_dl_deadline(dl_rq, dl_se->deadline);
+ dec_dl_migration(dl_se, dl_rq);
+}
+
+static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rb_node **link = &dl_rq->rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct sched_dl_entity *entry;
+ int leftmost = 1;
+
+ BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
+
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct sched_dl_entity, rb_node);
+ if (dl_time_before(dl_se->deadline, entry->deadline))
+ link = &parent->rb_left;
+ else {
+ link = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ if (leftmost)
+ dl_rq->rb_leftmost = &dl_se->rb_node;
+
+ rb_link_node(&dl_se->rb_node, parent, link);
+ rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);
+
+ inc_dl_tasks(dl_se, dl_rq);
+}
+
+static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+ if (RB_EMPTY_NODE(&dl_se->rb_node))
+ return;
+
+ if (dl_rq->rb_leftmost == &dl_se->rb_node) {
+ struct rb_node *next_node;
+
+ next_node = rb_next(&dl_se->rb_node);
+ dl_rq->rb_leftmost = next_node;
+ }
+
+ rb_erase(&dl_se->rb_node, &dl_rq->rb_root);
+ RB_CLEAR_NODE(&dl_se->rb_node);
+
+ dec_dl_tasks(dl_se, dl_rq);
+}
+
+static void
+enqueue_dl_entity(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se, int flags)
+{
+ BUG_ON(on_dl_rq(dl_se));
+
+ /*
+ * If this is a wakeup or a new instance, the scheduling
+ * parameters of the task might need updating. Otherwise,
+ * we want a replenishment of its runtime.
+ */
+ if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH)
+ replenish_dl_entity(dl_se, pi_se);
+ else
+ update_dl_entity(dl_se, pi_se);
+
+ __enqueue_dl_entity(dl_se);
+}
+
+static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
+{
+ __dequeue_dl_entity(dl_se);
+}
+
+static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+{
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+ struct sched_dl_entity *pi_se = &p->dl;
+
+ /*
+ * Use the scheduling parameters of the top pi-waiter
+ * task if we have one and its (relative) deadline is
+ * smaller than our one... OTW we keep our runtime and
+ * deadline.
+ */
+ if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
+ pi_se = &pi_task->dl;
+
+ /*
+ * If p is throttled, we do nothing. In fact, if it exhausted
+ * its budget it needs a replenishment and, since it now is on
+ * its rq, the bandwidth timer callback (which clearly has not
+ * run yet) will take care of this.
+ */
+ if (p->dl.dl_throttled)
+ return;
+
+ enqueue_dl_entity(&p->dl, pi_se, flags);
+
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+ enqueue_pushable_dl_task(rq, p);
+}
+
+static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+{
+ dequeue_dl_entity(&p->dl);
+ dequeue_pushable_dl_task(rq, p);
+}
+
+static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+{
+ update_curr_dl(rq);
+ __dequeue_task_dl(rq, p, flags);
+}
+
+/*
+ * Yield task semantic for -deadline tasks is:
+ *
+ * get off from the CPU until our next instance, with
+ * a new runtime. This is of little use now, since we
+ * don't have a bandwidth reclaiming mechanism. Anyway,
+ * bandwidth reclaiming is planned for the future, and
+ * yield_task_dl will indicate that some spare budget
+ * is available for other task instances to use it.
+ */
+static void yield_task_dl(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ /*
+ * We make the task go to sleep until its current deadline by
+ * forcing its runtime to zero. This way, update_curr_dl() stops
+ * it and the bandwidth timer will wake it up and will give it
+ * new scheduling parameters (thanks to dl_yielded=1).
+ */
+ if (p->dl.runtime > 0) {
+ rq->curr->dl.dl_yielded = 1;
+ p->dl.runtime = 0;
+ }
+ update_curr_dl(rq);
+}
+
+#ifdef CONFIG_SMP
+
+static int find_later_rq(struct task_struct *task);
+
+static int
+select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
+{
+ struct task_struct *curr;
+ struct rq *rq;
+
+ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
+ goto out;
+
+ rq = cpu_rq(cpu);
+
+ rcu_read_lock();
+ curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+
+ /*
+ * If we are dealing with a -deadline task, we must
+ * decide where to wake it up.
+ * If it has a later deadline and the current task
+ * on this rq can't move (provided the waking task
+ * can!) we prefer to send it somewhere else. On the
+ * other hand, if it has a shorter deadline, we
+ * try to make it stay here, it might be important.
+ */
+ if (unlikely(dl_task(curr)) &&
+ (curr->nr_cpus_allowed < 2 ||
+ !dl_entity_preempt(&p->dl, &curr->dl)) &&
+ (p->nr_cpus_allowed > 1)) {
+ int target = find_later_rq(p);
+
+ if (target != -1)
+ cpu = target;
+ }
+ rcu_read_unlock();
+
+out:
+ return cpu;
+}
+
+static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
+{
+ /*
+ * Current can't be migrated, useless to reschedule,
+ * let's hope p can move out.
+ */
+ if (rq->curr->nr_cpus_allowed == 1 ||
+ cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
+ return;
+
+ /*
+ * p is migratable, so let's not schedule it and
+ * see if it is pushed or pulled somewhere else.
+ */
+ if (p->nr_cpus_allowed != 1 &&
+ cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
+ return;
+
+ resched_task(rq->curr);
+}
+
+static int pull_dl_task(struct rq *this_rq);
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Only called when both the current and waking task are -deadline
+ * tasks.
+ */
+static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
+ int flags)
+{
+ if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
+ resched_task(rq->curr);
+ return;
+ }
+
+#ifdef CONFIG_SMP
+ /*
+ * In the unlikely case current and p have the same deadline
+ * let us try to decide what's the best thing to do...
+ */
+ if ((p->dl.deadline == rq->curr->dl.deadline) &&
+ !test_tsk_need_resched(rq->curr))
+ check_preempt_equal_dl(rq, p);
+#endif /* CONFIG_SMP */
+}
+
+#ifdef CONFIG_SCHED_HRTICK
+static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+{
+ s64 delta = p->dl.dl_runtime - p->dl.runtime;
+
+ if (delta > 10000)
+ hrtick_start(rq, p->dl.runtime);
+}
+#endif
+
+static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
+ struct dl_rq *dl_rq)
+{
+ struct rb_node *left = dl_rq->rb_leftmost;
+
+ if (!left)
+ return NULL;
+
+ return rb_entry(left, struct sched_dl_entity, rb_node);
+}
+
+struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
+{
+ struct sched_dl_entity *dl_se;
+ struct task_struct *p;
+ struct dl_rq *dl_rq;
+
+ dl_rq = &rq->dl;
+
+ if (need_pull_dl_task(rq, prev)) {
+ pull_dl_task(rq);
+ /*
+ * pull_rt_task() can drop (and re-acquire) rq->lock; this
+ * means a stop task can slip in, in which case we need to
+ * re-start task selection.
+ */
+ if (rq->stop && rq->stop->on_rq)
+ return RETRY_TASK;
+ }
+
+ /*
+ * When prev is DL, we may throttle it in put_prev_task().
+ * So, we update time before we check for dl_nr_running.
+ */
+ if (prev->sched_class == &dl_sched_class)
+ update_curr_dl(rq);
+
+ if (unlikely(!dl_rq->dl_nr_running))
+ return NULL;
+
+ put_prev_task(rq, prev);
+
+ dl_se = pick_next_dl_entity(rq, dl_rq);
+ BUG_ON(!dl_se);
+
+ p = dl_task_of(dl_se);
+ p->se.exec_start = rq_clock_task(rq);
+
+ /* Running task will never be pushed. */
+ dequeue_pushable_dl_task(rq, p);
+
+#ifdef CONFIG_SCHED_HRTICK
+ if (hrtick_enabled(rq))
+ start_hrtick_dl(rq, p);
+#endif
+
+ set_post_schedule(rq);
+
+ return p;
+}
+
+static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
+{
+ update_curr_dl(rq);
+
+ if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
+ enqueue_pushable_dl_task(rq, p);
+}
+
+static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
+{
+ update_curr_dl(rq);
+
+#ifdef CONFIG_SCHED_HRTICK
+ if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
+ start_hrtick_dl(rq, p);
+#endif
+}
+
+static void task_fork_dl(struct task_struct *p)
+{
+ /*
+ * SCHED_DEADLINE tasks cannot fork and this is achieved through
+ * sched_fork()
+ */
+}
+
+static void task_dead_dl(struct task_struct *p)
+{
+ struct hrtimer *timer = &p->dl.dl_timer;
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+ /*
+ * Since we are TASK_DEAD we won't slip out of the domain!
+ */
+ raw_spin_lock_irq(&dl_b->lock);
+ dl_b->total_bw -= p->dl.dl_bw;
+ raw_spin_unlock_irq(&dl_b->lock);
+
+ hrtimer_cancel(timer);
+}
+
+static void set_curr_task_dl(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ p->se.exec_start = rq_clock_task(rq);
+
+ /* You can't push away the running task */
+ dequeue_pushable_dl_task(rq, p);
+}
+
+#ifdef CONFIG_SMP
+
+/* Only try algorithms three times */
+#define DL_MAX_TRIES 3
+
+static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
+{
+ if (!task_running(rq, p) &&
+ (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
+ (p->nr_cpus_allowed > 1))
+ return 1;
+
+ return 0;
+}
+
+/* Returns the second earliest -deadline task, NULL otherwise */
+static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu)
+{
+ struct rb_node *next_node = rq->dl.rb_leftmost;
+ struct sched_dl_entity *dl_se;
+ struct task_struct *p = NULL;
+
+next_node:
+ next_node = rb_next(next_node);
+ if (next_node) {
+ dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node);
+ p = dl_task_of(dl_se);
+
+ if (pick_dl_task(rq, p, cpu))
+ return p;
+
+ goto next_node;
+ }
+
+ return NULL;
+}
+
+static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
+
+static int find_later_rq(struct task_struct *task)
+{
+ struct sched_domain *sd;
+ struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl);
+ int this_cpu = smp_processor_id();
+ int best_cpu, cpu = task_cpu(task);
+
+ /* Make sure the mask is initialized first */
+ if (unlikely(!later_mask))
+ return -1;
+
+ if (task->nr_cpus_allowed == 1)
+ return -1;
+
+ best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
+ task, later_mask);
+ if (best_cpu == -1)
+ return -1;
+
+ /*
+ * If we are here, some target has been found,
+ * the most suitable of which is cached in best_cpu.
+ * This is, among the runqueues where the current tasks
+ * have later deadlines than the task's one, the rq
+ * with the latest possible one.
+ *
+ * Now we check how well this matches with task's
+ * affinity and system topology.
+ *
+ * The last cpu where the task run is our first
+ * guess, since it is most likely cache-hot there.
+ */
+ if (cpumask_test_cpu(cpu, later_mask))
+ return cpu;
+ /*
+ * Check if this_cpu is to be skipped (i.e., it is
+ * not in the mask) or not.
+ */
+ if (!cpumask_test_cpu(this_cpu, later_mask))
+ this_cpu = -1;
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ if (sd->flags & SD_WAKE_AFFINE) {
+
+ /*
+ * If possible, preempting this_cpu is
+ * cheaper than migrating.
+ */
+ if (this_cpu != -1 &&
+ cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+ rcu_read_unlock();
+ return this_cpu;
+ }
+
+ /*
+ * Last chance: if best_cpu is valid and is
+ * in the mask, that becomes our choice.
+ */
+ if (best_cpu < nr_cpu_ids &&
+ cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
+ rcu_read_unlock();
+ return best_cpu;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * At this point, all our guesses failed, we just return
+ * 'something', and let the caller sort the things out.
+ */
+ if (this_cpu != -1)
+ return this_cpu;
+
+ cpu = cpumask_any(later_mask);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+
+ return -1;
+}
+
+/* Locks the rq it finds */
+static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
+{
+ struct rq *later_rq = NULL;
+ int tries;
+ int cpu;
+
+ for (tries = 0; tries < DL_MAX_TRIES; tries++) {
+ cpu = find_later_rq(task);
+
+ if ((cpu == -1) || (cpu == rq->cpu))
+ break;
+
+ later_rq = cpu_rq(cpu);
+
+ /* Retry if something changed. */
+ if (double_lock_balance(rq, later_rq)) {
+ if (unlikely(task_rq(task) != rq ||
+ !cpumask_test_cpu(later_rq->cpu,
+ &task->cpus_allowed) ||
+ task_running(rq, task) || !task->on_rq)) {
+ double_unlock_balance(rq, later_rq);
+ later_rq = NULL;
+ break;
+ }
+ }
+
+ /*
+ * If the rq we found has no -deadline task, or
+ * its earliest one has a later deadline than our
+ * task, the rq is a good one.
+ */
+ if (!later_rq->dl.dl_nr_running ||
+ dl_time_before(task->dl.deadline,
+ later_rq->dl.earliest_dl.curr))
+ break;
+
+ /* Otherwise we try again. */
+ double_unlock_balance(rq, later_rq);
+ later_rq = NULL;
+ }
+
+ return later_rq;
+}
+
+static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_dl_tasks(rq))
+ return NULL;
+
+ p = rb_entry(rq->dl.pushable_dl_tasks_leftmost,
+ struct task_struct, pushable_dl_tasks);
+
+ BUG_ON(rq->cpu != task_cpu(p));
+ BUG_ON(task_current(rq, p));
+ BUG_ON(p->nr_cpus_allowed <= 1);
+
+ BUG_ON(!p->on_rq);
+ BUG_ON(!dl_task(p));
+
+ return p;
+}
+
+/*
+ * See if the non running -deadline tasks on this rq
+ * can be sent to some other CPU where they can preempt
+ * and start executing.
+ */
+static int push_dl_task(struct rq *rq)
+{
+ struct task_struct *next_task;
+ struct rq *later_rq;
+
+ if (!rq->dl.overloaded)
+ return 0;
+
+ next_task = pick_next_pushable_dl_task(rq);
+ if (!next_task)
+ return 0;
+
+retry:
+ if (unlikely(next_task == rq->curr)) {
+ WARN_ON(1);
+ return 0;
+ }
+
+ /*
+ * If next_task preempts rq->curr, and rq->curr
+ * can move away, it makes sense to just reschedule
+ * without going further in pushing next_task.
+ */
+ if (dl_task(rq->curr) &&
+ dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
+ rq->curr->nr_cpus_allowed > 1) {
+ resched_task(rq->curr);
+ return 0;
+ }
+
+ /* We might release rq lock */
+ get_task_struct(next_task);
+
+ /* Will lock the rq it'll find */
+ later_rq = find_lock_later_rq(next_task, rq);
+ if (!later_rq) {
+ struct task_struct *task;
+
+ /*
+ * We must check all this again, since
+ * find_lock_later_rq releases rq->lock and it is
+ * then possible that next_task has migrated.
+ */
+ task = pick_next_pushable_dl_task(rq);
+ if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ /*
+ * The task is still there. We don't try
+ * again, some other cpu will pull it when ready.
+ */
+ dequeue_pushable_dl_task(rq, next_task);
+ goto out;
+ }
+
+ if (!task)
+ /* No more tasks */
+ goto out;
+
+ put_task_struct(next_task);
+ next_task = task;
+ goto retry;
+ }
+
+ deactivate_task(rq, next_task, 0);
+ set_task_cpu(next_task, later_rq->cpu);
+ activate_task(later_rq, next_task, 0);
+
+ resched_task(later_rq->curr);
+
+ double_unlock_balance(rq, later_rq);
+
+out:
+ put_task_struct(next_task);
+
+ return 1;
+}
+
+static void push_dl_tasks(struct rq *rq)
+{
+ /* Terminates as it moves a -deadline task */
+ while (push_dl_task(rq))
+ ;
+}
+
+static int pull_dl_task(struct rq *this_rq)
+{
+ int this_cpu = this_rq->cpu, ret = 0, cpu;
+ struct task_struct *p;
+ struct rq *src_rq;
+ u64 dmin = LONG_MAX;
+
+ if (likely(!dl_overloaded(this_rq)))
+ return 0;
+
+ /*
+ * Match the barrier from dl_set_overloaded; this guarantees that if we
+ * see overloaded we must also see the dlo_mask bit.
+ */
+ smp_rmb();
+
+ for_each_cpu(cpu, this_rq->rd->dlo_mask) {
+ if (this_cpu == cpu)
+ continue;
+
+ src_rq = cpu_rq(cpu);
+
+ /*
+ * It looks racy, abd it is! However, as in sched_rt.c,
+ * we are fine with this.
+ */
+ if (this_rq->dl.dl_nr_running &&
+ dl_time_before(this_rq->dl.earliest_dl.curr,
+ src_rq->dl.earliest_dl.next))
+ continue;
+
+ /* Might drop this_rq->lock */
+ double_lock_balance(this_rq, src_rq);
+
+ /*
+ * If there are no more pullable tasks on the
+ * rq, we're done with it.
+ */
+ if (src_rq->dl.dl_nr_running <= 1)
+ goto skip;
+
+ p = pick_next_earliest_dl_task(src_rq, this_cpu);
+
+ /*
+ * We found a task to be pulled if:
+ * - it preempts our current (if there's one),
+ * - it will preempt the last one we pulled (if any).
+ */
+ if (p && dl_time_before(p->dl.deadline, dmin) &&
+ (!this_rq->dl.dl_nr_running ||
+ dl_time_before(p->dl.deadline,
+ this_rq->dl.earliest_dl.curr))) {
+ WARN_ON(p == src_rq->curr);
+ WARN_ON(!p->on_rq);
+
+ /*
+ * Then we pull iff p has actually an earlier
+ * deadline than the current task of its runqueue.
+ */
+ if (dl_time_before(p->dl.deadline,
+ src_rq->curr->dl.deadline))
+ goto skip;
+
+ ret = 1;
+
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, this_cpu);
+ activate_task(this_rq, p, 0);
+ dmin = p->dl.deadline;
+
+ /* Is there any other task even earlier? */
+ }
+skip:
+ double_unlock_balance(this_rq, src_rq);
+ }
+
+ return ret;
+}
+
+static void post_schedule_dl(struct rq *rq)
+{
+ push_dl_tasks(rq);
+}
+
+/*
+ * Since the task is not running and a reschedule is not going to happen
+ * anytime soon on its runqueue, we try pushing it away now.
+ */
+static void task_woken_dl(struct rq *rq, struct task_struct *p)
+{
+ if (!task_running(rq, p) &&
+ !test_tsk_need_resched(rq->curr) &&
+ has_pushable_dl_tasks(rq) &&
+ p->nr_cpus_allowed > 1 &&
+ dl_task(rq->curr) &&
+ (rq->curr->nr_cpus_allowed < 2 ||
+ dl_entity_preempt(&rq->curr->dl, &p->dl))) {
+ push_dl_tasks(rq);
+ }
+}
+
+static void set_cpus_allowed_dl(struct task_struct *p,
+ const struct cpumask *new_mask)
+{
+ struct rq *rq;
+ int weight;
+
+ BUG_ON(!dl_task(p));
+
+ /*
+ * Update only if the task is actually running (i.e.,
+ * it is on the rq AND it is not throttled).
+ */
+ if (!on_dl_rq(&p->dl))
+ return;
+
+ weight = cpumask_weight(new_mask);
+
+ /*
+ * Only update if the process changes its state from whether it
+ * can migrate or not.
+ */
+ if ((p->nr_cpus_allowed > 1) == (weight > 1))
+ return;
+
+ rq = task_rq(p);
+
+ /*
+ * The process used to be able to migrate OR it can now migrate
+ */
+ if (weight <= 1) {
+ if (!task_current(rq, p))
+ dequeue_pushable_dl_task(rq, p);
+ BUG_ON(!rq->dl.dl_nr_migratory);
+ rq->dl.dl_nr_migratory--;
+ } else {
+ if (!task_current(rq, p))
+ enqueue_pushable_dl_task(rq, p);
+ rq->dl.dl_nr_migratory++;
+ }
+
+ update_dl_migration(&rq->dl);
+}
+
+/* Assumes rq->lock is held */
+static void rq_online_dl(struct rq *rq)
+{
+ if (rq->dl.overloaded)
+ dl_set_overload(rq);
+
+ if (rq->dl.dl_nr_running > 0)
+ cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
+}
+
+/* Assumes rq->lock is held */
+static void rq_offline_dl(struct rq *rq)
+{
+ if (rq->dl.overloaded)
+ dl_clear_overload(rq);
+
+ cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+}
+
+void init_sched_dl_class(void)
+{
+ unsigned int i;
+
+ for_each_possible_cpu(i)
+ zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i),
+ GFP_KERNEL, cpu_to_node(i));
+}
+
+#endif /* CONFIG_SMP */
+
+static void switched_from_dl(struct rq *rq, struct task_struct *p)
+{
+ if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
+ hrtimer_try_to_cancel(&p->dl.dl_timer);
+
+#ifdef CONFIG_SMP
+ /*
+ * Since this might be the only -deadline task on the rq,
+ * this is the right place to try to pull some other one
+ * from an overloaded cpu, if any.
+ */
+ if (!rq->dl.dl_nr_running)
+ pull_dl_task(rq);
+#endif
+}
+
+/*
+ * When switching to -deadline, we may overload the rq, then
+ * we try to push someone off, if possible.
+ */
+static void switched_to_dl(struct rq *rq, struct task_struct *p)
+{
+ int check_resched = 1;
+
+ /*
+ * If p is throttled, don't consider the possibility
+ * of preempting rq->curr, the check will be done right
+ * after its runtime will get replenished.
+ */
+ if (unlikely(p->dl.dl_throttled))
+ return;
+
+ if (p->on_rq && rq->curr != p) {
+#ifdef CONFIG_SMP
+ if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
+ /* Only reschedule if pushing failed */
+ check_resched = 0;
+#endif /* CONFIG_SMP */
+ if (check_resched && task_has_dl_policy(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ }
+}
+
+/*
+ * If the scheduling parameters of a -deadline task changed,
+ * a push or pull operation might be needed.
+ */
+static void prio_changed_dl(struct rq *rq, struct task_struct *p,
+ int oldprio)
+{
+ if (p->on_rq || rq->curr == p) {
+#ifdef CONFIG_SMP
+ /*
+ * This might be too much, but unfortunately
+ * we don't have the old deadline value, and
+ * we can't argue if the task is increasing
+ * or lowering its prio, so...
+ */
+ if (!rq->dl.overloaded)
+ pull_dl_task(rq);
+
+ /*
+ * If we now have a earlier deadline task than p,
+ * then reschedule, provided p is still on this
+ * runqueue.
+ */
+ if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
+ rq->curr == p)
+ resched_task(p);
+#else
+ /*
+ * Again, we don't know if p has a earlier
+ * or later deadline, so let's blindly set a
+ * (maybe not needed) rescheduling point.
+ */
+ resched_task(p);
+#endif /* CONFIG_SMP */
+ } else
+ switched_to_dl(rq, p);
+}
+
+const struct sched_class dl_sched_class = {
+ .next = &rt_sched_class,
+ .enqueue_task = enqueue_task_dl,
+ .dequeue_task = dequeue_task_dl,
+ .yield_task = yield_task_dl,
+
+ .check_preempt_curr = check_preempt_curr_dl,
+
+ .pick_next_task = pick_next_task_dl,
+ .put_prev_task = put_prev_task_dl,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_dl,
+ .set_cpus_allowed = set_cpus_allowed_dl,
+ .rq_online = rq_online_dl,
+ .rq_offline = rq_offline_dl,
+ .post_schedule = post_schedule_dl,
+ .task_woken = task_woken_dl,
+#endif
+
+ .set_curr_task = set_curr_task_dl,
+ .task_tick = task_tick_dl,
+ .task_fork = task_fork_dl,
+ .task_dead = task_dead_dl,
+
+ .prio_changed = prio_changed_dl,
+ .switched_from = switched_from_dl,
+ .switched_to = switched_to_dl,
+};
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2a075e10004..627b3c34b82 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -15,6 +15,7 @@
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
#include <linux/utsname.h>
+#include <linux/mempolicy.h>
#include "sched.h"
@@ -61,14 +62,20 @@ static unsigned long nsec_low(unsigned long long nsec)
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
struct sched_entity *se = tg->se[cpu];
- if (!se)
- return;
#define P(F) \
SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+ if (!se) {
+ struct sched_avg *avg = &cpu_rq(cpu)->avg;
+ P(avg->runnable_avg_sum);
+ P(avg->runnable_avg_period);
+ return;
+ }
+
+
PN(se->exec_start);
PN(se->vruntime);
PN(se->sum_exec_runtime);
@@ -85,6 +92,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P(se->statistics.wait_count);
#endif
P(se->load.weight);
+#ifdef CONFIG_SMP
+ P(se->avg.runnable_avg_sum);
+ P(se->avg.runnable_avg_period);
+ P(se->avg.load_avg_contrib);
+ P(se->avg.decay_count);
+#endif
#undef PN
#undef P
}
@@ -98,15 +111,7 @@ static char *task_group_path(struct task_group *tg)
if (autogroup_path(tg, group_path, PATH_MAX))
return group_path;
- /*
- * May be NULL if the underlying cgroup isn't fully-created yet
- */
- if (!tg->css.cgroup) {
- group_path[0] = '\0';
- return group_path;
- }
- cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
- return group_path;
+ return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
}
#endif
@@ -119,7 +124,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
SEQ_printf(m, " ");
SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
- p->comm, p->pid,
+ p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);
@@ -132,6 +137,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ SEQ_printf(m, " %d", task_node(p));
+#endif
#ifdef CONFIG_CGROUP_SCHED
SEQ_printf(m, " %s", task_group_path(task_group(p)));
#endif
@@ -154,7 +162,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
read_lock_irqsave(&tasklist_lock, flags);
do_each_thread(g, p) {
- if (!p->on_rq || task_cpu(p) != rq_cpu)
+ if (task_cpu(p) != rq_cpu)
continue;
print_task(m, rq, p);
@@ -202,20 +210,34 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(spread0));
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
- SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
-#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
- SPLIT_NS(cfs_rq->load_avg));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
- SPLIT_NS(cfs_rq->load_period));
- SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
- cfs_rq->load_contribution);
- SEQ_printf(m, " .%-30s: %d\n", "load_tg",
- atomic_read(&cfs_rq->tg->load_weight));
+ SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg",
+ cfs_rq->runnable_load_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
+ cfs_rq->blocked_load_avg);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
+ cfs_rq->tg_load_contrib);
+ SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
+ cfs_rq->tg_runnable_contrib);
+ SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
+ atomic_long_read(&cfs_rq->tg->load_avg));
+ SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
+ atomic_read(&cfs_rq->tg->runnable_avg));
+#endif
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
+ cfs_rq->tg->cfs_bandwidth.timer_active);
+ SEQ_printf(m, " .%-30s: %d\n", "throttled",
+ cfs_rq->throttled);
+ SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
+ cfs_rq->throttle_count);
#endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
print_cfs_group_stats(m, cpu, cfs_rq->tg);
#endif
}
@@ -253,15 +275,21 @@ static void print_cpu(struct seq_file *m, int cpu)
{
unsigned int freq = cpu_khz ? : 1;
- SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
+ SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
cpu, freq / 1000, (freq % 1000));
}
#else
- SEQ_printf(m, "\ncpu#%d\n", cpu);
+ SEQ_printf(m, "cpu#%d\n", cpu);
#endif
-#define P(x) \
- SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x))
+#define P(x) \
+do { \
+ if (sizeof(rq->x) == 4) \
+ SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
+ else \
+ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
+} while (0)
+
#define PN(x) \
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
@@ -272,7 +300,7 @@ static void print_cpu(struct seq_file *m, int cpu)
P(nr_load_updates);
P(nr_uninterruptible);
PN(next_balance);
- P(curr->pid);
+ SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
PN(clock);
P(cpu_load[0]);
P(cpu_load[1]);
@@ -288,11 +316,11 @@ static void print_cpu(struct seq_file *m, int cpu)
P(yld_count);
- P(sched_switch);
P(sched_count);
P(sched_goidle);
#ifdef CONFIG_SMP
P64(avg_idle);
+ P64(max_idle_balance_cost);
#endif
P(ttwu_count);
@@ -309,6 +337,7 @@ static void print_cpu(struct seq_file *m, int cpu)
print_rq(m, rq, cpu);
rcu_read_unlock();
spin_unlock_irqrestore(&sched_debug_lock, flags);
+ SEQ_printf(m, "\n");
}
static const char *sched_tunable_scaling_names[] = {
@@ -317,11 +346,10 @@ static const char *sched_tunable_scaling_names[] = {
"linear"
};
-static int sched_debug_show(struct seq_file *m, void *v)
+static void sched_debug_header(struct seq_file *m)
{
u64 ktime, sched_clk, cpu_clk;
unsigned long flags;
- int cpu;
local_irq_save(flags);
ktime = ktime_to_ns(ktime_get());
@@ -329,7 +357,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
cpu_clk = local_clock();
local_irq_restore(flags);
- SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
+ SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
@@ -343,7 +371,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
PN(cpu_clk);
P(jiffies);
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
- P(sched_clock_stable);
+ P(sched_clock_stable());
#endif
#undef PN
#undef P
@@ -363,33 +391,101 @@ static int sched_debug_show(struct seq_file *m, void *v)
#undef PN
#undef P
- SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+ SEQ_printf(m, " .%-40s: %d (%s)\n",
+ "sysctl_sched_tunable_scaling",
sysctl_sched_tunable_scaling,
sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
+ SEQ_printf(m, "\n");
+}
- for_each_online_cpu(cpu)
- print_cpu(m, cpu);
+static int sched_debug_show(struct seq_file *m, void *v)
+{
+ int cpu = (unsigned long)(v - 2);
- SEQ_printf(m, "\n");
+ if (cpu != -1)
+ print_cpu(m, cpu);
+ else
+ sched_debug_header(m);
return 0;
}
void sysrq_sched_debug_show(void)
{
- sched_debug_show(NULL, NULL);
+ int cpu;
+
+ sched_debug_header(NULL);
+ for_each_online_cpu(cpu)
+ print_cpu(NULL, cpu);
+
+}
+
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *sched_debug_start(struct seq_file *file, loff_t *offset)
+{
+ unsigned long n = *offset;
+
+ if (n == 0)
+ return (void *) 1;
+
+ n--;
+
+ if (n > 0)
+ n = cpumask_next(n - 1, cpu_online_mask);
+ else
+ n = cpumask_first(cpu_online_mask);
+
+ *offset = n + 1;
+
+ if (n < nr_cpu_ids)
+ return (void *)(unsigned long)(n + 2);
+ return NULL;
+}
+
+static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ (*offset)++;
+ return sched_debug_start(file, offset);
+}
+
+static void sched_debug_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations sched_debug_sops = {
+ .start = sched_debug_start,
+ .next = sched_debug_next,
+ .stop = sched_debug_stop,
+ .show = sched_debug_show,
+};
+
+static int sched_debug_release(struct inode *inode, struct file *file)
+{
+ seq_release(inode, file);
+
+ return 0;
}
static int sched_debug_open(struct inode *inode, struct file *filp)
{
- return single_open(filp, sched_debug_show, NULL);
+ int ret = 0;
+
+ ret = seq_open(filp, &sched_debug_sops);
+
+ return ret;
}
static const struct file_operations sched_debug_fops = {
.open = sched_debug_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = sched_debug_release,
};
static int __init init_sched_debug_procfs(void)
@@ -404,22 +500,73 @@ static int __init init_sched_debug_procfs(void)
__initcall(init_sched_debug_procfs);
+#define __P(F) \
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
+#define P(F) \
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
+#define __PN(F) \
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN(F) \
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+
+
+static void sched_show_numa(struct task_struct *p, struct seq_file *m)
+{
+#ifdef CONFIG_NUMA_BALANCING
+ struct mempolicy *pol;
+ int node, i;
+
+ if (p->mm)
+ P(mm->numa_scan_seq);
+
+ task_lock(p);
+ pol = p->mempolicy;
+ if (pol && !(pol->flags & MPOL_F_MORON))
+ pol = NULL;
+ mpol_get(pol);
+ task_unlock(p);
+
+ SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
+
+ for_each_online_node(node) {
+ for (i = 0; i < 2; i++) {
+ unsigned long nr_faults = -1;
+ int cpu_current, home_node;
+
+ if (p->numa_faults_memory)
+ nr_faults = p->numa_faults_memory[2*node + i];
+
+ cpu_current = !i ? (task_node(p) == node) :
+ (pol && node_isset(node, pol->v.nodes));
+
+ home_node = (p->numa_preferred_nid == node);
+
+ SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
+ i, node, cpu_current, home_node, nr_faults);
+ }
+ }
+
+ mpol_put(pol);
+#endif
+}
+
void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
{
unsigned long nr_switches;
- SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
get_nr_threads(p));
SEQ_printf(m,
- "---------------------------------------------------------\n");
+ "---------------------------------------------------------"
+ "----------\n");
#define __P(F) \
- SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
#define P(F) \
- SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
#define __PN(F) \
- SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN(F) \
- SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
PN(se.exec_start);
PN(se.vruntime);
@@ -461,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
- do_div(avg_atom, nr_switches);
+ avg_atom = div64_ul(avg_atom, nr_switches);
else
avg_atom = -1LL;
@@ -478,12 +625,18 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
}
#endif
__P(nr_switches);
- SEQ_printf(m, "%-35s:%21Ld\n",
+ SEQ_printf(m, "%-45s:%21Ld\n",
"nr_voluntary_switches", (long long)p->nvcsw);
- SEQ_printf(m, "%-35s:%21Ld\n",
+ SEQ_printf(m, "%-45s:%21Ld\n",
"nr_involuntary_switches", (long long)p->nivcsw);
P(se.load.weight);
+#ifdef CONFIG_SMP
+ P(se.avg.runnable_avg_sum);
+ P(se.avg.runnable_avg_period);
+ P(se.avg.load_avg_contrib);
+ P(se.avg.decay_count);
+#endif
P(policy);
P(prio);
#undef PN
@@ -497,9 +650,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
t0 = cpu_clock(this_cpu);
t1 = cpu_clock(this_cpu);
- SEQ_printf(m, "%-35s:%21Ld\n",
+ SEQ_printf(m, "%-45s:%21Ld\n",
"clock-delta", (long long)(t1-t0));
}
+
+ sched_show_numa(p, m);
}
void proc_sched_set_task(struct task_struct *p)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c6414fc669..fea7d3335e1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,9 @@
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/migrate.h>
+#include <linux/task_work.h>
#include <trace/events/sched.h>
@@ -110,6 +113,24 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
+static inline void update_load_add(struct load_weight *lw, unsigned long inc)
+{
+ lw->weight += inc;
+ lw->inv_weight = 0;
+}
+
+static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
+{
+ lw->weight -= dec;
+ lw->inv_weight = 0;
+}
+
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+ lw->weight = w;
+ lw->inv_weight = 0;
+}
+
/*
* Increase the granularity value when there are more CPUs,
* because with more CPUs the 'effective latency' as visible
@@ -157,59 +178,61 @@ void sched_init_granularity(void)
update_sysctl();
}
-#if BITS_PER_LONG == 32
-# define WMULT_CONST (~0UL)
-#else
-# define WMULT_CONST (1UL << 32)
-#endif
-
+#define WMULT_CONST (~0U)
#define WMULT_SHIFT 32
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+static void __update_inv_weight(struct load_weight *lw)
+{
+ unsigned long w;
+
+ if (likely(lw->inv_weight))
+ return;
+
+ w = scale_load_down(lw->weight);
+
+ if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+ lw->inv_weight = 1;
+ else if (unlikely(!w))
+ lw->inv_weight = WMULT_CONST;
+ else
+ lw->inv_weight = WMULT_CONST / w;
+}
/*
- * delta *= weight / lw
+ * delta_exec * weight / lw.weight
+ * OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
*/
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
- struct load_weight *lw)
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
- u64 tmp;
+ u64 fact = scale_load_down(weight);
+ int shift = WMULT_SHIFT;
- /*
- * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
- * entities since MIN_SHARES = 2. Treat weight as 1 if less than
- * 2^SCHED_LOAD_RESOLUTION.
- */
- if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
- tmp = (u64)delta_exec * scale_load_down(weight);
- else
- tmp = (u64)delta_exec;
+ __update_inv_weight(lw);
- if (!lw->inv_weight) {
- unsigned long w = scale_load_down(lw->weight);
-
- if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
- lw->inv_weight = 1;
- else if (unlikely(!w))
- lw->inv_weight = WMULT_CONST;
- else
- lw->inv_weight = WMULT_CONST / w;
+ if (unlikely(fact >> 32)) {
+ while (fact >> 32) {
+ fact >>= 1;
+ shift--;
+ }
}
- /*
- * Check whether we'd overflow the 64-bit multiplication:
- */
- if (unlikely(tmp > WMULT_CONST))
- tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
- WMULT_SHIFT/2);
- else
- tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+ /* hint to use a 32x32->64 mul */
+ fact = (u64)(u32)fact * lw->inv_weight;
- return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+ while (fact >> 32) {
+ fact >>= 1;
+ shift--;
+ }
+
+ return mul_u64_u32_shr(delta_exec, fact, shift);
}
@@ -259,6 +282,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+ int force_update);
+
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
@@ -278,6 +304,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
cfs_rq->on_list = 1;
+ /* We should have no load, but we need to update last_decay. */
+ update_cfs_rq_blocked_load(cfs_rq, 0);
}
}
@@ -294,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
-static inline int
+static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
if (se->cfs_rq == pse->cfs_rq)
- return 1;
+ return se->cfs_rq;
- return 0;
+ return NULL;
}
static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -308,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return se->parent;
}
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
- int depth = 0;
-
- for_each_sched_entity(se)
- depth++;
-
- return depth;
-}
-
static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
@@ -332,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
*/
/* First walk up until both entities are at same depth */
- se_depth = depth_se(*se);
- pse_depth = depth_se(*pse);
+ se_depth = (*se)->depth;
+ pse_depth = (*pse)->depth;
while (se_depth > pse_depth) {
se_depth--;
@@ -398,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
- return 1;
-}
-
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
return NULL;
@@ -416,20 +427,20 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec);
+static __always_inline
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
/**************************************************************
* Scheduling class tree data structure manipulation methods:
*/
-static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
+static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
- s64 delta = (s64)(vruntime - min_vruntime);
+ s64 delta = (s64)(vruntime - max_vruntime);
if (delta > 0)
- min_vruntime = vruntime;
+ max_vruntime = vruntime;
- return min_vruntime;
+ return max_vruntime;
}
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
@@ -465,6 +476,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
vruntime = min_vruntime(vruntime, se->vruntime);
}
+ /* ensure we never gain time by being placed backwards. */
cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
smp_wmb();
@@ -585,11 +597,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
/*
* delta /= w
*/
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
if (unlikely(se->load.weight != NICE_0_LOAD))
- delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
return delta;
}
@@ -597,7 +608,7 @@ calc_delta_fair(unsigned long delta, struct sched_entity *se)
/*
* The idea is to set a period in which each task runs once.
*
- * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
+ * When there are too many tasks (sched_nr_latency) we have to stretch
* this period because otherwise the slices get too small.
*
* p = (nr <= nl) ? l : l*nr/nl
@@ -638,13 +649,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_load_add(&lw, se->load.weight);
load = &lw;
}
- slice = calc_delta_mine(slice, se->load.weight, load);
+ slice = __calc_delta(slice, se->load.weight, load);
}
return slice;
}
/*
- * We calculate the vruntime slice of a to be inserted task
+ * We calculate the vruntime slice of a to-be-inserted task.
*
* vs = s/w
*/
@@ -653,55 +664,55 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq);
-
-/*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
- */
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
- unsigned long delta_exec)
-{
- unsigned long delta_exec_weighted;
-
- schedstat_set(curr->statistics.exec_max,
- max((u64)delta_exec, curr->statistics.exec_max));
+#ifdef CONFIG_SMP
+static unsigned long task_h_load(struct task_struct *p);
- curr->sum_exec_runtime += delta_exec;
- schedstat_add(cfs_rq, exec_clock, delta_exec);
- delta_exec_weighted = calc_delta_fair(delta_exec, curr);
+static inline void __update_task_entity_contrib(struct sched_entity *se);
- curr->vruntime += delta_exec_weighted;
- update_min_vruntime(cfs_rq);
+/* Give new task start runnable values to heavy its load in infant time */
+void init_task_runnable_average(struct task_struct *p)
+{
+ u32 slice;
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
- cfs_rq->load_unacc_exec_time += delta_exec;
-#endif
+ p->se.avg.decay_count = 0;
+ slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
+ p->se.avg.runnable_avg_sum = slice;
+ p->se.avg.runnable_avg_period = slice;
+ __update_task_entity_contrib(&p->se);
+}
+#else
+void init_task_runnable_average(struct task_struct *p)
+{
}
+#endif
+/*
+ * Update the current task's runtime statistics.
+ */
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- u64 now = rq_of(cfs_rq)->clock_task;
- unsigned long delta_exec;
+ u64 now = rq_clock_task(rq_of(cfs_rq));
+ u64 delta_exec;
if (unlikely(!curr))
return;
- /*
- * Get the amount of time the current task was running
- * since the last time we changed load (this cannot
- * overflow on 32 bits):
- */
- delta_exec = (unsigned long)(now - curr->exec_start);
- if (!delta_exec)
+ delta_exec = now - curr->exec_start;
+ if (unlikely((s64)delta_exec <= 0))
return;
- __update_curr(cfs_rq, curr, delta_exec);
curr->exec_start = now;
+ schedstat_set(curr->statistics.exec_max,
+ max(delta_exec, curr->statistics.exec_max));
+
+ curr->sum_exec_runtime += delta_exec;
+ schedstat_add(cfs_rq, exec_clock, delta_exec);
+
+ curr->vruntime += calc_delta_fair(delta_exec, curr);
+ update_min_vruntime(cfs_rq);
+
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
@@ -716,7 +727,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
+ schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
}
/*
@@ -736,14 +747,14 @@ static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
- rq_of(cfs_rq)->clock - se->statistics.wait_start));
+ rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
- rq_of(cfs_rq)->clock - se->statistics.wait_start);
+ rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
trace_sched_stat_wait(task_of(se),
- rq_of(cfs_rq)->clock - se->statistics.wait_start);
+ rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
}
#endif
schedstat_set(se->statistics.wait_start, 0);
@@ -769,119 +780,1292 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
/*
* We are starting a new run period:
*/
- se->exec_start = rq_of(cfs_rq)->clock_task;
+ se->exec_start = rq_clock_task(rq_of(cfs_rq));
}
/**************************************************
* Scheduling class queueing methods:
*/
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Approximate time to scan a full NUMA task in ms. The task scan period is
+ * calculated based on the tasks virtual memory size and
+ * numa_balancing_scan_size.
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 1000;
+unsigned int sysctl_numa_balancing_scan_period_max = 60000;
+
+/* Portion of address space to scan in MB */
+unsigned int sysctl_numa_balancing_scan_size = 256;
+
+/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
+unsigned int sysctl_numa_balancing_scan_delay = 1000;
+
+static unsigned int task_nr_scan_windows(struct task_struct *p)
{
- cfs_rq->task_weight += weight;
+ unsigned long rss = 0;
+ unsigned long nr_scan_pages;
+
+ /*
+ * Calculations based on RSS as non-present and empty pages are skipped
+ * by the PTE scanner and NUMA hinting faults should be trapped based
+ * on resident pages
+ */
+ nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
+ rss = get_mm_rss(p->mm);
+ if (!rss)
+ rss = nr_scan_pages;
+
+ rss = round_up(rss, nr_scan_pages);
+ return rss / nr_scan_pages;
}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+
+/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+#define MAX_SCAN_WINDOW 2560
+
+static unsigned int task_scan_min(struct task_struct *p)
{
+ unsigned int scan, floor;
+ unsigned int windows = 1;
+
+ if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
+ windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
+ floor = 1000 / windows;
+
+ scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
+ return max_t(unsigned int, floor, scan);
}
-#endif
-static void
-account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static unsigned int task_scan_max(struct task_struct *p)
{
- update_load_add(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
- if (entity_is_task(se)) {
- add_cfs_task_weight(cfs_rq, se->load.weight);
- list_add(&se->group_node, &cfs_rq->tasks);
+ unsigned int smin = task_scan_min(p);
+ unsigned int smax;
+
+ /* Watch for min being lower than max due to floor calculations */
+ smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+ return max(smin, smax);
+}
+
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+ rq->nr_numa_running += (p->numa_preferred_nid != -1);
+ rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+ rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+ rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
+}
+
+struct numa_group {
+ atomic_t refcount;
+
+ spinlock_t lock; /* nr_tasks, tasks */
+ int nr_tasks;
+ pid_t gid;
+ struct list_head task_list;
+
+ struct rcu_head rcu;
+ nodemask_t active_nodes;
+ unsigned long total_faults;
+ /*
+ * Faults_cpu is used to decide whether memory should move
+ * towards the CPU. As a consequence, these stats are weighted
+ * more by CPU use than by memory faults.
+ */
+ unsigned long *faults_cpu;
+ unsigned long faults[0];
+};
+
+/* Shared or private faults. */
+#define NR_NUMA_HINT_FAULT_TYPES 2
+
+/* Memory and CPU locality */
+#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
+
+/* Averaged statistics, and temporary buffers. */
+#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
+
+pid_t task_numa_group_id(struct task_struct *p)
+{
+ return p->numa_group ? p->numa_group->gid : 0;
+}
+
+static inline int task_faults_idx(int nid, int priv)
+{
+ return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
+}
+
+static inline unsigned long task_faults(struct task_struct *p, int nid)
+{
+ if (!p->numa_faults_memory)
+ return 0;
+
+ return p->numa_faults_memory[task_faults_idx(nid, 0)] +
+ p->numa_faults_memory[task_faults_idx(nid, 1)];
+}
+
+static inline unsigned long group_faults(struct task_struct *p, int nid)
+{
+ if (!p->numa_group)
+ return 0;
+
+ return p->numa_group->faults[task_faults_idx(nid, 0)] +
+ p->numa_group->faults[task_faults_idx(nid, 1)];
+}
+
+static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
+{
+ return group->faults_cpu[task_faults_idx(nid, 0)] +
+ group->faults_cpu[task_faults_idx(nid, 1)];
+}
+
+/*
+ * These return the fraction of accesses done by a particular task, or
+ * task group, on a particular numa node. The group weight is given a
+ * larger multiplier, in order to group tasks together that are almost
+ * evenly spread out between numa nodes.
+ */
+static inline unsigned long task_weight(struct task_struct *p, int nid)
+{
+ unsigned long total_faults;
+
+ if (!p->numa_faults_memory)
+ return 0;
+
+ total_faults = p->total_numa_faults;
+
+ if (!total_faults)
+ return 0;
+
+ return 1000 * task_faults(p, nid) / total_faults;
+}
+
+static inline unsigned long group_weight(struct task_struct *p, int nid)
+{
+ if (!p->numa_group || !p->numa_group->total_faults)
+ return 0;
+
+ return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
+}
+
+bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+ int src_nid, int dst_cpu)
+{
+ struct numa_group *ng = p->numa_group;
+ int dst_nid = cpu_to_node(dst_cpu);
+ int last_cpupid, this_cpupid;
+
+ this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+
+ /*
+ * Multi-stage node selection is used in conjunction with a periodic
+ * migration fault to build a temporal task<->page relation. By using
+ * a two-stage filter we remove short/unlikely relations.
+ *
+ * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
+ * a task's usage of a particular page (n_p) per total usage of this
+ * page (n_t) (in a given time-span) to a probability.
+ *
+ * Our periodic faults will sample this probability and getting the
+ * same result twice in a row, given these samples are fully
+ * independent, is then given by P(n)^2, provided our sample period
+ * is sufficiently short compared to the usage pattern.
+ *
+ * This quadric squishes small probabilities, making it less likely we
+ * act on an unlikely task<->page relation.
+ */
+ last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ if (!cpupid_pid_unset(last_cpupid) &&
+ cpupid_to_nid(last_cpupid) != dst_nid)
+ return false;
+
+ /* Always allow migrate on private faults */
+ if (cpupid_match_pid(p, last_cpupid))
+ return true;
+
+ /* A shared fault, but p->numa_group has not been set up yet. */
+ if (!ng)
+ return true;
+
+ /*
+ * Do not migrate if the destination is not a node that
+ * is actively used by this numa group.
+ */
+ if (!node_isset(dst_nid, ng->active_nodes))
+ return false;
+
+ /*
+ * Source is a node that is not actively used by this
+ * numa group, while the destination is. Migrate.
+ */
+ if (!node_isset(src_nid, ng->active_nodes))
+ return true;
+
+ /*
+ * Both source and destination are nodes in active
+ * use by this numa group. Maximize memory bandwidth
+ * by migrating from more heavily used groups, to less
+ * heavily used ones, spreading the load around.
+ * Use a 1/4 hysteresis to avoid spurious page movement.
+ */
+ return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+}
+
+static unsigned long weighted_cpuload(const int cpu);
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long capacity_of(int cpu);
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
+
+/* Cached statistics for all CPUs within a node */
+struct numa_stats {
+ unsigned long nr_running;
+ unsigned long load;
+
+ /* Total compute capacity of CPUs on a node */
+ unsigned long compute_capacity;
+
+ /* Approximate capacity in terms of runnable tasks on a node */
+ unsigned long task_capacity;
+ int has_free_capacity;
+};
+
+/*
+ * XXX borrowed from update_sg_lb_stats
+ */
+static void update_numa_stats(struct numa_stats *ns, int nid)
+{
+ int cpu, cpus = 0;
+
+ memset(ns, 0, sizeof(*ns));
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
+ struct rq *rq = cpu_rq(cpu);
+
+ ns->nr_running += rq->nr_running;
+ ns->load += weighted_cpuload(cpu);
+ ns->compute_capacity += capacity_of(cpu);
+
+ cpus++;
}
- cfs_rq->nr_running++;
+
+ /*
+ * If we raced with hotplug and there are no CPUs left in our mask
+ * the @ns structure is NULL'ed and task_numa_compare() will
+ * not find this node attractive.
+ *
+ * We'll either bail at !has_free_capacity, or we'll detect a huge
+ * imbalance and bail there.
+ */
+ if (!cpus)
+ return;
+
+ ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
+ ns->task_capacity =
+ DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
+ ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
}
-static void
-account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+struct task_numa_env {
+ struct task_struct *p;
+
+ int src_cpu, src_nid;
+ int dst_cpu, dst_nid;
+
+ struct numa_stats src_stats, dst_stats;
+
+ int imbalance_pct;
+
+ struct task_struct *best_task;
+ long best_imp;
+ int best_cpu;
+};
+
+static void task_numa_assign(struct task_numa_env *env,
+ struct task_struct *p, long imp)
{
- update_load_sub(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
- if (entity_is_task(se)) {
- add_cfs_task_weight(cfs_rq, -se->load.weight);
- list_del_init(&se->group_node);
+ if (env->best_task)
+ put_task_struct(env->best_task);
+ if (p)
+ get_task_struct(p);
+
+ env->best_task = p;
+ env->best_imp = imp;
+ env->best_cpu = env->dst_cpu;
+}
+
+static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
+ long src_load, long dst_load,
+ struct task_numa_env *env)
+{
+ long imb, old_imb;
+
+ /* We care about the slope of the imbalance, not the direction. */
+ if (dst_load < src_load)
+ swap(dst_load, src_load);
+
+ /* Is the difference below the threshold? */
+ imb = dst_load * 100 - src_load * env->imbalance_pct;
+ if (imb <= 0)
+ return false;
+
+ /*
+ * The imbalance is above the allowed threshold.
+ * Compare it with the old imbalance.
+ */
+ if (orig_dst_load < orig_src_load)
+ swap(orig_dst_load, orig_src_load);
+
+ old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
+
+ /* Would this change make things worse? */
+ return (imb > old_imb);
+}
+
+/*
+ * This checks if the overall compute and NUMA accesses of the system would
+ * be improved if the source tasks was migrated to the target dst_cpu taking
+ * into account that it might be best if task running on the dst_cpu should
+ * be exchanged with the source task
+ */
+static void task_numa_compare(struct task_numa_env *env,
+ long taskimp, long groupimp)
+{
+ struct rq *src_rq = cpu_rq(env->src_cpu);
+ struct rq *dst_rq = cpu_rq(env->dst_cpu);
+ struct task_struct *cur;
+ long orig_src_load, src_load;
+ long orig_dst_load, dst_load;
+ long load;
+ long imp = (groupimp > 0) ? groupimp : taskimp;
+
+ rcu_read_lock();
+ cur = ACCESS_ONCE(dst_rq->curr);
+ if (cur->pid == 0) /* idle */
+ cur = NULL;
+
+ /*
+ * "imp" is the fault differential for the source task between the
+ * source and destination node. Calculate the total differential for
+ * the source task and potential destination task. The more negative
+ * the value is, the more rmeote accesses that would be expected to
+ * be incurred if the tasks were swapped.
+ */
+ if (cur) {
+ /* Skip this swap candidate if cannot move to the source cpu */
+ if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
+ goto unlock;
+
+ /*
+ * If dst and source tasks are in the same NUMA group, or not
+ * in any group then look only at task weights.
+ */
+ if (cur->numa_group == env->p->numa_group) {
+ imp = taskimp + task_weight(cur, env->src_nid) -
+ task_weight(cur, env->dst_nid);
+ /*
+ * Add some hysteresis to prevent swapping the
+ * tasks within a group over tiny differences.
+ */
+ if (cur->numa_group)
+ imp -= imp/16;
+ } else {
+ /*
+ * Compare the group weights. If a task is all by
+ * itself (not part of a group), use the task weight
+ * instead.
+ */
+ if (env->p->numa_group)
+ imp = groupimp;
+ else
+ imp = taskimp;
+
+ if (cur->numa_group)
+ imp += group_weight(cur, env->src_nid) -
+ group_weight(cur, env->dst_nid);
+ else
+ imp += task_weight(cur, env->src_nid) -
+ task_weight(cur, env->dst_nid);
+ }
}
- cfs_rq->nr_running--;
+
+ if (imp < env->best_imp)
+ goto unlock;
+
+ if (!cur) {
+ /* Is there capacity at our destination? */
+ if (env->src_stats.has_free_capacity &&
+ !env->dst_stats.has_free_capacity)
+ goto unlock;
+
+ goto balance;
+ }
+
+ /* Balance doesn't matter much if we're running a task per cpu */
+ if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+ goto assign;
+
+ /*
+ * In the overloaded case, try and keep the load balanced.
+ */
+balance:
+ orig_dst_load = env->dst_stats.load;
+ orig_src_load = env->src_stats.load;
+
+ /* XXX missing capacity terms */
+ load = task_h_load(env->p);
+ dst_load = orig_dst_load + load;
+ src_load = orig_src_load - load;
+
+ if (cur) {
+ load = task_h_load(cur);
+ dst_load -= load;
+ src_load += load;
+ }
+
+ if (load_too_imbalanced(orig_src_load, orig_dst_load,
+ src_load, dst_load, env))
+ goto unlock;
+
+assign:
+ task_numa_assign(env, cur, imp);
+unlock:
+ rcu_read_unlock();
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/* we need this in update_cfs_load and load-balance functions below */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-# ifdef CONFIG_SMP
-static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
- int global_update)
+static void task_numa_find_cpu(struct task_numa_env *env,
+ long taskimp, long groupimp)
{
- struct task_group *tg = cfs_rq->tg;
- long load_avg;
+ int cpu;
+
+ for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
+ /* Skip this CPU if the source task cannot migrate */
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
+ continue;
+
+ env->dst_cpu = cpu;
+ task_numa_compare(env, taskimp, groupimp);
+ }
+}
+
+static int task_numa_migrate(struct task_struct *p)
+{
+ struct task_numa_env env = {
+ .p = p,
+
+ .src_cpu = task_cpu(p),
+ .src_nid = task_node(p),
+
+ .imbalance_pct = 112,
+
+ .best_task = NULL,
+ .best_imp = 0,
+ .best_cpu = -1
+ };
+ struct sched_domain *sd;
+ unsigned long taskweight, groupweight;
+ int nid, ret;
+ long taskimp, groupimp;
+
+ /*
+ * Pick the lowest SD_NUMA domain, as that would have the smallest
+ * imbalance and would be the first to start moving tasks about.
+ *
+ * And we want to avoid any moving of tasks about, as that would create
+ * random movement of tasks -- counter the numa conditions we're trying
+ * to satisfy here.
+ */
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+ if (sd)
+ env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+ rcu_read_unlock();
+
+ /*
+ * Cpusets can break the scheduler domain tree into smaller
+ * balance domains, some of which do not cross NUMA boundaries.
+ * Tasks that are "trapped" in such domains cannot be migrated
+ * elsewhere, so there is no point in (re)trying.
+ */
+ if (unlikely(!sd)) {
+ p->numa_preferred_nid = task_node(p);
+ return -EINVAL;
+ }
+
+ taskweight = task_weight(p, env.src_nid);
+ groupweight = group_weight(p, env.src_nid);
+ update_numa_stats(&env.src_stats, env.src_nid);
+ env.dst_nid = p->numa_preferred_nid;
+ taskimp = task_weight(p, env.dst_nid) - taskweight;
+ groupimp = group_weight(p, env.dst_nid) - groupweight;
+ update_numa_stats(&env.dst_stats, env.dst_nid);
+
+ /* If the preferred nid has free capacity, try to use it. */
+ if (env.dst_stats.has_free_capacity)
+ task_numa_find_cpu(&env, taskimp, groupimp);
+
+ /* No space available on the preferred nid. Look elsewhere. */
+ if (env.best_cpu == -1) {
+ for_each_online_node(nid) {
+ if (nid == env.src_nid || nid == p->numa_preferred_nid)
+ continue;
+
+ /* Only consider nodes where both task and groups benefit */
+ taskimp = task_weight(p, nid) - taskweight;
+ groupimp = group_weight(p, nid) - groupweight;
+ if (taskimp < 0 && groupimp < 0)
+ continue;
+
+ env.dst_nid = nid;
+ update_numa_stats(&env.dst_stats, env.dst_nid);
+ task_numa_find_cpu(&env, taskimp, groupimp);
+ }
+ }
+
+ /* No better CPU than the current one was found. */
+ if (env.best_cpu == -1)
+ return -EAGAIN;
+
+ /*
+ * If the task is part of a workload that spans multiple NUMA nodes,
+ * and is migrating into one of the workload's active nodes, remember
+ * this node as the task's preferred numa node, so the workload can
+ * settle down.
+ * A task that migrated to a second choice node will be better off
+ * trying for a better one later. Do not set the preferred node here.
+ */
+ if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
+ sched_setnuma(p, env.dst_nid);
+
+ /*
+ * Reset the scan period if the task is being rescheduled on an
+ * alternative node to recheck if the tasks is now properly placed.
+ */
+ p->numa_scan_period = task_scan_min(p);
+
+ if (env.best_task == NULL) {
+ ret = migrate_task_to(p, env.best_cpu);
+ if (ret != 0)
+ trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
+ return ret;
+ }
+
+ ret = migrate_swap(p, env.best_task);
+ if (ret != 0)
+ trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
+ put_task_struct(env.best_task);
+ return ret;
+}
+
+/* Attempt to migrate a task to a CPU on the preferred node. */
+static void numa_migrate_preferred(struct task_struct *p)
+{
+ unsigned long interval = HZ;
+
+ /* This task has no NUMA fault statistics yet */
+ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
+ return;
+
+ /* Periodically retry migrating the task to the preferred node */
+ interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
+ p->numa_migrate_retry = jiffies + interval;
+
+ /* Success if task is already running on preferred CPU */
+ if (task_node(p) == p->numa_preferred_nid)
+ return;
+
+ /* Otherwise, try migrate to a CPU on the preferred node */
+ task_numa_migrate(p);
+}
+
+/*
+ * Find the nodes on which the workload is actively running. We do this by
+ * tracking the nodes from which NUMA hinting faults are triggered. This can
+ * be different from the set of nodes where the workload's memory is currently
+ * located.
+ *
+ * The bitmask is used to make smarter decisions on when to do NUMA page
+ * migrations, To prevent flip-flopping, and excessive page migrations, nodes
+ * are added when they cause over 6/16 of the maximum number of faults, but
+ * only removed when they drop below 3/16.
+ */
+static void update_numa_active_node_mask(struct numa_group *numa_group)
+{
+ unsigned long faults, max_faults = 0;
+ int nid;
+
+ for_each_online_node(nid) {
+ faults = group_faults_cpu(numa_group, nid);
+ if (faults > max_faults)
+ max_faults = faults;
+ }
+
+ for_each_online_node(nid) {
+ faults = group_faults_cpu(numa_group, nid);
+ if (!node_isset(nid, numa_group->active_nodes)) {
+ if (faults > max_faults * 6 / 16)
+ node_set(nid, numa_group->active_nodes);
+ } else if (faults < max_faults * 3 / 16)
+ node_clear(nid, numa_group->active_nodes);
+ }
+}
+
+/*
+ * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
+ * increments. The more local the fault statistics are, the higher the scan
+ * period will be for the next scan window. If local/remote ratio is below
+ * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
+ * scan period will decrease
+ */
+#define NUMA_PERIOD_SLOTS 10
+#define NUMA_PERIOD_THRESHOLD 3
+
+/*
+ * Increase the scan period (slow down scanning) if the majority of
+ * our memory is already on our local node, or if the majority of
+ * the page accesses are shared with other processes.
+ * Otherwise, decrease the scan period.
+ */
+static void update_task_scan_period(struct task_struct *p,
+ unsigned long shared, unsigned long private)
+{
+ unsigned int period_slot;
+ int ratio;
+ int diff;
+
+ unsigned long remote = p->numa_faults_locality[0];
+ unsigned long local = p->numa_faults_locality[1];
+
+ /*
+ * If there were no record hinting faults then either the task is
+ * completely idle or all activity is areas that are not of interest
+ * to automatic numa balancing. Scan slower
+ */
+ if (local + shared == 0) {
+ p->numa_scan_period = min(p->numa_scan_period_max,
+ p->numa_scan_period << 1);
+
+ p->mm->numa_next_scan = jiffies +
+ msecs_to_jiffies(p->numa_scan_period);
+
+ return;
+ }
+
+ /*
+ * Prepare to scale scan period relative to the current period.
+ * == NUMA_PERIOD_THRESHOLD scan period stays the same
+ * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
+ * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
+ */
+ period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
+ ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+ if (ratio >= NUMA_PERIOD_THRESHOLD) {
+ int slot = ratio - NUMA_PERIOD_THRESHOLD;
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else {
+ diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
+
+ /*
+ * Scale scan rate increases based on sharing. There is an
+ * inverse relationship between the degree of sharing and
+ * the adjustment made to the scanning period. Broadly
+ * speaking the intent is that there is little point
+ * scanning faster if shared accesses dominate as it may
+ * simply bounce migrations uselessly
+ */
+ ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+ diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+ }
+
+ p->numa_scan_period = clamp(p->numa_scan_period + diff,
+ task_scan_min(p), task_scan_max(p));
+ memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+}
+
+/*
+ * Get the fraction of time the task has been running since the last
+ * NUMA placement cycle. The scheduler keeps similar statistics, but
+ * decays those on a 32ms period, which is orders of magnitude off
+ * from the dozens-of-seconds NUMA balancing period. Use the scheduler
+ * stats only if the task is so new there are no NUMA statistics yet.
+ */
+static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
+{
+ u64 runtime, delta, now;
+ /* Use the start of this time slice to avoid calculations. */
+ now = p->se.exec_start;
+ runtime = p->se.sum_exec_runtime;
+
+ if (p->last_task_numa_placement) {
+ delta = runtime - p->last_sum_exec_runtime;
+ *period = now - p->last_task_numa_placement;
+ } else {
+ delta = p->se.avg.runnable_avg_sum;
+ *period = p->se.avg.runnable_avg_period;
+ }
+
+ p->last_sum_exec_runtime = runtime;
+ p->last_task_numa_placement = now;
+
+ return delta;
+}
+
+static void task_numa_placement(struct task_struct *p)
+{
+ int seq, nid, max_nid = -1, max_group_nid = -1;
+ unsigned long max_faults = 0, max_group_faults = 0;
+ unsigned long fault_types[2] = { 0, 0 };
+ unsigned long total_faults;
+ u64 runtime, period;
+ spinlock_t *group_lock = NULL;
+
+ seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+ if (p->numa_scan_seq == seq)
+ return;
+ p->numa_scan_seq = seq;
+ p->numa_scan_period_max = task_scan_max(p);
+
+ total_faults = p->numa_faults_locality[0] +
+ p->numa_faults_locality[1];
+ runtime = numa_get_avg_runtime(p, &period);
+
+ /* If the task is part of a group prevent parallel updates to group stats */
+ if (p->numa_group) {
+ group_lock = &p->numa_group->lock;
+ spin_lock_irq(group_lock);
+ }
+
+ /* Find the node with the highest number of faults */
+ for_each_online_node(nid) {
+ unsigned long faults = 0, group_faults = 0;
+ int priv, i;
+
+ for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
+ long diff, f_diff, f_weight;
+
+ i = task_faults_idx(nid, priv);
+
+ /* Decay existing window, copy faults since last scan */
+ diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
+ fault_types[priv] += p->numa_faults_buffer_memory[i];
+ p->numa_faults_buffer_memory[i] = 0;
+
+ /*
+ * Normalize the faults_from, so all tasks in a group
+ * count according to CPU use, instead of by the raw
+ * number of faults. Tasks with little runtime have
+ * little over-all impact on throughput, and thus their
+ * faults are less important.
+ */
+ f_weight = div64_u64(runtime << 16, period + 1);
+ f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+ (total_faults + 1);
+ f_diff = f_weight - p->numa_faults_cpu[i] / 2;
+ p->numa_faults_buffer_cpu[i] = 0;
+
+ p->numa_faults_memory[i] += diff;
+ p->numa_faults_cpu[i] += f_diff;
+ faults += p->numa_faults_memory[i];
+ p->total_numa_faults += diff;
+ if (p->numa_group) {
+ /* safe because we can only change our own group */
+ p->numa_group->faults[i] += diff;
+ p->numa_group->faults_cpu[i] += f_diff;
+ p->numa_group->total_faults += diff;
+ group_faults += p->numa_group->faults[i];
+ }
+ }
+
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_nid = nid;
+ }
+
+ if (group_faults > max_group_faults) {
+ max_group_faults = group_faults;
+ max_group_nid = nid;
+ }
+ }
+
+ update_task_scan_period(p, fault_types[0], fault_types[1]);
+
+ if (p->numa_group) {
+ update_numa_active_node_mask(p->numa_group);
+ /*
+ * If the preferred task and group nids are different,
+ * iterate over the nodes again to find the best place.
+ */
+ if (max_nid != max_group_nid) {
+ unsigned long weight, max_weight = 0;
+
+ for_each_online_node(nid) {
+ weight = task_weight(p, nid) + group_weight(p, nid);
+ if (weight > max_weight) {
+ max_weight = weight;
+ max_nid = nid;
+ }
+ }
+ }
+
+ spin_unlock_irq(group_lock);
+ }
+
+ /* Preferred node as the node with the most faults */
+ if (max_faults && max_nid != p->numa_preferred_nid) {
+ /* Update the preferred nid and migrate task if possible */
+ sched_setnuma(p, max_nid);
+ numa_migrate_preferred(p);
+ }
+}
+
+static inline int get_numa_group(struct numa_group *grp)
+{
+ return atomic_inc_not_zero(&grp->refcount);
+}
+
+static inline void put_numa_group(struct numa_group *grp)
+{
+ if (atomic_dec_and_test(&grp->refcount))
+ kfree_rcu(grp, rcu);
+}
+
+static void task_numa_group(struct task_struct *p, int cpupid, int flags,
+ int *priv)
+{
+ struct numa_group *grp, *my_grp;
+ struct task_struct *tsk;
+ bool join = false;
+ int cpu = cpupid_to_cpu(cpupid);
+ int i;
+
+ if (unlikely(!p->numa_group)) {
+ unsigned int size = sizeof(struct numa_group) +
+ 4*nr_node_ids*sizeof(unsigned long);
+
+ grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+ if (!grp)
+ return;
+
+ atomic_set(&grp->refcount, 1);
+ spin_lock_init(&grp->lock);
+ INIT_LIST_HEAD(&grp->task_list);
+ grp->gid = p->pid;
+ /* Second half of the array tracks nids where faults happen */
+ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
+ nr_node_ids;
+
+ node_set(task_node(current), grp->active_nodes);
+
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ grp->faults[i] = p->numa_faults_memory[i];
+
+ grp->total_faults = p->total_numa_faults;
+
+ list_add(&p->numa_entry, &grp->task_list);
+ grp->nr_tasks++;
+ rcu_assign_pointer(p->numa_group, grp);
+ }
- load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
- load_avg -= cfs_rq->load_contribution;
+ rcu_read_lock();
+ tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+
+ if (!cpupid_match_pid(tsk, cpupid))
+ goto no_join;
+
+ grp = rcu_dereference(tsk->numa_group);
+ if (!grp)
+ goto no_join;
+
+ my_grp = p->numa_group;
+ if (grp == my_grp)
+ goto no_join;
+
+ /*
+ * Only join the other group if its bigger; if we're the bigger group,
+ * the other task will join us.
+ */
+ if (my_grp->nr_tasks > grp->nr_tasks)
+ goto no_join;
+
+ /*
+ * Tie-break on the grp address.
+ */
+ if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
+ goto no_join;
+
+ /* Always join threads in the same process. */
+ if (tsk->mm == current->mm)
+ join = true;
+
+ /* Simple filter to avoid false positives due to PID collisions */
+ if (flags & TNF_SHARED)
+ join = true;
+
+ /* Update priv based on whether false sharing was detected */
+ *priv = !join;
+
+ if (join && !get_numa_group(grp))
+ goto no_join;
- if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
- atomic_add(load_avg, &tg->load_weight);
- cfs_rq->load_contribution += load_avg;
+ rcu_read_unlock();
+
+ if (!join)
+ return;
+
+ BUG_ON(irqs_disabled());
+ double_lock_irq(&my_grp->lock, &grp->lock);
+
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
+ my_grp->faults[i] -= p->numa_faults_memory[i];
+ grp->faults[i] += p->numa_faults_memory[i];
}
+ my_grp->total_faults -= p->total_numa_faults;
+ grp->total_faults += p->total_numa_faults;
+
+ list_move(&p->numa_entry, &grp->task_list);
+ my_grp->nr_tasks--;
+ grp->nr_tasks++;
+
+ spin_unlock(&my_grp->lock);
+ spin_unlock_irq(&grp->lock);
+
+ rcu_assign_pointer(p->numa_group, grp);
+
+ put_numa_group(my_grp);
+ return;
+
+no_join:
+ rcu_read_unlock();
+ return;
}
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+void task_numa_free(struct task_struct *p)
{
- u64 period = sysctl_sched_shares_window;
- u64 now, delta;
- unsigned long load = cfs_rq->load.weight;
+ struct numa_group *grp = p->numa_group;
+ void *numa_faults = p->numa_faults_memory;
+ unsigned long flags;
+ int i;
+
+ if (grp) {
+ spin_lock_irqsave(&grp->lock, flags);
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ grp->faults[i] -= p->numa_faults_memory[i];
+ grp->total_faults -= p->total_numa_faults;
+
+ list_del(&p->numa_entry);
+ grp->nr_tasks--;
+ spin_unlock_irqrestore(&grp->lock, flags);
+ rcu_assign_pointer(p->numa_group, NULL);
+ put_numa_group(grp);
+ }
+
+ p->numa_faults_memory = NULL;
+ p->numa_faults_buffer_memory = NULL;
+ p->numa_faults_cpu= NULL;
+ p->numa_faults_buffer_cpu = NULL;
+ kfree(numa_faults);
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
+{
+ struct task_struct *p = current;
+ bool migrated = flags & TNF_MIGRATED;
+ int cpu_node = task_node(current);
+ int local = !!(flags & TNF_FAULT_LOCAL);
+ int priv;
+
+ if (!numabalancing_enabled)
+ return;
+
+ /* for example, ksmd faulting in a user's mm */
+ if (!p->mm)
+ return;
- if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
+ /* Do not worry about placement if exiting */
+ if (p->state == TASK_DEAD)
return;
- now = rq_of(cfs_rq)->clock_task;
- delta = now - cfs_rq->load_stamp;
+ /* Allocate buffer to track faults on a per-node basis */
+ if (unlikely(!p->numa_faults_memory)) {
+ int size = sizeof(*p->numa_faults_memory) *
+ NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
+
+ p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
+ if (!p->numa_faults_memory)
+ return;
- /* truncate load history at 4 idle periods */
- if (cfs_rq->load_stamp > cfs_rq->load_last &&
- now - cfs_rq->load_last > 4 * period) {
- cfs_rq->load_period = 0;
- cfs_rq->load_avg = 0;
- delta = period - 1;
+ BUG_ON(p->numa_faults_buffer_memory);
+ /*
+ * The averaged statistics, shared & private, memory & cpu,
+ * occupy the first half of the array. The second half of the
+ * array is for current counters, which are averaged into the
+ * first set by task_numa_placement.
+ */
+ p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
+ p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
+ p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
+ p->total_numa_faults = 0;
+ memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
- cfs_rq->load_stamp = now;
- cfs_rq->load_unacc_exec_time = 0;
- cfs_rq->load_period += delta;
- if (load) {
- cfs_rq->load_last = now;
- cfs_rq->load_avg += delta * load;
+ /*
+ * First accesses are treated as private, otherwise consider accesses
+ * to be private if the accessing pid has not changed
+ */
+ if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
+ priv = 1;
+ } else {
+ priv = cpupid_match_pid(p, last_cpupid);
+ if (!priv && !(flags & TNF_NO_GROUP))
+ task_numa_group(p, last_cpupid, flags, &priv);
}
- /* consider updating load contribution on each fold or truncate */
- if (global_update || cfs_rq->load_period > period
- || !cfs_rq->load_period)
- update_cfs_rq_load_contribution(cfs_rq, global_update);
+ /*
+ * If a workload spans multiple NUMA nodes, a shared fault that
+ * occurs wholly within the set of nodes that the workload is
+ * actively using should be counted as local. This allows the
+ * scan rate to slow down when a workload has settled down.
+ */
+ if (!priv && !local && p->numa_group &&
+ node_isset(cpu_node, p->numa_group->active_nodes) &&
+ node_isset(mem_node, p->numa_group->active_nodes))
+ local = 1;
+
+ task_numa_placement(p);
+
+ /*
+ * Retry task to preferred node migration periodically, in case it
+ * case it previously failed, or the scheduler moved us.
+ */
+ if (time_after(jiffies, p->numa_migrate_retry))
+ numa_migrate_preferred(p);
+
+ if (migrated)
+ p->numa_pages_migrated += pages;
+
+ p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
+ p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
+ p->numa_faults_locality[local] += pages;
+}
+
+static void reset_ptenuma_scan(struct task_struct *p)
+{
+ ACCESS_ONCE(p->mm->numa_scan_seq)++;
+ p->mm->numa_scan_offset = 0;
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+ unsigned long migrate, next_scan, now = jiffies;
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ struct vm_area_struct *vma;
+ unsigned long start, end;
+ unsigned long nr_pte_updates = 0;
+ long pages;
+
+ WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+ work->next = work; /* protect against double add */
+ /*
+ * Who cares about NUMA placement when they're dying.
+ *
+ * NOTE: make sure not to dereference p->mm before this check,
+ * exit_task_work() happens _after_ exit_mm() so we could be called
+ * without p->mm even though we still had it when we enqueued this
+ * work.
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ if (!mm->numa_next_scan) {
+ mm->numa_next_scan = now +
+ msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ }
+
+ /*
+ * Enforce maximal scan/migration frequency..
+ */
+ migrate = mm->numa_next_scan;
+ if (time_before(now, migrate))
+ return;
+
+ if (p->numa_scan_period == 0) {
+ p->numa_scan_period_max = task_scan_max(p);
+ p->numa_scan_period = task_scan_min(p);
+ }
+
+ next_scan = now + msecs_to_jiffies(p->numa_scan_period);
+ if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+ return;
+
+ /*
+ * Delay this task enough that another task of this mm will likely win
+ * the next time around.
+ */
+ p->node_stamp += 2 * TICK_NSEC;
+
+ start = mm->numa_scan_offset;
+ pages = sysctl_numa_balancing_scan_size;
+ pages <<= 20 - PAGE_SHIFT; /* MB in pages */
+ if (!pages)
+ return;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, start);
+ if (!vma) {
+ reset_ptenuma_scan(p);
+ start = 0;
+ vma = mm->mmap;
+ }
+ for (; vma; vma = vma->vm_next) {
+ if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
+ continue;
+
+ /*
+ * Shared library pages mapped by multiple processes are not
+ * migrated as it is expected they are cache replicated. Avoid
+ * hinting faults in read-only file-backed mappings or the vdso
+ * as migrating the pages will be of marginal benefit.
+ */
+ if (!vma->vm_mm ||
+ (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+ continue;
- while (cfs_rq->load_period > period) {
/*
- * Inline assembly required to prevent the compiler
- * optimising this loop into a divmod call.
- * See __iter_div_u64_rem() for another example of this.
+ * Skip inaccessible VMAs to avoid any confusion between
+ * PROT_NONE and NUMA hinting ptes
*/
- asm("" : "+rm" (cfs_rq->load_period));
- cfs_rq->load_period /= 2;
- cfs_rq->load_avg /= 2;
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+ continue;
+
+ do {
+ start = max(start, vma->vm_start);
+ end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
+ end = min(end, vma->vm_end);
+ nr_pte_updates += change_prot_numa(vma, start, end);
+
+ /*
+ * Scan sysctl_numa_balancing_scan_size but ensure that
+ * at least one PTE is updated so that unused virtual
+ * address space is quickly skipped.
+ */
+ if (nr_pte_updates)
+ pages -= (end - start) >> PAGE_SHIFT;
+
+ start = end;
+ if (pages <= 0)
+ goto out;
+
+ cond_resched();
+ } while (end != vma->vm_end);
+ }
+
+out:
+ /*
+ * It is possible to reach the end of the VMA list but the last few
+ * VMAs are not guaranteed to the vma_migratable. If they are not, we
+ * would find the !migratable VMA on the next scan but not reset the
+ * scanner to the start so check it now.
+ */
+ if (vma)
+ mm->numa_scan_offset = start;
+ else
+ reset_ptenuma_scan(p);
+ up_read(&mm->mmap_sem);
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+ struct callback_head *work = &curr->numa_work;
+ u64 period, now;
+
+ /*
+ * We don't care about NUMA placement if we don't have memory.
+ */
+ if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+ return;
+
+ /*
+ * Using runtime rather than walltime has the dual advantage that
+ * we (mostly) drive the selection from busy threads and that the
+ * task needs to have done some actual work before we bother with
+ * NUMA placement.
+ */
+ now = curr->se.sum_exec_runtime;
+ period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+ if (now - curr->node_stamp > period) {
+ if (!curr->node_stamp)
+ curr->numa_scan_period = task_scan_min(curr);
+ curr->node_stamp += period;
+
+ if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+ init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+ task_work_add(curr, work, true);
+ }
}
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
- if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
- list_del_leaf_cfs_rq(cfs_rq);
+static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
}
+#endif /* CONFIG_NUMA_BALANCING */
+static void
+account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ update_load_add(&cfs_rq->load, se->load.weight);
+ if (!parent_entity(se))
+ update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
+#ifdef CONFIG_SMP
+ if (entity_is_task(se)) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ account_numa_enqueue(rq, task_of(se));
+ list_add(&se->group_node, &rq->cfs_tasks);
+ }
+#endif
+ cfs_rq->nr_running++;
+}
+
+static void
+account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ update_load_sub(&cfs_rq->load, se->load.weight);
+ if (!parent_entity(se))
+ update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
+ if (entity_is_task(se)) {
+ account_numa_dequeue(rq_of(cfs_rq), task_of(se));
+ list_del_init(&se->group_node);
+ }
+ cfs_rq->nr_running--;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+# ifdef CONFIG_SMP
static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
{
long tg_weight;
@@ -891,8 +2075,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
* to gain a more accurate current total weight. See
* update_cfs_rq_load_contribution().
*/
- tg_weight = atomic_read(&tg->load_weight);
- tg_weight -= cfs_rq->load_contribution;
+ tg_weight = atomic_long_read(&tg->load_avg);
+ tg_weight -= cfs_rq->tg_load_contrib;
tg_weight += cfs_rq->load.weight;
return tg_weight;
@@ -916,27 +2100,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
return shares;
}
-
-static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
- if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
- update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq);
- }
-}
# else /* CONFIG_SMP */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-}
-
static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
return tg->shares;
}
-
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-}
# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
@@ -954,6 +2122,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
account_entity_enqueue(cfs_rq, se);
}
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
static void update_cfs_shares(struct cfs_rq *cfs_rq)
{
struct task_group *tg;
@@ -973,19 +2143,513 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+#ifdef CONFIG_SMP
+/*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables below are dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
+
+/* Precomputed fixed inverse multiplies for multiplication by y^n */
+static const u32 runnable_avg_yN_inv[] = {
+ 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
+ 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
+ 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
+ 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
+ 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
+ 0x85aac367, 0x82cd8698,
+};
+
+/*
+ * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
+ * over-estimates when re-combining.
+ */
+static const u32 runnable_avg_yN_sum[] = {
+ 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
+ 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
+ 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
+};
+
+/*
+ * Approximate:
+ * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
+ */
+static __always_inline u64 decay_load(u64 val, u64 n)
+{
+ unsigned int local_n;
+
+ if (!n)
+ return val;
+ else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+ return 0;
+
+ /* after bounds checking we can collapse to 32-bit */
+ local_n = n;
+
+ /*
+ * As y^PERIOD = 1/2, we can combine
+ * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
+ * With a look-up table which covers k^n (n<PERIOD)
+ *
+ * To achieve constant time decay_load.
+ */
+ if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
+ val >>= local_n / LOAD_AVG_PERIOD;
+ local_n %= LOAD_AVG_PERIOD;
+ }
+
+ val *= runnable_avg_yN_inv[local_n];
+ /* We don't use SRR here since we always want to round down. */
+ return val >> 32;
+}
+
+/*
+ * For updates fully spanning n periods, the contribution to runnable
+ * average will be: \Sum 1024*y^n
+ *
+ * We can compute this reasonably efficiently by combining:
+ * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
+ */
+static u32 __compute_runnable_contrib(u64 n)
+{
+ u32 contrib = 0;
+
+ if (likely(n <= LOAD_AVG_PERIOD))
+ return runnable_avg_yN_sum[n];
+ else if (unlikely(n >= LOAD_AVG_MAX_N))
+ return LOAD_AVG_MAX;
+
+ /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
+ do {
+ contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
+ contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
+
+ n -= LOAD_AVG_PERIOD;
+ } while (n > LOAD_AVG_PERIOD);
+
+ contrib = decay_load(contrib, n);
+ return contrib + runnable_avg_yN_sum[n];
+}
+
+/*
+ * We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series. To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ * p0 p1 p2
+ * (now) (~1ms ago) (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our co-efficients, yielding the
+ * following representation of historical load:
+ * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the with of a reasonably scheduling period, fixing:
+ * y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int __update_entity_runnable_avg(u64 now,
+ struct sched_avg *sa,
+ int runnable)
{
+ u64 delta, periods;
+ u32 runnable_contrib;
+ int delta_w, decayed = 0;
+
+ delta = now - sa->last_runnable_update;
+ /*
+ * This should only happen when time goes backwards, which it
+ * unfortunately does during sched clock init when we swap over to TSC.
+ */
+ if ((s64)delta < 0) {
+ sa->last_runnable_update = now;
+ return 0;
+ }
+
+ /*
+ * Use 1024ns as the unit of measurement since it's a reasonable
+ * approximation of 1us and fast to compute.
+ */
+ delta >>= 10;
+ if (!delta)
+ return 0;
+ sa->last_runnable_update = now;
+
+ /* delta_w is the amount already accumulated against our next period */
+ delta_w = sa->runnable_avg_period % 1024;
+ if (delta + delta_w >= 1024) {
+ /* period roll-over */
+ decayed = 1;
+
+ /*
+ * Now that we know we're crossing a period boundary, figure
+ * out how much from delta we need to complete the current
+ * period and accrue it.
+ */
+ delta_w = 1024 - delta_w;
+ if (runnable)
+ sa->runnable_avg_sum += delta_w;
+ sa->runnable_avg_period += delta_w;
+
+ delta -= delta_w;
+
+ /* Figure out how many additional periods this update spans */
+ periods = delta / 1024;
+ delta %= 1024;
+
+ sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
+ periods + 1);
+ sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
+ periods + 1);
+
+ /* Efficiently calculate \sum (1..n_period) 1024*y^i */
+ runnable_contrib = __compute_runnable_contrib(periods);
+ if (runnable)
+ sa->runnable_avg_sum += runnable_contrib;
+ sa->runnable_avg_period += runnable_contrib;
+ }
+
+ /* Remainder of delta accrued against u_0` */
+ if (runnable)
+ sa->runnable_avg_sum += delta;
+ sa->runnable_avg_period += delta;
+
+ return decayed;
}
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+/* Synchronize an entity's decay with its parenting cfs_rq.*/
+static inline u64 __synchronize_entity_decay(struct sched_entity *se)
{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 decays = atomic64_read(&cfs_rq->decay_counter);
+
+ decays -= se->avg.decay_count;
+ if (!decays)
+ return 0;
+
+ se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+ se->avg.decay_count = 0;
+
+ return decays;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+ int force_update)
+{
+ struct task_group *tg = cfs_rq->tg;
+ long tg_contrib;
+
+ tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
+ tg_contrib -= cfs_rq->tg_load_contrib;
+
+ if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
+ atomic_long_add(tg_contrib, &tg->load_avg);
+ cfs_rq->tg_load_contrib += tg_contrib;
+ }
+}
+
+/*
+ * Aggregate cfs_rq runnable averages into an equivalent task_group
+ * representation for computing load contributions.
+ */
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+ struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg = cfs_rq->tg;
+ long contrib;
+
+ /* The fraction of a cpu used by this cfs_rq */
+ contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
+ sa->runnable_avg_period + 1);
+ contrib -= cfs_rq->tg_runnable_contrib;
+
+ if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
+ atomic_add(contrib, &tg->runnable_avg);
+ cfs_rq->tg_runnable_contrib += contrib;
+ }
}
+
+static inline void __update_group_entity_contrib(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
+ struct task_group *tg = cfs_rq->tg;
+ int runnable_avg;
+
+ u64 contrib;
+
+ contrib = cfs_rq->tg_load_contrib * tg->shares;
+ se->avg.load_avg_contrib = div_u64(contrib,
+ atomic_long_read(&tg->load_avg) + 1);
+
+ /*
+ * For group entities we need to compute a correction term in the case
+ * that they are consuming <1 cpu so that we would contribute the same
+ * load as a task of equal weight.
+ *
+ * Explicitly co-ordinating this measurement would be expensive, but
+ * fortunately the sum of each cpus contribution forms a usable
+ * lower-bound on the true value.
+ *
+ * Consider the aggregate of 2 contributions. Either they are disjoint
+ * (and the sum represents true value) or they are disjoint and we are
+ * understating by the aggregate of their overlap.
+ *
+ * Extending this to N cpus, for a given overlap, the maximum amount we
+ * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
+ * cpus that overlap for this interval and w_i is the interval width.
+ *
+ * On a small machine; the first term is well-bounded which bounds the
+ * total error since w_i is a subset of the period. Whereas on a
+ * larger machine, while this first term can be larger, if w_i is the
+ * of consequential size guaranteed to see n_i*w_i quickly converge to
+ * our upper bound of 1-cpu.
+ */
+ runnable_avg = atomic_read(&tg->runnable_avg);
+ if (runnable_avg < NICE_0_LOAD) {
+ se->avg.load_avg_contrib *= runnable_avg;
+ se->avg.load_avg_contrib >>= NICE_0_SHIFT;
+ }
+}
+
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
+{
+ __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
+ __update_tg_runnable_avg(&rq->avg, &rq->cfs);
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+ int force_update) {}
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+ struct cfs_rq *cfs_rq) {}
+static inline void __update_group_entity_contrib(struct sched_entity *se) {}
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
+static inline void __update_task_entity_contrib(struct sched_entity *se)
+{
+ u32 contrib;
+
+ /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
+ contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
+ contrib /= (se->avg.runnable_avg_period + 1);
+ se->avg.load_avg_contrib = scale_load(contrib);
+}
+
+/* Compute the current contribution to load_avg by se, return any delta */
+static long __update_entity_load_avg_contrib(struct sched_entity *se)
+{
+ long old_contrib = se->avg.load_avg_contrib;
+
+ if (entity_is_task(se)) {
+ __update_task_entity_contrib(se);
+ } else {
+ __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
+ __update_group_entity_contrib(se);
+ }
+
+ return se->avg.load_avg_contrib - old_contrib;
+}
+
+static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
+ long load_contrib)
+{
+ if (likely(load_contrib < cfs_rq->blocked_load_avg))
+ cfs_rq->blocked_load_avg -= load_contrib;
+ else
+ cfs_rq->blocked_load_avg = 0;
+}
+
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+
+/* Update a sched_entity's runnable average */
+static inline void update_entity_load_avg(struct sched_entity *se,
+ int update_cfs_rq)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ long contrib_delta;
+ u64 now;
+
+ /*
+ * For a group entity we need to use their owned cfs_rq_clock_task() in
+ * case they are the parent of a throttled hierarchy.
+ */
+ if (entity_is_task(se))
+ now = cfs_rq_clock_task(cfs_rq);
+ else
+ now = cfs_rq_clock_task(group_cfs_rq(se));
+
+ if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
+ return;
+
+ contrib_delta = __update_entity_load_avg_contrib(se);
+
+ if (!update_cfs_rq)
+ return;
+
+ if (se->on_rq)
+ cfs_rq->runnable_load_avg += contrib_delta;
+ else
+ subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+}
+
+/*
+ * Decay the load contributed by all blocked children and account this so that
+ * their contribution may appropriately discounted when they wake up.
+ */
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
+{
+ u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
+ u64 decays;
+
+ decays = now - cfs_rq->last_decay;
+ if (!decays && !force_update)
+ return;
+
+ if (atomic_long_read(&cfs_rq->removed_load)) {
+ unsigned long removed_load;
+ removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
+ subtract_blocked_load_contrib(cfs_rq, removed_load);
+ }
+
+ if (decays) {
+ cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
+ decays);
+ atomic64_add(decays, &cfs_rq->decay_counter);
+ cfs_rq->last_decay = now;
+ }
+
+ __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+}
+
+/* Add the load generated by se into cfs_rq's child load-average */
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int wakeup)
+{
+ /*
+ * We track migrations using entity decay_count <= 0, on a wake-up
+ * migration we use a negative decay count to track the remote decays
+ * accumulated while sleeping.
+ *
+ * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
+ * are seen by enqueue_entity_load_avg() as a migration with an already
+ * constructed load_avg_contrib.
+ */
+ if (unlikely(se->avg.decay_count <= 0)) {
+ se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
+ if (se->avg.decay_count) {
+ /*
+ * In a wake-up migration we have to approximate the
+ * time sleeping. This is because we can't synchronize
+ * clock_task between the two cpus, and it is not
+ * guaranteed to be read-safe. Instead, we can
+ * approximate this using our carried decays, which are
+ * explicitly atomically readable.
+ */
+ se->avg.last_runnable_update -= (-se->avg.decay_count)
+ << 20;
+ update_entity_load_avg(se, 0);
+ /* Indicate that we're now synchronized and on-rq */
+ se->avg.decay_count = 0;
+ }
+ wakeup = 0;
+ } else {
+ __synchronize_entity_decay(se);
+ }
+
+ /* migrated tasks did not contribute to our blocked load */
+ if (wakeup) {
+ subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+ update_entity_load_avg(se, 0);
+ }
+
+ cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+ /* we force update consideration on load-balancer moves */
+ update_cfs_rq_blocked_load(cfs_rq, !wakeup);
+}
+
+/*
+ * Remove se's load from this cfs_rq child load-average, if the entity is
+ * transitioning to a blocked state we track its projected decay using
+ * blocked_load_avg.
+ */
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int sleep)
+{
+ update_entity_load_avg(se, 1);
+ /* we force update consideration on load-balancer moves */
+ update_cfs_rq_blocked_load(cfs_rq, !sleep);
+
+ cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+ if (sleep) {
+ cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+ se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ } /* migrations, e.g. sleep=0 leave decay_count == 0 */
+}
+
+/*
+ * Update the rq's load with the elapsed running time before entering
+ * idle. if the last scheduled task is not a CFS task, idle_enter will
+ * be the only way to update the runnable statistic.
+ */
+void idle_enter_fair(struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 1);
+}
+
+/*
+ * Update the rq's load with the elapsed idle time before a task is
+ * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
+ * be the only way to update the runnable statistic.
+ */
+void idle_exit_fair(struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 0);
+}
+
+static int idle_balance(struct rq *this_rq);
+
+#else /* CONFIG_SMP */
+
+static inline void update_entity_load_avg(struct sched_entity *se,
+ int update_cfs_rq) {}
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int wakeup) {}
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int sleep) {}
+static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+ int force_update) {}
+
+static inline int idle_balance(struct rq *rq)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SMP */
+
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
@@ -995,7 +2659,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
tsk = task_of(se);
if (se->statistics.sleep_start) {
- u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
+ u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
if ((s64)delta < 0)
delta = 0;
@@ -1003,6 +2667,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (unlikely(delta > se->statistics.sleep_max))
se->statistics.sleep_max = delta;
+ se->statistics.sleep_start = 0;
se->statistics.sum_sleep_runtime += delta;
if (tsk) {
@@ -1011,7 +2676,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
}
if (se->statistics.block_start) {
- u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
+ u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
if ((s64)delta < 0)
delta = 0;
@@ -1019,6 +2684,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (unlikely(delta > se->statistics.block_max))
se->statistics.block_max = delta;
+ se->statistics.block_start = 0;
se->statistics.sum_sleep_runtime += delta;
if (tsk) {
@@ -1088,9 +2754,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
}
/* ensure we never gain time by being placed backwards. */
- vruntime = max_vruntime(se->vruntime, vruntime);
-
- se->vruntime = vruntime;
+ se->vruntime = max_vruntime(se->vruntime, vruntime);
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -1100,7 +2764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update the normalized vruntime before updating min_vruntime
- * through callig update_curr().
+ * through calling update_curr().
*/
if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
se->vruntime += cfs_rq->min_vruntime;
@@ -1109,7 +2773,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- update_cfs_load(cfs_rq, 0);
+ enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
@@ -1134,10 +2798,10 @@ static void __clear_buddies_last(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->last == se)
- cfs_rq->last = NULL;
- else
+ if (cfs_rq->last != se)
break;
+
+ cfs_rq->last = NULL;
}
}
@@ -1145,10 +2809,10 @@ static void __clear_buddies_next(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->next == se)
- cfs_rq->next = NULL;
- else
+ if (cfs_rq->next != se)
break;
+
+ cfs_rq->next = NULL;
}
}
@@ -1156,10 +2820,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->skip == se)
- cfs_rq->skip = NULL;
- else
+ if (cfs_rq->skip != se)
break;
+
+ cfs_rq->skip = NULL;
}
}
@@ -1175,7 +2839,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
__clear_buddies_skip(se);
}
-static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -1184,6 +2848,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
update_stats_dequeue(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
@@ -1192,9 +2857,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
- se->statistics.sleep_start = rq_of(cfs_rq)->clock;
+ se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->statistics.block_start = rq_of(cfs_rq)->clock;
+ se->statistics.block_start = rq_clock(rq_of(cfs_rq));
}
#endif
}
@@ -1204,7 +2869,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
- update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
/*
@@ -1302,17 +2966,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
* 3) pick the "last" process, for cache locality
* 4) do not run the "skip" process, if something else is available
*/
-static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *
+pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
- struct sched_entity *se = __pick_first_entity(cfs_rq);
- struct sched_entity *left = se;
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+ struct sched_entity *se;
+
+ /*
+ * If curr is set we have to see if its left of the leftmost entity
+ * still in the tree, provided there was anything in the tree at all.
+ */
+ if (!left || (curr && entity_before(curr, left)))
+ left = curr;
+
+ se = left; /* ideally we run the leftmost entity */
/*
* Avoid running the skip buddy, if running something else can
* be done without getting too unfair.
*/
if (cfs_rq->skip == se) {
- struct sched_entity *second = __pick_next_entity(se);
+ struct sched_entity *second;
+
+ if (se == curr) {
+ second = __pick_first_entity(cfs_rq);
+ } else {
+ second = __pick_next_entity(se);
+ if (!second || (curr && entity_before(curr, second)))
+ second = curr;
+ }
+
if (second && wakeup_preempt_entity(second, left) < 1)
se = second;
}
@@ -1334,7 +3017,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
return se;
}
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
@@ -1353,6 +3036,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
+ /* in !on_rq case, update occurred at dequeue */
+ update_entity_load_avg(prev, 1);
}
cfs_rq->curr = NULL;
}
@@ -1366,9 +3051,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
update_curr(cfs_rq);
/*
- * Update share accounting for long-running entities.
+ * Ensure that runnable average is periodically updated.
*/
- update_entity_shares_tick(cfs_rq);
+ update_entity_load_avg(curr, 1);
+ update_cfs_rq_blocked_load(cfs_rq, 1);
+ update_cfs_shares(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -1399,20 +3086,21 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
#ifdef CONFIG_CFS_BANDWIDTH
#ifdef HAVE_JUMP_LABEL
-static struct jump_label_key __cfs_bandwidth_used;
+static struct static_key __cfs_bandwidth_used;
static inline bool cfs_bandwidth_used(void)
{
- return static_branch(&__cfs_bandwidth_used);
+ return static_key_false(&__cfs_bandwidth_used);
+}
+
+void cfs_bandwidth_usage_inc(void)
+{
+ static_key_slow_inc(&__cfs_bandwidth_used);
}
-void account_cfs_bandwidth_used(int enabled, int was_enabled)
+void cfs_bandwidth_usage_dec(void)
{
- /* only need to count groups transitioning between enabled/!enabled */
- if (enabled && !was_enabled)
- jump_label_inc(&__cfs_bandwidth_used);
- else if (!enabled && was_enabled)
- jump_label_dec(&__cfs_bandwidth_used);
+ static_key_slow_dec(&__cfs_bandwidth_used);
}
#else /* HAVE_JUMP_LABEL */
static bool cfs_bandwidth_used(void)
@@ -1420,7 +3108,8 @@ static bool cfs_bandwidth_used(void)
return true;
}
-void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
+void cfs_bandwidth_usage_inc(void) {}
+void cfs_bandwidth_usage_dec(void) {}
#endif /* HAVE_JUMP_LABEL */
/*
@@ -1461,6 +3150,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return &tg->cfs_bandwidth;
}
+/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+ if (unlikely(cfs_rq->throttle_count))
+ return cfs_rq->throttled_clock_task;
+
+ return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+}
+
/* returns 0 on failure to allocate runtime */
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
@@ -1483,7 +3181,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
*/
if (!cfs_b->timer_active) {
__refill_cfs_bandwidth_runtime(cfs_b);
- __start_cfs_bandwidth(cfs_b);
+ __start_cfs_bandwidth(cfs_b, false);
}
if (cfs_b->runtime > 0) {
@@ -1514,10 +3212,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct rq *rq = rq_of(cfs_rq);
/* if the deadline is ahead of our clock, nothing to do */
- if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
+ if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
return;
if (cfs_rq->runtime_remaining < 0)
@@ -1529,10 +3226,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
* has not truly expired.
*
* Fortunately we can check determine whether this the case by checking
- * whether the global deadline has advanced.
+ * whether the global deadline has advanced. It is valid to compare
+ * cfs_b->runtime_expires without any locks since we only care about
+ * exact equality, so a partial write will still work.
*/
- if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
+ if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
/* extend local deadline, drift is bounded above by 2 ticks */
cfs_rq->runtime_expires += TICK_NSEC;
} else {
@@ -1541,8 +3240,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
}
}
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec)
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
@@ -1559,8 +3257,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
resched_task(rq_of(cfs_rq)->curr);
}
-static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec)
+static __always_inline
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
return;
@@ -1605,14 +3303,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttle_count--;
#ifdef CONFIG_SMP
if (!cfs_rq->throttle_count) {
- u64 delta = rq->clock_task - cfs_rq->load_stamp;
-
- /* leaving throttled state, advance shares averaging windows */
- cfs_rq->load_stamp += delta;
- cfs_rq->load_last += delta;
-
- /* update entity weight now that we are on_rq again */
- update_cfs_shares(cfs_rq);
+ /* adjust cfs_rq_clock_task() */
+ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
+ cfs_rq->throttled_clock_task;
}
#endif
@@ -1624,9 +3317,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- /* group is entering throttled state, record last load */
+ /* group is entering throttled state, stop time */
if (!cfs_rq->throttle_count)
- update_cfs_load(cfs_rq, 0);
+ cfs_rq->throttled_clock_task = rq_clock_task(rq);
cfs_rq->throttle_count++;
return 0;
@@ -1641,7 +3334,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
- /* account load preceding throttle */
+ /* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
@@ -1662,12 +3355,14 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
}
if (!se)
- rq->nr_running -= task_delta;
+ sub_nr_running(rq, task_delta);
cfs_rq->throttled = 1;
- cfs_rq->throttled_timestamp = rq->clock;
+ cfs_rq->throttled_clock = rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ if (!cfs_b->timer_active)
+ __start_cfs_bandwidth(cfs_b, false);
raw_spin_unlock(&cfs_b->lock);
}
@@ -1679,16 +3374,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
int enqueue = 1;
long task_delta;
- se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+ se = cfs_rq->tg->se[cpu_of(rq)];
cfs_rq->throttled = 0;
+
+ update_rq_clock(rq);
+
raw_spin_lock(&cfs_b->lock);
- cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
+ cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
- cfs_rq->throttled_timestamp = 0;
- update_rq_clock(rq);
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
@@ -1710,7 +3406,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
}
if (!se)
- rq->nr_running += task_delta;
+ add_nr_running(rq, task_delta);
/* determine whether we need to wake up potentially idle cpu */
if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -1764,28 +3460,35 @@ next:
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
{
u64 runtime, runtime_expires;
- int idle = 1, throttled;
+ int throttled;
- raw_spin_lock(&cfs_b->lock);
/* no need to continue the timer with no bandwidth constraint */
if (cfs_b->quota == RUNTIME_INF)
- goto out_unlock;
+ goto out_deactivate;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
- /* idle depends on !throttled (for the case of a large deficit) */
- idle = cfs_b->idle && !throttled;
cfs_b->nr_periods += overrun;
- /* if we're going inactive then everything else can be deferred */
- if (idle)
- goto out_unlock;
+ /*
+ * idle depends on !throttled (for the case of a large deficit), and if
+ * we're going inactive then everything else can be deferred
+ */
+ if (cfs_b->idle && !throttled)
+ goto out_deactivate;
+
+ /*
+ * if we have relooped after returning idle once, we need to update our
+ * status as actually running, so that other cpus doing
+ * __start_cfs_bandwidth will stop trying to cancel us.
+ */
+ cfs_b->timer_active = 1;
__refill_cfs_bandwidth_runtime(cfs_b);
if (!throttled) {
/* mark as potentially idle for the upcoming period */
cfs_b->idle = 1;
- goto out_unlock;
+ return 0;
}
/* account preceding periods in which throttling occurred */
@@ -1825,12 +3528,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
* timer to remain active while there are any throttled entities.)
*/
cfs_b->idle = 0;
-out_unlock:
- if (idle)
- cfs_b->timer_active = 0;
- raw_spin_unlock(&cfs_b->lock);
- return idle;
+ return 0;
+
+out_deactivate:
+ cfs_b->timer_active = 0;
+ return 1;
}
/* a cfs_rq won't donate quota below this amount */
@@ -1840,7 +3543,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
/* how long we wait to gather additional slack before distributing */
static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
-/* are we near the end of the current quota period? */
+/*
+ * Are we near the end of the current quota period?
+ *
+ * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
+ * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
+ * migrate_hrtimers, base is never cleared, so we are fine.
+ */
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
struct hrtimer *refresh_timer = &cfs_b->period_timer;
@@ -1916,10 +3625,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
u64 expires;
/* confirm we're still not at a refresh boundary */
- if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+ raw_spin_lock(&cfs_b->lock);
+ if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
+ raw_spin_unlock(&cfs_b->lock);
return;
+ }
- raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
runtime = cfs_b->runtime;
cfs_b->runtime = 0;
@@ -1963,28 +3674,25 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
if (!cfs_bandwidth_used())
- return;
+ return false;
if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
- return;
+ return false;
/*
* it's possible for a throttled entity to be forced into a running
* state (e.g. set_curr_task), in this case we're finished.
*/
if (cfs_rq_throttled(cfs_rq))
- return;
+ return true;
throttle_cfs_rq(cfs_rq);
+ return true;
}
-static inline u64 default_cfs_period(void);
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
-static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
-
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
@@ -2002,6 +3710,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
int overrun;
int idle = 0;
+ raw_spin_lock(&cfs_b->lock);
for (;;) {
now = hrtimer_cb_get_time(timer);
overrun = hrtimer_forward(timer, now, cfs_b->period);
@@ -2011,6 +3720,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
idle = do_sched_cfs_period_timer(cfs_b, overrun);
}
+ raw_spin_unlock(&cfs_b->lock);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
@@ -2036,7 +3746,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
}
/* requires cfs_b->lock, may release to reprogram timer */
-void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
{
/*
* The timer may be active because we're trying to set a new bandwidth
@@ -2044,14 +3754,14 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
* (timer_active==0 becomes visible before the hrtimer call-back
* terminates). In either case we ensure that it's re-programmed
*/
- while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+ while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
+ hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
+ /* bounce the lock to allow do_sched_cfs_period_timer to run */
raw_spin_unlock(&cfs_b->lock);
- /* ensure cfs_b->lock is available while we wait */
- hrtimer_cancel(&cfs_b->period_timer);
-
+ cpu_relax();
raw_spin_lock(&cfs_b->lock);
/* if someone else restarted the timer then we're done */
- if (cfs_b->timer_active)
+ if (!force && cfs_b->timer_active)
return;
}
@@ -2065,13 +3775,11 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
hrtimer_cancel(&cfs_b->slack_timer);
}
-void unthrottle_offline_cfs_rqs(struct rq *rq)
+static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
struct cfs_rq *cfs_rq;
for_each_leaf_cfs_rq(rq, cfs_rq) {
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
if (!cfs_rq->runtime_enabled)
continue;
@@ -2079,18 +3787,22 @@ void unthrottle_offline_cfs_rqs(struct rq *rq)
* clock_task is not advancing so we just need to make sure
* there's some valid quota amount
*/
- cfs_rq->runtime_remaining = cfs_b->quota;
+ cfs_rq->runtime_remaining = 1;
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
}
}
#else /* CONFIG_CFS_BANDWIDTH */
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec) {}
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+ return rq_clock_task(rq_of(cfs_rq));
+}
+
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
-static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
@@ -2119,7 +3831,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return NULL;
}
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
#endif /* CONFIG_CFS_BANDWIDTH */
@@ -2220,12 +3932,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq);
+ update_entity_load_avg(se, 1);
}
- if (!se)
- inc_nr_running(rq);
+ if (!se) {
+ update_rq_runnable_avg(rq, rq->nr_running);
+ add_nr_running(rq, 1);
+ }
hrtick_update(rq);
}
@@ -2279,12 +3993,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq);
+ update_entity_load_avg(se, 1);
}
- if (!se)
- dec_nr_running(rq);
+ if (!se) {
+ sub_nr_running(rq, 1);
+ update_rq_runnable_avg(rq, 1);
+ }
hrtick_update(rq);
}
@@ -2292,7 +4008,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{
- return cpu_rq(cpu)->load.weight;
+ return cpu_rq(cpu)->cfs.runnable_load_avg;
}
/*
@@ -2328,22 +4044,40 @@ static unsigned long target_load(int cpu, int type)
return max(rq->cpu_load[type-1], total);
}
-static unsigned long power_of(int cpu)
+static unsigned long capacity_of(int cpu)
{
- return cpu_rq(cpu)->cpu_power;
+ return cpu_rq(cpu)->cpu_capacity;
}
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
+ unsigned long load_avg = rq->cfs.runnable_load_avg;
if (nr_running)
- return rq->load.weight / nr_running;
+ return load_avg / nr_running;
return 0;
}
+static void record_wakee(struct task_struct *p)
+{
+ /*
+ * Rough decay (wiping) for cost saving, don't worry
+ * about the boundary, really active task won't care
+ * about the loss.
+ */
+ if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
+ current->wakee_flips >>= 1;
+ current->wakee_flip_decay_ts = jiffies;
+ }
+
+ if (current->last_wakee != p) {
+ current->last_wakee = p;
+ current->wakee_flips++;
+ }
+}
static void task_waking_fair(struct task_struct *p)
{
@@ -2364,6 +4098,7 @@ static void task_waking_fair(struct task_struct *p)
#endif
se->vruntime -= min_vruntime;
+ record_wakee(p);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -2474,14 +4209,35 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
}
#else
-static inline unsigned long effective_load(struct task_group *tg, int cpu,
- unsigned long wl, unsigned long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
return wl;
}
#endif
+static int wake_wide(struct task_struct *p)
+{
+ int factor = this_cpu_read(sd_llc_size);
+
+ /*
+ * Yeah, it's the switching-frequency, could means many wakee or
+ * rapidly switch, use factor here will just help to automatically
+ * adjust the loose-degree, so bigger node will lead to more pull.
+ */
+ if (p->wakee_flips > factor) {
+ /*
+ * wakee is somewhat hot, it needs certain amount of cpu
+ * resource, so if waker is far more hot, prefer to leave
+ * it alone.
+ */
+ if (current->wakee_flips > (factor * p->wakee_flips))
+ return 1;
+ }
+
+ return 0;
+}
+
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
s64 this_load, load;
@@ -2491,6 +4247,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
unsigned long weight;
int balanced;
+ /*
+ * If we wake multiple tasks be careful to not bounce
+ * ourselves around too much.
+ */
+ if (wake_wide(p))
+ return 0;
+
idx = sd->wake_idx;
this_cpu = smp_processor_id();
prev_cpu = task_cpu(p);
@@ -2526,12 +4289,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
s64 this_eff_load, prev_eff_load;
this_eff_load = 100;
- this_eff_load *= power_of(prev_cpu);
+ this_eff_load *= capacity_of(prev_cpu);
this_eff_load *= this_load +
effective_load(tg, this_cpu, weight, weight);
prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= power_of(this_cpu);
+ prev_eff_load *= capacity_of(this_cpu);
prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
balanced = this_eff_load <= prev_eff_load;
@@ -2571,12 +4334,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int load_idx)
+ int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
unsigned long min_load = ULONG_MAX, this_load = 0;
+ int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
+ if (sd_flag & SD_BALANCE_WAKE)
+ load_idx = sd->wake_idx;
+
do {
unsigned long load, avg_load;
int local_group;
@@ -2603,8 +4370,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
avg_load += load;
}
- /* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
+ /* Adjust by relative CPU capacity of the group */
+ avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
if (local_group) {
this_load = avg_load;
@@ -2647,31 +4414,22 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
*/
static int select_idle_sibling(struct task_struct *p, int target)
{
- int cpu = smp_processor_id();
- int prev_cpu = task_cpu(p);
struct sched_domain *sd;
struct sched_group *sg;
- int i;
+ int i = task_cpu(p);
- /*
- * If the task is going to be woken-up on this cpu and if it is
- * already idle, then it is the right target.
- */
- if (target == cpu && idle_cpu(cpu))
- return cpu;
+ if (idle_cpu(target))
+ return target;
/*
- * If the task is going to be woken-up on the cpu where it previously
- * ran and if it is currently idle, then it the right target.
+ * If the prevous cpu is cache affine and idle, don't be stupid.
*/
- if (target == prev_cpu && idle_cpu(prev_cpu))
- return prev_cpu;
+ if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
+ return i;
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
*/
- rcu_read_lock();
-
sd = rcu_dereference(per_cpu(sd_llc, target));
for_each_lower_domain(sd) {
sg = sd->groups;
@@ -2681,7 +4439,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
goto next;
for_each_cpu(i, sched_group_cpus(sg)) {
- if (!idle_cpu(i))
+ if (i == target || !idle_cpu(i))
goto next;
}
@@ -2693,34 +4451,31 @@ next:
} while (sg != sd->groups);
}
done:
- rcu_read_unlock();
-
return target;
}
/*
- * sched_balance_self: balance the current task (running on cpu) in domains
- * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
- * SD_BALANCE_EXEC.
+ * select_task_rq_fair: Select target runqueue for the waking task in domains
+ * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
+ * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
*
- * Balance, ie. select the least loaded group.
+ * Balances load by selecting the idlest cpu in the idlest group, or under
+ * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
*
- * Returns the target CPU number, or the same CPU if no balancing is needed.
+ * Returns the target cpu number.
*
* preempt must be disabled.
*/
static int
-select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
- int prev_cpu = task_cpu(p);
int new_cpu = cpu;
int want_affine = 0;
- int want_sd = 1;
int sync = wake_flags & WF_SYNC;
- if (p->rt.nr_cpus_allowed == 1)
+ if (p->nr_cpus_allowed == 1)
return prev_cpu;
if (sd_flag & SD_BALANCE_WAKE) {
@@ -2735,59 +4490,28 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
continue;
/*
- * If power savings logic is enabled for a domain, see if we
- * are not overloaded, if so, don't balance wider.
- */
- if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
- unsigned long power = 0;
- unsigned long nr_running = 0;
- unsigned long capacity;
- int i;
-
- for_each_cpu(i, sched_domain_span(tmp)) {
- power += power_of(i);
- nr_running += cpu_rq(i)->cfs.nr_running;
- }
-
- capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
-
- if (tmp->flags & SD_POWERSAVINGS_BALANCE)
- nr_running /= 2;
-
- if (nr_running < capacity)
- want_sd = 0;
- }
-
- /*
* If both cpu and prev_cpu are part of this domain,
* cpu is a valid SD_WAKE_AFFINE target.
*/
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
affine_sd = tmp;
- want_affine = 0;
- }
-
- if (!want_sd && !want_affine)
break;
+ }
- if (!(tmp->flags & sd_flag))
- continue;
-
- if (want_sd)
+ if (tmp->flags & sd_flag)
sd = tmp;
}
- if (affine_sd) {
- if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
- prev_cpu = cpu;
+ if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ prev_cpu = cpu;
+ if (sd_flag & SD_BALANCE_WAKE) {
new_cpu = select_idle_sibling(p, prev_cpu);
goto unlock;
}
while (sd) {
- int load_idx = sd->forkexec_idx;
struct sched_group *group;
int weight;
@@ -2796,10 +4520,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
continue;
}
- if (sd_flag & SD_BALANCE_WAKE)
- load_idx = sd->wake_idx;
-
- group = find_idlest_group(sd, p, cpu, load_idx);
+ group = find_idlest_group(sd, p, cpu, sd_flag);
if (!group) {
sd = sd->child;
continue;
@@ -2829,6 +4550,34 @@ unlock:
return new_cpu;
}
+
+/*
+ * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
+ * cfs_rq_of(p) references at time of call are still valid and identify the
+ * previous cpu. However, the caller only guarantees p->pi_lock is held; no
+ * other assumptions, including the state of rq->lock, should be made.
+ */
+static void
+migrate_task_rq_fair(struct task_struct *p, int next_cpu)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * Load tracking: accumulate removed load so that it can be processed
+ * when we next update owning cfs_rq under rq->lock. Tasks contribute
+ * to blocked load iff they have a positive decay-count. It can never
+ * be negative here since on-rq tasks have decay-count == 0.
+ */
+ if (se->avg.decay_count) {
+ se->avg.decay_count = -__synchronize_entity_decay(se);
+ atomic_long_add(se->avg.load_avg_contrib,
+ &cfs_rq->removed_load);
+ }
+
+ /* We have migrated, no longer consider this task hot */
+ se->exec_start = 0;
+}
#endif /* CONFIG_SMP */
static unsigned long
@@ -2920,7 +4669,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
/*
- * This is possible from callers such as pull_task(), in which we
+ * This is possible from callers such as move_task(), in which we
* unconditionally check_prempt_curr() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
@@ -2955,7 +4704,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
* Batch and idle tasks do not preempt non-idle tasks (their preemption
* is driven by the tick):
*/
- if (unlikely(p->policy != SCHED_NORMAL))
+ if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
return;
find_matching_se(&se, &pse);
@@ -2991,26 +4740,124 @@ preempt:
set_last_buddy(se);
}
-static struct task_struct *pick_next_task_fair(struct rq *rq)
+static struct task_struct *
+pick_next_task_fair(struct rq *rq, struct task_struct *prev)
{
- struct task_struct *p;
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
+ struct task_struct *p;
+ int new_tasks;
+again:
+#ifdef CONFIG_FAIR_GROUP_SCHED
if (!cfs_rq->nr_running)
- return NULL;
+ goto idle;
+
+ if (prev->sched_class != &fair_sched_class)
+ goto simple;
+
+ /*
+ * Because of the set_next_buddy() in dequeue_task_fair() it is rather
+ * likely that a next task is from the same cgroup as the current.
+ *
+ * Therefore attempt to avoid putting and setting the entire cgroup
+ * hierarchy, only change the part that actually changes.
+ */
+
+ do {
+ struct sched_entity *curr = cfs_rq->curr;
+
+ /*
+ * Since we got here without doing put_prev_entity() we also
+ * have to consider cfs_rq->curr. If it is still a runnable
+ * entity, update_curr() will update its vruntime, otherwise
+ * forget we've ever seen it.
+ */
+ if (curr && curr->on_rq)
+ update_curr(cfs_rq);
+ else
+ curr = NULL;
+
+ /*
+ * This call to check_cfs_rq_runtime() will do the throttle and
+ * dequeue its entity in the parent(s). Therefore the 'simple'
+ * nr_running test will indeed be correct.
+ */
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ goto simple;
+
+ se = pick_next_entity(cfs_rq, curr);
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
+
+ p = task_of(se);
+
+ /*
+ * Since we haven't yet done put_prev_entity and if the selected task
+ * is a different task than we started out with, try and touch the
+ * least amount of cfs_rqs.
+ */
+ if (prev != p) {
+ struct sched_entity *pse = &prev->se;
+
+ while (!(cfs_rq = is_same_group(se, pse))) {
+ int se_depth = se->depth;
+ int pse_depth = pse->depth;
+
+ if (se_depth <= pse_depth) {
+ put_prev_entity(cfs_rq_of(pse), pse);
+ pse = parent_entity(pse);
+ }
+ if (se_depth >= pse_depth) {
+ set_next_entity(cfs_rq_of(se), se);
+ se = parent_entity(se);
+ }
+ }
+
+ put_prev_entity(cfs_rq, pse);
+ set_next_entity(cfs_rq, se);
+ }
+
+ if (hrtick_enabled(rq))
+ hrtick_start_fair(rq, p);
+
+ return p;
+simple:
+ cfs_rq = &rq->cfs;
+#endif
+
+ if (!cfs_rq->nr_running)
+ goto idle;
+
+ put_prev_task(rq, prev);
do {
- se = pick_next_entity(cfs_rq);
+ se = pick_next_entity(cfs_rq, NULL);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
p = task_of(se);
+
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
return p;
+
+idle:
+ new_tasks = idle_balance(rq);
+ /*
+ * Because idle_balance() releases (and re-acquires) rq->lock, it is
+ * possible for any higher priority task to appear. In that case we
+ * must re-start the pick_next_entity() loop.
+ */
+ if (new_tasks < 0)
+ return RETRY_TASK;
+
+ if (new_tasks > 0)
+ goto again;
+
+ return NULL;
}
/*
@@ -3081,27 +4928,174 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
#ifdef CONFIG_SMP
/**************************************************
- * Fair scheduling class load-balancing methods:
- */
+ * Fair scheduling class load-balancing methods.
+ *
+ * BASICS
+ *
+ * The purpose of load-balancing is to achieve the same basic fairness the
+ * per-cpu scheduler provides, namely provide a proportional amount of compute
+ * time to each task. This is expressed in the following equation:
+ *
+ * W_i,n/P_i == W_j,n/P_j for all i,j (1)
+ *
+ * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
+ * W_i,0 is defined as:
+ *
+ * W_i,0 = \Sum_j w_i,j (2)
+ *
+ * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
+ * is derived from the nice value as per prio_to_weight[].
+ *
+ * The weight average is an exponential decay average of the instantaneous
+ * weight:
+ *
+ * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
+ *
+ * C_i is the compute capacity of cpu i, typically it is the
+ * fraction of 'recent' time available for SCHED_OTHER task execution. But it
+ * can also include other factors [XXX].
+ *
+ * To achieve this balance we define a measure of imbalance which follows
+ * directly from (1):
+ *
+ * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
+ *
+ * We them move tasks around to minimize the imbalance. In the continuous
+ * function space it is obvious this converges, in the discrete case we get
+ * a few fun cases generally called infeasible weight scenarios.
+ *
+ * [XXX expand on:
+ * - infeasible weights;
+ * - local vs global optima in the discrete case. ]
+ *
+ *
+ * SCHED DOMAINS
+ *
+ * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
+ * for all i,j solution, we create a tree of cpus that follows the hardware
+ * topology where each level pairs two lower groups (or better). This results
+ * in O(log n) layers. Furthermore we reduce the number of cpus going up the
+ * tree to only the first of the previous level and we decrease the frequency
+ * of load-balance at each level inv. proportional to the number of cpus in
+ * the groups.
+ *
+ * This yields:
+ *
+ * log_2 n 1 n
+ * \Sum { --- * --- * 2^i } = O(n) (5)
+ * i = 0 2^i 2^i
+ * `- size of each group
+ * | | `- number of cpus doing load-balance
+ * | `- freq
+ * `- sum over all levels
+ *
+ * Coupled with a limit on how many tasks we can migrate every balance pass,
+ * this makes (5) the runtime complexity of the balancer.
+ *
+ * An important property here is that each CPU is still (indirectly) connected
+ * to every other cpu in at most O(log n) steps:
+ *
+ * The adjacency matrix of the resulting graph is given by:
+ *
+ * log_2 n
+ * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
+ * k = 0
+ *
+ * And you'll find that:
+ *
+ * A^(log_2 n)_i,j != 0 for all i,j (7)
+ *
+ * Showing there's indeed a path between every cpu in at most O(log n) steps.
+ * The task movement gives a factor of O(m), giving a convergence complexity
+ * of:
+ *
+ * O(nm log n), n := nr_cpus, m := nr_tasks (8)
+ *
+ *
+ * WORK CONSERVING
+ *
+ * In order to avoid CPUs going idle while there's still work to do, new idle
+ * balancing is more aggressive and has the newly idle cpu iterate up the domain
+ * tree itself instead of relying on other CPUs to bring it work.
+ *
+ * This adds some complexity to both (5) and (8) but it reduces the total idle
+ * time.
+ *
+ * [XXX more?]
+ *
+ *
+ * CGROUPS
+ *
+ * Cgroups make a horror show out of (2), instead of a simple sum we get:
+ *
+ * s_k,i
+ * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
+ * S_k
+ *
+ * Where
+ *
+ * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
+ *
+ * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
+ *
+ * The big problem is S_k, its a global sum needed to compute a local (W_i)
+ * property.
+ *
+ * [XXX write more on how we solve this.. _after_ merging pjt's patches that
+ * rewrite all of this once again.]
+ */
+
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
+enum fbq_type { regular, remote, all };
+
+#define LBF_ALL_PINNED 0x01
+#define LBF_NEED_BREAK 0x02
+#define LBF_DST_PINNED 0x04
+#define LBF_SOME_PINNED 0x08
+
+struct lb_env {
+ struct sched_domain *sd;
+
+ struct rq *src_rq;
+ int src_cpu;
+
+ int dst_cpu;
+ struct rq *dst_rq;
+
+ struct cpumask *dst_grpmask;
+ int new_dst_cpu;
+ enum cpu_idle_type idle;
+ long imbalance;
+ /* The set of CPUs under consideration for load-balancing */
+ struct cpumask *cpus;
+
+ unsigned int flags;
+
+ unsigned int loop;
+ unsigned int loop_break;
+ unsigned int loop_max;
+
+ enum fbq_type fbq_type;
+};
/*
- * pull_task - move a task from a remote runqueue to the local runqueue.
+ * move_task - move a task from one runqueue to another runqueue.
* Both runqueues must be locked.
*/
-static void pull_task(struct rq *src_rq, struct task_struct *p,
- struct rq *this_rq, int this_cpu)
+static void move_task(struct task_struct *p, struct lb_env *env)
{
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
- check_preempt_curr(this_rq, p, 0);
+ deactivate_task(env->src_rq, p, 0);
+ set_task_cpu(p, env->dst_cpu);
+ activate_task(env->dst_rq, p, 0);
+ check_preempt_curr(env->dst_rq, p, 0);
}
/*
* Is this task likely cache-hot:
*/
static int
-task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
+task_hot(struct task_struct *p, u64 now)
{
s64 delta;
@@ -3129,61 +5123,182 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
return delta < (s64)sysctl_sched_migration_cost;
}
-#define LBF_ALL_PINNED 0x01
-#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */
-#define LBF_HAD_BREAK 0x04
-#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */
-#define LBF_ABORT 0x10
+#ifdef CONFIG_NUMA_BALANCING
+/* Returns true if the destination node has incurred more faults */
+static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
+{
+ struct numa_group *numa_group = rcu_dereference(p->numa_group);
+ int src_nid, dst_nid;
+
+ if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
+ !(env->sd->flags & SD_NUMA)) {
+ return false;
+ }
+
+ src_nid = cpu_to_node(env->src_cpu);
+ dst_nid = cpu_to_node(env->dst_cpu);
+
+ if (src_nid == dst_nid)
+ return false;
+
+ if (numa_group) {
+ /* Task is already in the group's interleave set. */
+ if (node_isset(src_nid, numa_group->active_nodes))
+ return false;
+
+ /* Task is moving into the group's interleave set. */
+ if (node_isset(dst_nid, numa_group->active_nodes))
+ return true;
+
+ return group_faults(p, dst_nid) > group_faults(p, src_nid);
+ }
+
+ /* Encourage migration to the preferred node. */
+ if (dst_nid == p->numa_preferred_nid)
+ return true;
+
+ return task_faults(p, dst_nid) > task_faults(p, src_nid);
+}
+
+
+static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+{
+ struct numa_group *numa_group = rcu_dereference(p->numa_group);
+ int src_nid, dst_nid;
+
+ if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
+ return false;
+
+ if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
+ return false;
+
+ src_nid = cpu_to_node(env->src_cpu);
+ dst_nid = cpu_to_node(env->dst_cpu);
+
+ if (src_nid == dst_nid)
+ return false;
+
+ if (numa_group) {
+ /* Task is moving within/into the group's interleave set. */
+ if (node_isset(dst_nid, numa_group->active_nodes))
+ return false;
+
+ /* Task is moving out of the group's interleave set. */
+ if (node_isset(src_nid, numa_group->active_nodes))
+ return true;
+
+ return group_faults(p, dst_nid) < group_faults(p, src_nid);
+ }
+
+ /* Migrating away from the preferred node is always bad. */
+ if (src_nid == p->numa_preferred_nid)
+ return true;
+
+ return task_faults(p, dst_nid) < task_faults(p, src_nid);
+}
+
+#else
+static inline bool migrate_improves_locality(struct task_struct *p,
+ struct lb_env *env)
+{
+ return false;
+}
+
+static inline bool migrate_degrades_locality(struct task_struct *p,
+ struct lb_env *env)
+{
+ return false;
+}
+#endif
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
static
-int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *lb_flags)
+int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot = 0;
/*
* We do not migrate tasks that are:
- * 1) running (obviously), or
+ * 1) throttled_lb_pair, or
* 2) cannot be migrated to this CPU due to cpus_allowed, or
- * 3) are cache-hot on their current CPU.
+ * 3) running (obviously), or
+ * 4) are cache-hot on their current CPU.
*/
- if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
+ if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+ return 0;
+
+ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+ int cpu;
+
schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+
+ env->flags |= LBF_SOME_PINNED;
+
+ /*
+ * Remember if this task can be migrated to any other cpu in
+ * our sched_group. We may want to revisit it if we couldn't
+ * meet load balance goals by pulling other tasks on src_cpu.
+ *
+ * Also avoid computing new_dst_cpu if we have already computed
+ * one in current iteration.
+ */
+ if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+ return 0;
+
+ /* Prevent to re-select dst_cpu via env's cpus */
+ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
+ if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+ env->flags |= LBF_DST_PINNED;
+ env->new_dst_cpu = cpu;
+ break;
+ }
+ }
+
return 0;
}
- *lb_flags &= ~LBF_ALL_PINNED;
- if (task_running(rq, p)) {
+ /* Record that we found atleast one task that could run on dst_cpu */
+ env->flags &= ~LBF_ALL_PINNED;
+
+ if (task_running(env->src_rq, p)) {
schedstat_inc(p, se.statistics.nr_failed_migrations_running);
return 0;
}
/*
* Aggressive migration if:
- * 1) task is cache cold, or
- * 2) too many balance attempts have failed.
+ * 1) destination numa is preferred
+ * 2) task is cache cold, or
+ * 3) too many balance attempts have failed.
*/
+ tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));
+ if (!tsk_cache_hot)
+ tsk_cache_hot = migrate_degrades_locality(p, env);
- tsk_cache_hot = task_hot(p, rq->clock_task, sd);
- if (!tsk_cache_hot ||
- sd->nr_balance_failed > sd->cache_nice_tries) {
+ if (migrate_improves_locality(p, env)) {
#ifdef CONFIG_SCHEDSTATS
if (tsk_cache_hot) {
- schedstat_inc(sd, lb_hot_gained[idle]);
+ schedstat_inc(env->sd, lb_hot_gained[env->idle]);
schedstat_inc(p, se.statistics.nr_forced_migrations);
}
#endif
return 1;
}
- if (tsk_cache_hot) {
- schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
- return 0;
+ if (!tsk_cache_hot ||
+ env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+
+ if (tsk_cache_hot) {
+ schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+ schedstat_inc(p, se.statistics.nr_forced_migrations);
+ }
+
+ return 1;
}
- return 1;
+
+ schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
+ return 0;
}
/*
@@ -3193,65 +5308,74 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
*
* Called with both runqueues locked.
*/
-static int
-move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
- struct sched_domain *sd, enum cpu_idle_type idle)
+static int move_one_task(struct lb_env *env)
{
struct task_struct *p, *n;
- struct cfs_rq *cfs_rq;
- int pinned = 0;
- for_each_leaf_cfs_rq(busiest, cfs_rq) {
- list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
- if (throttled_lb_pair(task_group(p),
- busiest->cpu, this_cpu))
- break;
-
- if (!can_migrate_task(p, busiest, this_cpu,
- sd, idle, &pinned))
- continue;
+ list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+ if (!can_migrate_task(p, env))
+ continue;
- pull_task(busiest, p, this_rq, this_cpu);
- /*
- * Right now, this is only the second place pull_task()
- * is called, so we can safely collect pull_task()
- * stats here rather than inside pull_task().
- */
- schedstat_inc(sd, lb_gained[idle]);
- return 1;
- }
+ move_task(p, env);
+ /*
+ * Right now, this is only the second place move_task()
+ * is called, so we can safely collect move_task()
+ * stats here rather than inside move_task().
+ */
+ schedstat_inc(env->sd, lb_gained[env->idle]);
+ return 1;
}
-
return 0;
}
-static unsigned long
-balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move, struct sched_domain *sd,
- enum cpu_idle_type idle, int *lb_flags,
- struct cfs_rq *busiest_cfs_rq)
+static const unsigned int sched_nr_migrate_break = 32;
+
+/*
+ * move_tasks tries to move up to imbalance weighted load from busiest to
+ * this_rq, as part of a balancing operation within domain "sd".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(struct lb_env *env)
{
- int loops = 0, pulled = 0;
- long rem_load_move = max_load_move;
- struct task_struct *p, *n;
+ struct list_head *tasks = &env->src_rq->cfs_tasks;
+ struct task_struct *p;
+ unsigned long load;
+ int pulled = 0;
- if (max_load_move == 0)
- goto out;
+ if (env->imbalance <= 0)
+ return 0;
+
+ while (!list_empty(tasks)) {
+ p = list_first_entry(tasks, struct task_struct, se.group_node);
- list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
- if (loops++ > sysctl_sched_nr_migrate) {
- *lb_flags |= LBF_NEED_BREAK;
+ env->loop++;
+ /* We've more or less seen every task there is, call it quits */
+ if (env->loop > env->loop_max)
+ break;
+
+ /* take a breather every nr_migrate tasks */
+ if (env->loop > env->loop_break) {
+ env->loop_break += sched_nr_migrate_break;
+ env->flags |= LBF_NEED_BREAK;
break;
}
- if ((p->se.load.weight >> 1) > rem_load_move ||
- !can_migrate_task(p, busiest, this_cpu, sd, idle,
- lb_flags))
- continue;
+ if (!can_migrate_task(p, env))
+ goto next;
+
+ load = task_h_load(p);
+
+ if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
+ goto next;
- pull_task(busiest, p, this_rq, this_cpu);
+ if ((load / 2) > env->imbalance)
+ goto next;
+
+ move_task(p, env);
pulled++;
- rem_load_move -= p->se.load.weight;
+ env->imbalance -= load;
#ifdef CONFIG_PREEMPT
/*
@@ -3259,273 +5383,209 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
* kernels will stop after the first task is pulled to minimize
* the critical section.
*/
- if (idle == CPU_NEWLY_IDLE) {
- *lb_flags |= LBF_ABORT;
+ if (env->idle == CPU_NEWLY_IDLE)
break;
- }
#endif
/*
* We only want to steal up to the prescribed amount of
* weighted load.
*/
- if (rem_load_move <= 0)
+ if (env->imbalance <= 0)
break;
+
+ continue;
+next:
+ list_move_tail(&p->se.group_node, tasks);
}
-out:
+
/*
- * Right now, this is one of only two places pull_task() is called,
- * so we can safely collect pull_task() stats here rather than
- * inside pull_task().
+ * Right now, this is one of only two places move_task() is called,
+ * so we can safely collect move_task() stats here rather than
+ * inside move_task().
*/
- schedstat_add(sd, lb_gained[idle], pulled);
+ schedstat_add(env->sd, lb_gained[env->idle], pulled);
- return max_load_move - rem_load_move;
+ return pulled;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* update tg->load_weight by folding this cpu's load_avg
*/
-static int update_shares_cpu(struct task_group *tg, int cpu)
+static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
{
- struct cfs_rq *cfs_rq;
- unsigned long flags;
- struct rq *rq;
-
- if (!tg->se[cpu])
- return 0;
-
- rq = cpu_rq(cpu);
- cfs_rq = tg->cfs_rq[cpu];
-
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- update_rq_clock(rq);
- update_cfs_load(cfs_rq, 1);
+ struct sched_entity *se = tg->se[cpu];
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
- /*
- * We need to update shares after updating tg->load_weight in
- * order to adjust the weight of groups with long running tasks.
- */
- update_cfs_shares(cfs_rq);
+ /* throttled entities do not contribute to load */
+ if (throttled_hierarchy(cfs_rq))
+ return;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ update_cfs_rq_blocked_load(cfs_rq, 1);
- return 0;
+ if (se) {
+ update_entity_load_avg(se, 1);
+ /*
+ * We pivot on our runnable average having decayed to zero for
+ * list removal. This generally implies that all our children
+ * have also been removed (modulo rounding error or bandwidth
+ * control); however, such cases are rare and we can fix these
+ * at enqueue.
+ *
+ * TODO: fix up out-of-order children on enqueue.
+ */
+ if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
+ list_del_leaf_cfs_rq(cfs_rq);
+ } else {
+ struct rq *rq = rq_of(cfs_rq);
+ update_rq_runnable_avg(rq, rq->nr_running);
+ }
}
-static void update_shares(int cpu)
+static void update_blocked_averages(int cpu)
{
- struct cfs_rq *cfs_rq;
struct rq *rq = cpu_rq(cpu);
+ struct cfs_rq *cfs_rq;
+ unsigned long flags;
- rcu_read_lock();
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_rq_clock(rq);
/*
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
for_each_leaf_cfs_rq(rq, cfs_rq) {
- /* throttled entities do not contribute to load */
- if (throttled_hierarchy(cfs_rq))
- continue;
-
- update_shares_cpu(cfs_rq->tg, cpu);
+ /*
+ * Note: We may want to consider periodically releasing
+ * rq->lock about these updates so that creating many task
+ * groups does not result in continually extending hold time.
+ */
+ __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
}
- rcu_read_unlock();
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
/*
- * Compute the cpu's hierarchical load factor for each task group.
+ * Compute the hierarchical load factor for cfs_rq and all its ascendants.
* This needs to be done in a top-down fashion because the load of a child
* group is a fraction of its parents load.
*/
-static int tg_load_down(struct task_group *tg, void *data)
+static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
{
+ struct rq *rq = rq_of(cfs_rq);
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+ unsigned long now = jiffies;
unsigned long load;
- long cpu = (long)data;
- if (!tg->parent) {
- load = cpu_rq(cpu)->load.weight;
- } else {
- load = tg->parent->cfs_rq[cpu]->h_load;
- load *= tg->se[cpu]->load.weight;
- load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
- }
+ if (cfs_rq->last_h_load_update == now)
+ return;
- tg->cfs_rq[cpu]->h_load = load;
+ cfs_rq->h_load_next = NULL;
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_load_next = se;
+ if (cfs_rq->last_h_load_update == now)
+ break;
+ }
- return 0;
-}
+ if (!se) {
+ cfs_rq->h_load = cfs_rq->runnable_load_avg;
+ cfs_rq->last_h_load_update = now;
+ }
-static void update_h_load(long cpu)
-{
- walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
+ while ((se = cfs_rq->h_load_next) != NULL) {
+ load = cfs_rq->h_load;
+ load = div64_ul(load * se->avg.load_avg_contrib,
+ cfs_rq->runnable_load_avg + 1);
+ cfs_rq = group_cfs_rq(se);
+ cfs_rq->h_load = load;
+ cfs_rq->last_h_load_update = now;
+ }
}
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *lb_flags)
+static unsigned long task_h_load(struct task_struct *p)
{
- long rem_load_move = max_load_move;
- struct cfs_rq *busiest_cfs_rq;
-
- rcu_read_lock();
- update_h_load(cpu_of(busiest));
-
- for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
- unsigned long busiest_h_load = busiest_cfs_rq->h_load;
- unsigned long busiest_weight = busiest_cfs_rq->load.weight;
- u64 rem_load, moved_load;
-
- if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
- break;
-
- /*
- * empty group or part of a throttled hierarchy
- */
- if (!busiest_cfs_rq->task_weight ||
- throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
- continue;
-
- rem_load = (u64)rem_load_move * busiest_weight;
- rem_load = div_u64(rem_load, busiest_h_load + 1);
-
- moved_load = balance_tasks(this_rq, this_cpu, busiest,
- rem_load, sd, idle, lb_flags,
- busiest_cfs_rq);
-
- if (!moved_load)
- continue;
-
- moved_load *= busiest_h_load;
- moved_load = div_u64(moved_load, busiest_weight + 1);
+ struct cfs_rq *cfs_rq = task_cfs_rq(p);
- rem_load_move -= moved_load;
- if (rem_load_move < 0)
- break;
- }
- rcu_read_unlock();
-
- return max_load_move - rem_load_move;
+ update_cfs_rq_h_load(cfs_rq);
+ return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
+ cfs_rq->runnable_load_avg + 1);
}
#else
-static inline void update_shares(int cpu)
+static inline void update_blocked_averages(int cpu)
{
}
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *lb_flags)
+static unsigned long task_h_load(struct task_struct *p)
{
- return balance_tasks(this_rq, this_cpu, busiest,
- max_load_move, sd, idle, lb_flags,
- &busiest->cfs);
+ return p->se.avg.load_avg_contrib;
}
#endif
-/*
- * move_tasks tries to move up to max_load_move weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
- */
-static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *lb_flags)
-{
- unsigned long total_load_moved = 0, load_moved;
-
- do {
- load_moved = load_balance_fair(this_rq, this_cpu, busiest,
- max_load_move - total_load_moved,
- sd, idle, lb_flags);
-
- total_load_moved += load_moved;
-
- if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
- break;
-
-#ifdef CONFIG_PREEMPT
- /*
- * NEWIDLE balancing is a source of latency, so preemptible
- * kernels will stop after the first task is pulled to minimize
- * the critical section.
- */
- if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
- *lb_flags |= LBF_ABORT;
- break;
- }
-#endif
- } while (load_moved && max_load_move > total_load_moved);
-
- return total_load_moved > 0;
-}
-
/********** Helpers for find_busiest_group ************************/
/*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- * during load balancing.
- */
-struct sd_lb_stats {
- struct sched_group *busiest; /* Busiest group in this sd */
- struct sched_group *this; /* Local group in this sd */
- unsigned long total_load; /* Total load of all groups in sd */
- unsigned long total_pwr; /* Total power of all groups in sd */
- unsigned long avg_load; /* Average load across all groups in sd */
-
- /** Statistics of this group */
- unsigned long this_load;
- unsigned long this_load_per_task;
- unsigned long this_nr_running;
- unsigned long this_has_capacity;
- unsigned int this_idle_cpus;
-
- /* Statistics of the busiest group */
- unsigned int busiest_idle_cpus;
- unsigned long max_load;
- unsigned long busiest_load_per_task;
- unsigned long busiest_nr_running;
- unsigned long busiest_group_capacity;
- unsigned long busiest_has_capacity;
- unsigned int busiest_group_weight;
-
- int group_imb; /* Is there imbalance in this sd */
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- int power_savings_balance; /* Is powersave balance needed for this sd */
- struct sched_group *group_min; /* Least loaded group in sd */
- struct sched_group *group_leader; /* Group which relieves group_min */
- unsigned long min_load_per_task; /* load_per_task in group_min */
- unsigned long leader_nr_running; /* Nr running of group_leader */
- unsigned long min_nr_running; /* Nr running of group_min */
-#endif
-};
-
-/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
struct sg_lb_stats {
unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
- unsigned long sum_nr_running; /* Nr tasks running in the group */
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+ unsigned long load_per_task;
unsigned long group_capacity;
- unsigned long idle_cpus;
- unsigned long group_weight;
+ unsigned int sum_nr_running; /* Nr tasks running in the group */
+ unsigned int group_capacity_factor;
+ unsigned int idle_cpus;
+ unsigned int group_weight;
int group_imb; /* Is there an imbalance in the group ? */
- int group_has_capacity; /* Is there extra capacity in the group? */
+ int group_has_free_capacity;
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned int nr_numa_running;
+ unsigned int nr_preferred_running;
+#endif
};
+/*
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ * during load balancing.
+ */
+struct sd_lb_stats {
+ struct sched_group *busiest; /* Busiest group in this sd */
+ struct sched_group *local; /* Local group in this sd */
+ unsigned long total_load; /* Total load of all groups in sd */
+ unsigned long total_capacity; /* Total capacity of all groups in sd */
+ unsigned long avg_load; /* Average load across all groups in sd */
+
+ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
+ struct sg_lb_stats local_stat; /* Statistics of the local group */
+};
+
+static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
+{
+ /*
+ * Skimp on the clearing to avoid duplicate work. We can avoid clearing
+ * local_stat because update_sg_lb_stats() does a full clear/assignment.
+ * We must however clear busiest_stat::avg_load because
+ * update_sd_pick_busiest() reads this before assignment.
+ */
+ *sds = (struct sd_lb_stats){
+ .busiest = NULL,
+ .local = NULL,
+ .total_load = 0UL,
+ .total_capacity = 0UL,
+ .busiest_stat = {
+ .avg_load = 0UL,
+ },
+ };
+}
+
/**
* get_sd_load_idx - Obtain the load index for a given sched domain.
* @sd: The sched_domain whose load_idx is to be obtained.
- * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
+ * @idle: The idle status of the CPU for whose sd load_idx is obtained.
+ *
+ * Return: The load index.
*/
static inline int get_sd_load_idx(struct sched_domain *sd,
enum cpu_idle_type idle)
@@ -3548,159 +5608,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * init_sd_power_savings_stats - Initialize power savings statistics for
- * the given sched_domain, during load balancing.
- *
- * @sd: Sched domain whose power-savings statistics are to be initialized.
- * @sds: Variable containing the statistics for sd.
- * @idle: Idle status of the CPU at which we're performing load-balancing.
- */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
- struct sd_lb_stats *sds, enum cpu_idle_type idle)
+static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
{
- /*
- * Busy processors will not participate in power savings
- * balance.
- */
- if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
- sds->power_savings_balance = 0;
- else {
- sds->power_savings_balance = 1;
- sds->min_nr_running = ULONG_MAX;
- sds->leader_nr_running = 0;
- }
+ return SCHED_CAPACITY_SCALE;
}
-/**
- * update_sd_power_savings_stats - Update the power saving stats for a
- * sched_domain while performing load balancing.
- *
- * @group: sched_group belonging to the sched_domain under consideration.
- * @sds: Variable containing the statistics of the sched_domain
- * @local_group: Does group contain the CPU for which we're performing
- * load balancing ?
- * @sgs: Variable containing the statistics of the group.
- */
-static inline void update_sd_power_savings_stats(struct sched_group *group,
- struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
{
-
- if (!sds->power_savings_balance)
- return;
-
- /*
- * If the local group is idle or completely loaded
- * no need to do power savings balance at this domain
- */
- if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
- !sds->this_nr_running))
- sds->power_savings_balance = 0;
-
- /*
- * If a group is already running at full capacity or idle,
- * don't include that group in power savings calculations
- */
- if (!sds->power_savings_balance ||
- sgs->sum_nr_running >= sgs->group_capacity ||
- !sgs->sum_nr_running)
- return;
-
- /*
- * Calculate the group which has the least non-idle load.
- * This is the group from where we need to pick up the load
- * for saving power
- */
- if ((sgs->sum_nr_running < sds->min_nr_running) ||
- (sgs->sum_nr_running == sds->min_nr_running &&
- group_first_cpu(group) > group_first_cpu(sds->group_min))) {
- sds->group_min = group;
- sds->min_nr_running = sgs->sum_nr_running;
- sds->min_load_per_task = sgs->sum_weighted_load /
- sgs->sum_nr_running;
- }
-
- /*
- * Calculate the group which is almost near its
- * capacity but still has some space to pick up some load
- * from other group and save more power
- */
- if (sgs->sum_nr_running + 1 > sgs->group_capacity)
- return;
-
- if (sgs->sum_nr_running > sds->leader_nr_running ||
- (sgs->sum_nr_running == sds->leader_nr_running &&
- group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
- sds->group_leader = group;
- sds->leader_nr_running = sgs->sum_nr_running;
- }
+ return default_scale_capacity(sd, cpu);
}
-/**
- * check_power_save_busiest_group - see if there is potential for some power-savings balance
- * @sds: Variable containing the statistics of the sched_domain
- * under consideration.
- * @this_cpu: Cpu at which we're currently performing load-balancing.
- * @imbalance: Variable to store the imbalance.
- *
- * Description:
- * Check if we have potential to perform some power-savings balance.
- * If yes, set the busiest group to be the least loaded group in the
- * sched_domain, so that it's CPUs can be put to idle.
- *
- * Returns 1 if there is potential to perform power-savings balance.
- * Else returns 0.
- */
-static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
- int this_cpu, unsigned long *imbalance)
-{
- if (!sds->power_savings_balance)
- return 0;
-
- if (sds->this != sds->group_leader ||
- sds->group_leader == sds->group_min)
- return 0;
-
- *imbalance = sds->min_load_per_task;
- sds->busiest = sds->group_min;
-
- return 1;
-
-}
-#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
- struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
- return;
-}
-
-static inline void update_sd_power_savings_stats(struct sched_group *group,
- struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
- return;
-}
-
-static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
- int this_cpu, unsigned long *imbalance)
-{
- return 0;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
-
-unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
-{
- return SCHED_POWER_SCALE;
-}
-
-unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
-{
- return default_scale_freq_power(sd, cpu);
-}
-
-unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
unsigned long smt_gain = sd->smt_gain;
@@ -3710,87 +5628,146 @@ unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
return smt_gain;
}
-unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
+unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu)
{
- return default_scale_smt_power(sd, cpu);
+ return default_scale_smt_capacity(sd, cpu);
}
-unsigned long scale_rt_power(int cpu)
+static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- u64 total, available;
+ u64 total, available, age_stamp, avg;
+ s64 delta;
- total = sched_avg_period() + (rq->clock - rq->age_stamp);
+ /*
+ * Since we're reading these variables without serialization make sure
+ * we read them once before doing sanity checks on them.
+ */
+ age_stamp = ACCESS_ONCE(rq->age_stamp);
+ avg = ACCESS_ONCE(rq->rt_avg);
- if (unlikely(total < rq->rt_avg)) {
- /* Ensures that power won't end up being negative */
+ delta = rq_clock(rq) - age_stamp;
+ if (unlikely(delta < 0))
+ delta = 0;
+
+ total = sched_avg_period() + delta;
+
+ if (unlikely(total < avg)) {
+ /* Ensures that capacity won't end up being negative */
available = 0;
} else {
- available = total - rq->rt_avg;
+ available = total - avg;
}
- if (unlikely((s64)total < SCHED_POWER_SCALE))
- total = SCHED_POWER_SCALE;
+ if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
+ total = SCHED_CAPACITY_SCALE;
- total >>= SCHED_POWER_SHIFT;
+ total >>= SCHED_CAPACITY_SHIFT;
return div_u64(available, total);
}
-static void update_cpu_power(struct sched_domain *sd, int cpu)
+static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
- unsigned long power = SCHED_POWER_SCALE;
+ unsigned long capacity = SCHED_CAPACITY_SCALE;
struct sched_group *sdg = sd->groups;
- if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
- if (sched_feat(ARCH_POWER))
- power *= arch_scale_smt_power(sd, cpu);
+ if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) {
+ if (sched_feat(ARCH_CAPACITY))
+ capacity *= arch_scale_smt_capacity(sd, cpu);
else
- power *= default_scale_smt_power(sd, cpu);
+ capacity *= default_scale_smt_capacity(sd, cpu);
- power >>= SCHED_POWER_SHIFT;
+ capacity >>= SCHED_CAPACITY_SHIFT;
}
- sdg->sgp->power_orig = power;
+ sdg->sgc->capacity_orig = capacity;
- if (sched_feat(ARCH_POWER))
- power *= arch_scale_freq_power(sd, cpu);
+ if (sched_feat(ARCH_CAPACITY))
+ capacity *= arch_scale_freq_capacity(sd, cpu);
else
- power *= default_scale_freq_power(sd, cpu);
+ capacity *= default_scale_capacity(sd, cpu);
- power >>= SCHED_POWER_SHIFT;
+ capacity >>= SCHED_CAPACITY_SHIFT;
- power *= scale_rt_power(cpu);
- power >>= SCHED_POWER_SHIFT;
+ capacity *= scale_rt_capacity(cpu);
+ capacity >>= SCHED_CAPACITY_SHIFT;
- if (!power)
- power = 1;
+ if (!capacity)
+ capacity = 1;
- cpu_rq(cpu)->cpu_power = power;
- sdg->sgp->power = power;
+ cpu_rq(cpu)->cpu_capacity = capacity;
+ sdg->sgc->capacity = capacity;
}
-void update_group_power(struct sched_domain *sd, int cpu)
+void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long power;
+ unsigned long capacity, capacity_orig;
+ unsigned long interval;
+
+ interval = msecs_to_jiffies(sd->balance_interval);
+ interval = clamp(interval, 1UL, max_load_balance_interval);
+ sdg->sgc->next_update = jiffies + interval;
if (!child) {
- update_cpu_power(sd, cpu);
+ update_cpu_capacity(sd, cpu);
return;
}
- power = 0;
+ capacity_orig = capacity = 0;
- group = child->groups;
- do {
- power += group->sgp->power;
- group = group->next;
- } while (group != child->groups);
+ if (child->flags & SD_OVERLAP) {
+ /*
+ * SD_OVERLAP domains cannot assume that child groups
+ * span the current group.
+ */
+
+ for_each_cpu(cpu, sched_group_cpus(sdg)) {
+ struct sched_group_capacity *sgc;
+ struct rq *rq = cpu_rq(cpu);
+
+ /*
+ * build_sched_domains() -> init_sched_groups_capacity()
+ * gets here before we've attached the domains to the
+ * runqueues.
+ *
+ * Use capacity_of(), which is set irrespective of domains
+ * in update_cpu_capacity().
+ *
+ * This avoids capacity/capacity_orig from being 0 and
+ * causing divide-by-zero issues on boot.
+ *
+ * Runtime updates will correct capacity_orig.
+ */
+ if (unlikely(!rq->sd)) {
+ capacity_orig += capacity_of(cpu);
+ capacity += capacity_of(cpu);
+ continue;
+ }
+
+ sgc = rq->sd->groups->sgc;
+ capacity_orig += sgc->capacity_orig;
+ capacity += sgc->capacity;
+ }
+ } else {
+ /*
+ * !SD_OVERLAP domains can assume that child groups
+ * span the current group.
+ */
- sdg->sgp->power = power;
+ group = child->groups;
+ do {
+ capacity_orig += group->sgc->capacity_orig;
+ capacity += group->sgc->capacity;
+ group = group->next;
+ } while (group != child->groups);
+ }
+
+ sdg->sgc->capacity_orig = capacity_orig;
+ sdg->sgc->capacity = capacity;
}
/*
@@ -3804,142 +5781,157 @@ static inline int
fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
{
/*
- * Only siblings can have significantly less than SCHED_POWER_SCALE
+ * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
*/
- if (!(sd->flags & SD_SHARE_CPUPOWER))
+ if (!(sd->flags & SD_SHARE_CPUCAPACITY))
return 0;
/*
- * If ~90% of the cpu_power is still there, we're good.
+ * If ~90% of the cpu_capacity is still there, we're good.
*/
- if (group->sgp->power * 32 > group->sgp->power_orig * 29)
+ if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
return 1;
return 0;
}
+/*
+ * Group imbalance indicates (and tries to solve) the problem where balancing
+ * groups is inadequate due to tsk_cpus_allowed() constraints.
+ *
+ * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
+ * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
+ * Something like:
+ *
+ * { 0 1 2 3 } { 4 5 6 7 }
+ * * * * *
+ *
+ * If we were to balance group-wise we'd place two tasks in the first group and
+ * two tasks in the second group. Clearly this is undesired as it will overload
+ * cpu 3 and leave one of the cpus in the second group unused.
+ *
+ * The current solution to this issue is detecting the skew in the first group
+ * by noticing the lower domain failed to reach balance and had difficulty
+ * moving tasks due to affinity constraints.
+ *
+ * When this is so detected; this group becomes a candidate for busiest; see
+ * update_sd_pick_busiest(). And calculate_imbalance() and
+ * find_busiest_group() avoid some of the usual balance conditions to allow it
+ * to create an effective group imbalance.
+ *
+ * This is a somewhat tricky proposition since the next run might not find the
+ * group imbalance and decide the groups need to be balanced again. A most
+ * subtle and fragile situation.
+ */
+
+static inline int sg_imbalanced(struct sched_group *group)
+{
+ return group->sgc->imbalance;
+}
+
+/*
+ * Compute the group capacity factor.
+ *
+ * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
+ * first dividing out the smt factor and computing the actual number of cores
+ * and limit unit capacity with that.
+ */
+static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
+{
+ unsigned int capacity_factor, smt, cpus;
+ unsigned int capacity, capacity_orig;
+
+ capacity = group->sgc->capacity;
+ capacity_orig = group->sgc->capacity_orig;
+ cpus = group->group_weight;
+
+ /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
+ smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
+ capacity_factor = cpus / smt; /* cores */
+
+ capacity_factor = min_t(unsigned,
+ capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
+ if (!capacity_factor)
+ capacity_factor = fix_small_capacity(env->sd, group);
+
+ return capacity_factor;
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
- * @sd: The sched_domain whose statistics are to be updated.
+ * @env: The load balancing environment.
* @group: sched_group whose statistics are to be updated.
- * @this_cpu: Cpu for which load balance is currently performed.
- * @idle: Idle status of this_cpu
* @load_idx: Load index of sched_domain of this_cpu for load calc.
* @local_group: Does group contain this_cpu.
- * @cpus: Set of cpus considered for load balancing.
- * @balance: Should we balance.
* @sgs: variable to hold the statistics for this group.
*/
-static inline void update_sg_lb_stats(struct sched_domain *sd,
- struct sched_group *group, int this_cpu,
- enum cpu_idle_type idle, int load_idx,
- int local_group, const struct cpumask *cpus,
- int *balance, struct sg_lb_stats *sgs)
+static inline void update_sg_lb_stats(struct lb_env *env,
+ struct sched_group *group, int load_idx,
+ int local_group, struct sg_lb_stats *sgs)
{
- unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
+ unsigned long load;
int i;
- unsigned int balance_cpu = -1, first_idle_cpu = 0;
- unsigned long avg_load_per_task = 0;
- if (local_group)
- balance_cpu = group_first_cpu(group);
+ memset(sgs, 0, sizeof(*sgs));
- /* Tally up the load of all CPUs in the group */
- max_cpu_load = 0;
- min_cpu_load = ~0UL;
- max_nr_running = 0;
-
- for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+ for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
struct rq *rq = cpu_rq(i);
/* Bias balancing toward cpus of our domain */
- if (local_group) {
- if (idle_cpu(i) && !first_idle_cpu) {
- first_idle_cpu = 1;
- balance_cpu = i;
- }
-
+ if (local_group)
load = target_load(i, load_idx);
- } else {
+ else
load = source_load(i, load_idx);
- if (load > max_cpu_load) {
- max_cpu_load = load;
- max_nr_running = rq->nr_running;
- }
- if (min_cpu_load > load)
- min_cpu_load = load;
- }
sgs->group_load += load;
sgs->sum_nr_running += rq->nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+ sgs->nr_numa_running += rq->nr_numa_running;
+ sgs->nr_preferred_running += rq->nr_preferred_running;
+#endif
sgs->sum_weighted_load += weighted_cpuload(i);
if (idle_cpu(i))
sgs->idle_cpus++;
}
- /*
- * First idle cpu or the first cpu(busiest) in this sched group
- * is eligible for doing load balancing at this and above
- * domains. In the newly idle case, we will allow all the cpu's
- * to do the newly idle load balance.
- */
- if (idle != CPU_NEWLY_IDLE && local_group) {
- if (balance_cpu != this_cpu) {
- *balance = 0;
- return;
- }
- update_group_power(sd, this_cpu);
- }
-
- /* Adjust by relative CPU power of the group */
- sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
+ /* Adjust by relative CPU capacity of the group */
+ sgs->group_capacity = group->sgc->capacity;
+ sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
- /*
- * Consider the group unbalanced when the imbalance is larger
- * than the average weight of a task.
- *
- * APZ: with cgroup the avg task weight can vary wildly and
- * might not be a suitable number - should we keep a
- * normalized nr_running number somewhere that negates
- * the hierarchy?
- */
if (sgs->sum_nr_running)
- avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+ sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
- if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
- sgs->group_imb = 1;
-
- sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
- SCHED_POWER_SCALE);
- if (!sgs->group_capacity)
- sgs->group_capacity = fix_small_capacity(sd, group);
sgs->group_weight = group->group_weight;
- if (sgs->group_capacity > sgs->sum_nr_running)
- sgs->group_has_capacity = 1;
+ sgs->group_imb = sg_imbalanced(group);
+ sgs->group_capacity_factor = sg_capacity_factor(env, group);
+
+ if (sgs->group_capacity_factor > sgs->sum_nr_running)
+ sgs->group_has_free_capacity = 1;
}
/**
* update_sd_pick_busiest - return 1 on busiest group
- * @sd: sched_domain whose statistics are to be checked
+ * @env: The load balancing environment.
* @sds: sched_domain statistics
* @sg: sched_group candidate to be checked for being the busiest
* @sgs: sched_group statistics
- * @this_cpu: the current cpu
*
* Determine if @sg is a busier group than the previously selected
* busiest group.
+ *
+ * Return: %true if @sg is a busier group than the previously selected
+ * busiest group. %false otherwise.
*/
-static bool update_sd_pick_busiest(struct sched_domain *sd,
+static bool update_sd_pick_busiest(struct lb_env *env,
struct sd_lb_stats *sds,
struct sched_group *sg,
- struct sg_lb_stats *sgs,
- int this_cpu)
+ struct sg_lb_stats *sgs)
{
- if (sgs->avg_load <= sds->max_load)
+ if (sgs->avg_load <= sds->busiest_stat.avg_load)
return false;
- if (sgs->sum_nr_running > sgs->group_capacity)
+ if (sgs->sum_nr_running > sgs->group_capacity_factor)
return true;
if (sgs->group_imb)
@@ -3950,8 +5942,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
* numbered CPUs in the group, therefore mark all groups
* higher than ourself as busy.
*/
- if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
- this_cpu < group_first_cpu(sg)) {
+ if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+ env->dst_cpu < group_first_cpu(sg)) {
if (!sds->busiest)
return true;
@@ -3962,79 +5954,101 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
return false;
}
+#ifdef CONFIG_NUMA_BALANCING
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running > sgs->nr_numa_running)
+ return regular;
+ if (sgs->sum_nr_running > sgs->nr_preferred_running)
+ return remote;
+ return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+ if (rq->nr_running > rq->nr_numa_running)
+ return regular;
+ if (rq->nr_running > rq->nr_preferred_running)
+ return remote;
+ return all;
+}
+#else
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+ return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+ return regular;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
- * @sd: sched_domain whose statistics are to be updated.
- * @this_cpu: Cpu for which load balance is currently performed.
- * @idle: Idle status of this_cpu
- * @cpus: Set of cpus considered for load balancing.
- * @balance: Should we balance.
+ * @env: The load balancing environment.
* @sds: variable to hold the statistics for this sched_domain.
*/
-static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
- enum cpu_idle_type idle, const struct cpumask *cpus,
- int *balance, struct sd_lb_stats *sds)
+static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
- struct sched_domain *child = sd->child;
- struct sched_group *sg = sd->groups;
- struct sg_lb_stats sgs;
+ struct sched_domain *child = env->sd->child;
+ struct sched_group *sg = env->sd->groups;
+ struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
- init_sd_power_savings_stats(sd, sds, idle);
- load_idx = get_sd_load_idx(sd, idle);
+ load_idx = get_sd_load_idx(env->sd, env->idle);
do {
+ struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
- local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
- memset(&sgs, 0, sizeof(sgs));
- update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
- local_group, cpus, balance, &sgs);
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
+ if (local_group) {
+ sds->local = sg;
+ sgs = &sds->local_stat;
- if (local_group && !(*balance))
- return;
+ if (env->idle != CPU_NEWLY_IDLE ||
+ time_after_eq(jiffies, sg->sgc->next_update))
+ update_group_capacity(env->sd, env->dst_cpu);
+ }
+
+ update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
- sds->total_load += sgs.group_load;
- sds->total_pwr += sg->sgp->power;
+ if (local_group)
+ goto next_group;
/*
* In case the child domain prefers tasks go to siblings
- * first, lower the sg capacity to one so that we'll try
+ * first, lower the sg capacity factor to one so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
- * these excess tasks, i.e. nr_running < group_capacity. The
+ * these excess tasks, i.e. nr_running < group_capacity_factor. The
* extra check prevents the case where you always pull from the
* heaviest group when it is already under-utilized (possible
* with a large weight task outweighs the tasks on the system).
*/
- if (prefer_sibling && !local_group && sds->this_has_capacity)
- sgs.group_capacity = min(sgs.group_capacity, 1UL);
+ if (prefer_sibling && sds->local &&
+ sds->local_stat.group_has_free_capacity)
+ sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
- if (local_group) {
- sds->this_load = sgs.avg_load;
- sds->this = sg;
- sds->this_nr_running = sgs.sum_nr_running;
- sds->this_load_per_task = sgs.sum_weighted_load;
- sds->this_has_capacity = sgs.group_has_capacity;
- sds->this_idle_cpus = sgs.idle_cpus;
- } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
- sds->max_load = sgs.avg_load;
+ if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
- sds->busiest_nr_running = sgs.sum_nr_running;
- sds->busiest_idle_cpus = sgs.idle_cpus;
- sds->busiest_group_capacity = sgs.group_capacity;
- sds->busiest_load_per_task = sgs.sum_weighted_load;
- sds->busiest_has_capacity = sgs.group_has_capacity;
- sds->busiest_group_weight = sgs.group_weight;
- sds->group_imb = sgs.group_imb;
+ sds->busiest_stat = *sgs;
}
- update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+next_group:
+ /* Now, start updating sd_lb_stats */
+ sds->total_load += sgs->group_load;
+ sds->total_capacity += sgs->group_capacity;
+
sg = sg->next;
- } while (sg != sd->groups);
+ } while (sg != env->sd->groups);
+
+ if (env->sd->flags & SD_NUMA)
+ env->fbq_type = fbq_classify_group(&sds->busiest_stat);
}
/**
@@ -4054,32 +6068,30 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
* assuming lower CPU number will be equivalent to lower a SMT thread
* number.
*
- * Returns 1 when packing is required and a task should be moved to
+ * Return: 1 when packing is required and a task should be moved to
* this CPU. The amount of the imbalance is returned in *imbalance.
*
- * @sd: The sched_domain whose packing is to be checked.
+ * @env: The load balancing environment.
* @sds: Statistics of the sched_domain which is to be packed
- * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
- * @imbalance: returns amount of imbalanced due to packing.
*/
-static int check_asym_packing(struct sched_domain *sd,
- struct sd_lb_stats *sds,
- int this_cpu, unsigned long *imbalance)
+static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
{
int busiest_cpu;
- if (!(sd->flags & SD_ASYM_PACKING))
+ if (!(env->sd->flags & SD_ASYM_PACKING))
return 0;
if (!sds->busiest)
return 0;
busiest_cpu = group_first_cpu(sds->busiest);
- if (this_cpu > busiest_cpu)
+ if (env->dst_cpu > busiest_cpu)
return 0;
- *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
- SCHED_POWER_SCALE);
+ env->imbalance = DIV_ROUND_CLOSEST(
+ sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
+ SCHED_CAPACITY_SCALE);
+
return 1;
}
@@ -4087,110 +6099,117 @@ static int check_asym_packing(struct sched_domain *sd,
* fix_small_imbalance - Calculate the minor imbalance that exists
* amongst the groups of a sched_domain, during
* load balancing.
+ * @env: The load balancing environment.
* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
- * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
- * @imbalance: Variable to store the imbalance.
*/
-static inline void fix_small_imbalance(struct sd_lb_stats *sds,
- int this_cpu, unsigned long *imbalance)
+static inline
+void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
- unsigned long tmp, pwr_now = 0, pwr_move = 0;
+ unsigned long tmp, capa_now = 0, capa_move = 0;
unsigned int imbn = 2;
unsigned long scaled_busy_load_per_task;
+ struct sg_lb_stats *local, *busiest;
- if (sds->this_nr_running) {
- sds->this_load_per_task /= sds->this_nr_running;
- if (sds->busiest_load_per_task >
- sds->this_load_per_task)
- imbn = 1;
- } else
- sds->this_load_per_task =
- cpu_avg_load_per_task(this_cpu);
+ local = &sds->local_stat;
+ busiest = &sds->busiest_stat;
+
+ if (!local->sum_nr_running)
+ local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
+ else if (busiest->load_per_task > local->load_per_task)
+ imbn = 1;
- scaled_busy_load_per_task = sds->busiest_load_per_task
- * SCHED_POWER_SCALE;
- scaled_busy_load_per_task /= sds->busiest->sgp->power;
+ scaled_busy_load_per_task =
+ (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
+ busiest->group_capacity;
- if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
- (scaled_busy_load_per_task * imbn)) {
- *imbalance = sds->busiest_load_per_task;
+ if (busiest->avg_load + scaled_busy_load_per_task >=
+ local->avg_load + (scaled_busy_load_per_task * imbn)) {
+ env->imbalance = busiest->load_per_task;
return;
}
/*
* OK, we don't have enough imbalance to justify moving tasks,
- * however we may be able to increase total CPU power used by
+ * however we may be able to increase total CPU capacity used by
* moving them.
*/
- pwr_now += sds->busiest->sgp->power *
- min(sds->busiest_load_per_task, sds->max_load);
- pwr_now += sds->this->sgp->power *
- min(sds->this_load_per_task, sds->this_load);
- pwr_now /= SCHED_POWER_SCALE;
+ capa_now += busiest->group_capacity *
+ min(busiest->load_per_task, busiest->avg_load);
+ capa_now += local->group_capacity *
+ min(local->load_per_task, local->avg_load);
+ capa_now /= SCHED_CAPACITY_SCALE;
/* Amount of load we'd subtract */
- tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
- sds->busiest->sgp->power;
- if (sds->max_load > tmp)
- pwr_move += sds->busiest->sgp->power *
- min(sds->busiest_load_per_task, sds->max_load - tmp);
+ if (busiest->avg_load > scaled_busy_load_per_task) {
+ capa_move += busiest->group_capacity *
+ min(busiest->load_per_task,
+ busiest->avg_load - scaled_busy_load_per_task);
+ }
/* Amount of load we'd add */
- if (sds->max_load * sds->busiest->sgp->power <
- sds->busiest_load_per_task * SCHED_POWER_SCALE)
- tmp = (sds->max_load * sds->busiest->sgp->power) /
- sds->this->sgp->power;
- else
- tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
- sds->this->sgp->power;
- pwr_move += sds->this->sgp->power *
- min(sds->this_load_per_task, sds->this_load + tmp);
- pwr_move /= SCHED_POWER_SCALE;
+ if (busiest->avg_load * busiest->group_capacity <
+ busiest->load_per_task * SCHED_CAPACITY_SCALE) {
+ tmp = (busiest->avg_load * busiest->group_capacity) /
+ local->group_capacity;
+ } else {
+ tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
+ local->group_capacity;
+ }
+ capa_move += local->group_capacity *
+ min(local->load_per_task, local->avg_load + tmp);
+ capa_move /= SCHED_CAPACITY_SCALE;
/* Move if we gain throughput */
- if (pwr_move > pwr_now)
- *imbalance = sds->busiest_load_per_task;
+ if (capa_move > capa_now)
+ env->imbalance = busiest->load_per_task;
}
/**
* calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance.
+ * @env: load balance environment
* @sds: statistics of the sched_domain whose imbalance is to be calculated.
- * @this_cpu: Cpu for which currently load balance is being performed.
- * @imbalance: The variable to store the imbalance.
*/
-static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
- unsigned long *imbalance)
+static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
unsigned long max_pull, load_above_capacity = ~0UL;
+ struct sg_lb_stats *local, *busiest;
- sds->busiest_load_per_task /= sds->busiest_nr_running;
- if (sds->group_imb) {
- sds->busiest_load_per_task =
- min(sds->busiest_load_per_task, sds->avg_load);
+ local = &sds->local_stat;
+ busiest = &sds->busiest_stat;
+
+ if (busiest->group_imb) {
+ /*
+ * In the group_imb case we cannot rely on group-wide averages
+ * to ensure cpu-load equilibrium, look at wider averages. XXX
+ */
+ busiest->load_per_task =
+ min(busiest->load_per_task, sds->avg_load);
}
/*
* In the presence of smp nice balancing, certain scenarios can have
* max load less than avg load(as we skip the groups at or below
- * its cpu_power, while calculating max_load..)
+ * its cpu_capacity, while calculating max_load..)
*/
- if (sds->max_load < sds->avg_load) {
- *imbalance = 0;
- return fix_small_imbalance(sds, this_cpu, imbalance);
+ if (busiest->avg_load <= sds->avg_load ||
+ local->avg_load >= sds->avg_load) {
+ env->imbalance = 0;
+ return fix_small_imbalance(env, sds);
}
- if (!sds->group_imb) {
+ if (!busiest->group_imb) {
/*
* Don't want to pull so many tasks that a group would go idle.
+ * Except of course for the group_imb case, since then we might
+ * have to drop below capacity to reach cpu-load equilibrium.
*/
- load_above_capacity = (sds->busiest_nr_running -
- sds->busiest_group_capacity);
+ load_above_capacity =
+ (busiest->sum_nr_running - busiest->group_capacity_factor);
- load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
-
- load_above_capacity /= sds->busiest->sgp->power;
+ load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
+ load_above_capacity /= busiest->group_capacity;
}
/*
@@ -4200,15 +6219,14 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
* we also don't want to reduce the group load below the group capacity
* (so that we can implement power-savings policies etc). Thus we look
* for the minimum possible imbalance.
- * Be careful of negative numbers as they'll appear as very large values
- * with unsigned longs.
*/
- max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
+ max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * sds->busiest->sgp->power,
- (sds->avg_load - sds->this_load) * sds->this->sgp->power)
- / SCHED_POWER_SCALE;
+ env->imbalance = min(
+ max_pull * busiest->group_capacity,
+ (sds->avg_load - local->avg_load) * local->group_capacity
+ ) / SCHED_CAPACITY_SCALE;
/*
* if *imbalance is less than the average load per runnable task
@@ -4216,9 +6234,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
* a think about bumping its value to force at least one task to be
* moved
*/
- if (*imbalance < sds->busiest_load_per_task)
- return fix_small_imbalance(sds, this_cpu, imbalance);
-
+ if (env->imbalance < busiest->load_per_task)
+ return fix_small_imbalance(env, sds);
}
/******* find_busiest_group() helpers end here *********************/
@@ -4233,159 +6250,163 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
* Also calculates the amount of weighted load which should be moved
* to restore balance.
*
- * @sd: The sched_domain whose busiest group is to be returned.
- * @this_cpu: The cpu for which load balancing is currently being performed.
- * @imbalance: Variable which stores amount of weighted load which should
- * be moved to restore balance/put a group to idle.
- * @idle: The idle status of this_cpu.
- * @cpus: The set of CPUs under consideration for load-balancing.
- * @balance: Pointer to a variable indicating if this_cpu
- * is the appropriate cpu to perform load balancing at this_level.
+ * @env: The load balancing environment.
*
- * Returns: - the busiest group if imbalance exists.
+ * Return: - The busiest group if imbalance exists.
* - If no imbalance and user has opted for power-savings balance,
* return the least loaded group whose CPUs can be
* put to idle by rebalancing its tasks onto our group.
*/
-static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum cpu_idle_type idle,
- const struct cpumask *cpus, int *balance)
+static struct sched_group *find_busiest_group(struct lb_env *env)
{
+ struct sg_lb_stats *local, *busiest;
struct sd_lb_stats sds;
- memset(&sds, 0, sizeof(sds));
+ init_sd_lb_stats(&sds);
/*
* Compute the various statistics relavent for load balancing at
* this level.
*/
- update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
+ update_sd_lb_stats(env, &sds);
+ local = &sds.local_stat;
+ busiest = &sds.busiest_stat;
- /*
- * this_cpu is not the appropriate cpu to perform load balancing at
- * this level.
- */
- if (!(*balance))
- goto ret;
-
- if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
- check_asym_packing(sd, &sds, this_cpu, imbalance))
+ if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
+ check_asym_packing(env, &sds))
return sds.busiest;
/* There is no busy sibling group to pull tasks from */
- if (!sds.busiest || sds.busiest_nr_running == 0)
+ if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
- sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
+ sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
+ / sds.total_capacity;
/*
* If the busiest group is imbalanced the below checks don't
- * work because they assumes all things are equal, which typically
+ * work because they assume all things are equal, which typically
* isn't true due to cpus_allowed constraints and the like.
*/
- if (sds.group_imb)
+ if (busiest->group_imb)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
- !sds.busiest_has_capacity)
+ if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
+ !busiest->group_has_free_capacity)
goto force_balance;
/*
* If the local group is more busy than the selected busiest group
* don't try and pull any tasks.
*/
- if (sds.this_load >= sds.max_load)
+ if (local->avg_load >= busiest->avg_load)
goto out_balanced;
/*
* Don't pull any tasks if this group is already above the domain
* average load.
*/
- if (sds.this_load >= sds.avg_load)
+ if (local->avg_load >= sds.avg_load)
goto out_balanced;
- if (idle == CPU_IDLE) {
+ if (env->idle == CPU_IDLE) {
/*
* This cpu is idle. If the busiest group load doesn't
* have more tasks than the number of available cpu's and
* there is no imbalance between this and busiest group
* wrt to idle cpu's, it is balanced.
*/
- if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
- sds.busiest_nr_running <= sds.busiest_group_weight)
+ if ((local->idle_cpus < busiest->idle_cpus) &&
+ busiest->sum_nr_running <= busiest->group_weight)
goto out_balanced;
} else {
/*
* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
* imbalance_pct to be conservative.
*/
- if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+ if (100 * busiest->avg_load <=
+ env->sd->imbalance_pct * local->avg_load)
goto out_balanced;
}
force_balance:
/* Looks like there is an imbalance. Compute it */
- calculate_imbalance(&sds, this_cpu, imbalance);
+ calculate_imbalance(env, &sds);
return sds.busiest;
out_balanced:
- /*
- * There is no obvious imbalance. But check if we can do some balancing
- * to save power.
- */
- if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
- return sds.busiest;
-ret:
- *imbalance = 0;
+ env->imbalance = 0;
return NULL;
}
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
-static struct rq *
-find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
- enum cpu_idle_type idle, unsigned long imbalance,
- const struct cpumask *cpus)
+static struct rq *find_busiest_queue(struct lb_env *env,
+ struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
- unsigned long max_load = 0;
+ unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
- for_each_cpu(i, sched_group_cpus(group)) {
- unsigned long power = power_of(i);
- unsigned long capacity = DIV_ROUND_CLOSEST(power,
- SCHED_POWER_SCALE);
- unsigned long wl;
+ for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+ unsigned long capacity, capacity_factor, wl;
+ enum fbq_type rt;
- if (!capacity)
- capacity = fix_small_capacity(sd, group);
+ rq = cpu_rq(i);
+ rt = fbq_classify_rq(rq);
- if (!cpumask_test_cpu(i, cpus))
+ /*
+ * We classify groups/runqueues into three groups:
+ * - regular: there are !numa tasks
+ * - remote: there are numa tasks that run on the 'wrong' node
+ * - all: there is no distinction
+ *
+ * In order to avoid migrating ideally placed numa tasks,
+ * ignore those when there's better options.
+ *
+ * If we ignore the actual busiest queue to migrate another
+ * task, the next balance pass can still reduce the busiest
+ * queue by moving tasks around inside the node.
+ *
+ * If we cannot move enough load due to this classification
+ * the next pass will adjust the group classification and
+ * allow migration of more tasks.
+ *
+ * Both cases only affect the total convergence complexity.
+ */
+ if (rt > env->fbq_type)
continue;
- rq = cpu_rq(i);
+ capacity = capacity_of(i);
+ capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
+ if (!capacity_factor)
+ capacity_factor = fix_small_capacity(env->sd, group);
+
wl = weighted_cpuload(i);
/*
* When comparing with imbalance, use weighted_cpuload()
- * which is not scaled with the cpu power.
+ * which is not scaled with the cpu capacity.
*/
- if (capacity && rq->nr_running == 1 && wl > imbalance)
+ if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
continue;
/*
* For the load comparisons with the other cpu's, consider
- * the weighted_cpuload() scaled with the cpu power, so that
- * the load can be moved away from the cpu that is potentially
- * running at a lower capacity.
+ * the weighted_cpuload() scaled with the cpu capacity, so
+ * that the load can be moved away from the cpu that is
+ * potentially running at a lower capacity.
+ *
+ * Thus we're looking for max(wl_i / capacity_i), crosswise
+ * multiplication to rid ourselves of the division works out
+ * to: wl_i * capacity_j > wl_j * capacity_i; where j is
+ * our previous maximum.
*/
- wl = (wl * SCHED_POWER_SCALE) / power;
-
- if (wl > max_load) {
- max_load = wl;
+ if (wl * busiest_capacity > busiest_load * capacity) {
+ busiest_load = wl;
+ busiest_capacity = capacity;
busiest = rq;
}
}
@@ -4400,42 +6421,21 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
#define MAX_PINNED_INTERVAL 512
/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-static int need_active_balance(struct sched_domain *sd, int idle,
- int busiest_cpu, int this_cpu)
+static int need_active_balance(struct lb_env *env)
{
- if (idle == CPU_NEWLY_IDLE) {
+ struct sched_domain *sd = env->sd;
+
+ if (env->idle == CPU_NEWLY_IDLE) {
/*
* ASYM_PACKING needs to force migrate tasks from busy but
* higher numbered CPUs in order to pack all tasks in the
* lowest numbered CPUs.
*/
- if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+ if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
return 1;
-
- /*
- * The only task running in a non-idle cpu can be moved to this
- * cpu in an attempt to completely freeup the other CPU
- * package.
- *
- * The package power saving logic comes from
- * find_busiest_group(). If there are no imbalance, then
- * f_b_g() will return NULL. However when sched_mc={1,2} then
- * f_b_g() will select a group from which a running task may be
- * pulled to this cpu in order to make the other package idle.
- * If there is no opportunity to make a package idle and if
- * there are no imbalance, then f_b_g() will return NULL and no
- * action will be taken in load_balance_newidle().
- *
- * Under normal task pull operation due to imbalance, there
- * will be more than one task in the source run queue and
- * move_tasks() will succeed. ld_moved will be true and this
- * active balance code will not be triggered.
- */
- if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
- return 0;
}
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
@@ -4443,46 +6443,98 @@ static int need_active_balance(struct sched_domain *sd, int idle,
static int active_load_balance_cpu_stop(void *data);
+static int should_we_balance(struct lb_env *env)
+{
+ struct sched_group *sg = env->sd->groups;
+ struct cpumask *sg_cpus, *sg_mask;
+ int cpu, balance_cpu = -1;
+
+ /*
+ * In the newly idle case, we will allow all the cpu's
+ * to do the newly idle load balance.
+ */
+ if (env->idle == CPU_NEWLY_IDLE)
+ return 1;
+
+ sg_cpus = sched_group_cpus(sg);
+ sg_mask = sched_group_mask(sg);
+ /* Try to find first idle cpu */
+ for_each_cpu_and(cpu, sg_cpus, env->cpus) {
+ if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+ continue;
+
+ balance_cpu = cpu;
+ break;
+ }
+
+ if (balance_cpu == -1)
+ balance_cpu = group_balance_cpu(sg);
+
+ /*
+ * First idle cpu or the first cpu(busiest) in this sched group
+ * is eligible for doing load balancing at this and above domains.
+ */
+ return balance_cpu == env->dst_cpu;
+}
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *balance)
+ int *continue_balancing)
{
- int ld_moved, lb_flags = 0, active_balance = 0;
+ int ld_moved, cur_ld_moved, active_balance = 0;
+ struct sched_domain *sd_parent = sd->parent;
struct sched_group *group;
- unsigned long imbalance;
struct rq *busiest;
unsigned long flags;
- struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
+ struct cpumask *cpus = __get_cpu_var(load_balance_mask);
+
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = this_cpu,
+ .dst_rq = this_rq,
+ .dst_grpmask = sched_group_cpus(sd->groups),
+ .idle = idle,
+ .loop_break = sched_nr_migrate_break,
+ .cpus = cpus,
+ .fbq_type = all,
+ };
+
+ /*
+ * For NEWLY_IDLE load_balancing, we don't need to consider
+ * other cpus in our group
+ */
+ if (idle == CPU_NEWLY_IDLE)
+ env.dst_grpmask = NULL;
cpumask_copy(cpus, cpu_active_mask);
schedstat_inc(sd, lb_count[idle]);
redo:
- group = find_busiest_group(sd, this_cpu, &imbalance, idle,
- cpus, balance);
-
- if (*balance == 0)
+ if (!should_we_balance(&env)) {
+ *continue_balancing = 0;
goto out_balanced;
+ }
+ group = find_busiest_group(&env);
if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
}
- busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
+ busiest = find_busiest_queue(&env, group);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
}
- BUG_ON(busiest == this_rq);
+ BUG_ON(busiest == env.dst_rq);
- schedstat_add(sd, lb_imbalance[idle], imbalance);
+ schedstat_add(sd, lb_imbalance[idle], env.imbalance);
ld_moved = 0;
if (busiest->nr_running > 1) {
@@ -4492,35 +6544,92 @@ redo:
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
- lb_flags |= LBF_ALL_PINNED;
+ env.flags |= LBF_ALL_PINNED;
+ env.src_cpu = busiest->cpu;
+ env.src_rq = busiest;
+ env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
+
+more_balance:
local_irq_save(flags);
- double_rq_lock(this_rq, busiest);
- ld_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, idle, &lb_flags);
- double_rq_unlock(this_rq, busiest);
+ double_rq_lock(env.dst_rq, busiest);
+
+ /*
+ * cur_ld_moved - load moved in current iteration
+ * ld_moved - cumulative load moved across iterations
+ */
+ cur_ld_moved = move_tasks(&env);
+ ld_moved += cur_ld_moved;
+ double_rq_unlock(env.dst_rq, busiest);
local_irq_restore(flags);
/*
* some other cpu did the load balance for us.
*/
- if (ld_moved && this_cpu != smp_processor_id())
- resched_cpu(this_cpu);
+ if (cur_ld_moved && env.dst_cpu != smp_processor_id())
+ resched_cpu(env.dst_cpu);
- if (lb_flags & LBF_ABORT)
- goto out_balanced;
+ if (env.flags & LBF_NEED_BREAK) {
+ env.flags &= ~LBF_NEED_BREAK;
+ goto more_balance;
+ }
- if (lb_flags & LBF_NEED_BREAK) {
- lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
- if (lb_flags & LBF_ABORT)
- goto out_balanced;
- goto redo;
+ /*
+ * Revisit (affine) tasks on src_cpu that couldn't be moved to
+ * us and move them to an alternate dst_cpu in our sched_group
+ * where they can run. The upper limit on how many times we
+ * iterate on same src_cpu is dependent on number of cpus in our
+ * sched_group.
+ *
+ * This changes load balance semantics a bit on who can move
+ * load to a given_cpu. In addition to the given_cpu itself
+ * (or a ilb_cpu acting on its behalf where given_cpu is
+ * nohz-idle), we now have balance_cpu in a position to move
+ * load to given_cpu. In rare situations, this may cause
+ * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+ * _independently_ and at _same_ time to move some load to
+ * given_cpu) causing exceess load to be moved to given_cpu.
+ * This however should not happen so much in practice and
+ * moreover subsequent load balance cycles should correct the
+ * excess load moved.
+ */
+ if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
+
+ /* Prevent to re-select dst_cpu via env's cpus */
+ cpumask_clear_cpu(env.dst_cpu, env.cpus);
+
+ env.dst_rq = cpu_rq(env.new_dst_cpu);
+ env.dst_cpu = env.new_dst_cpu;
+ env.flags &= ~LBF_DST_PINNED;
+ env.loop = 0;
+ env.loop_break = sched_nr_migrate_break;
+
+ /*
+ * Go back to "more_balance" rather than "redo" since we
+ * need to continue with same src_cpu.
+ */
+ goto more_balance;
+ }
+
+ /*
+ * We failed to reach balance because of affinity.
+ */
+ if (sd_parent) {
+ int *group_imbalance = &sd_parent->groups->sgc->imbalance;
+
+ if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+ *group_imbalance = 1;
+ } else if (*group_imbalance)
+ *group_imbalance = 0;
}
/* All tasks on this runqueue were pinned by CPU affinity */
- if (unlikely(lb_flags & LBF_ALL_PINNED)) {
+ if (unlikely(env.flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
- if (!cpumask_empty(cpus))
+ if (!cpumask_empty(cpus)) {
+ env.loop = 0;
+ env.loop_break = sched_nr_migrate_break;
goto redo;
+ }
goto out_balanced;
}
}
@@ -4536,7 +6645,7 @@ redo:
if (idle != CPU_NEWLY_IDLE)
sd->nr_balance_failed++;
- if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
+ if (need_active_balance(&env)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the active_load_balance_cpu_stop,
@@ -4547,7 +6656,7 @@ redo:
tsk_cpus_allowed(busiest->curr))) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
- lb_flags |= LBF_ALL_PINNED;
+ env.flags |= LBF_ALL_PINNED;
goto out_one_pinned;
}
@@ -4563,10 +6672,11 @@ redo:
}
raw_spin_unlock_irqrestore(&busiest->lock, flags);
- if (active_balance)
+ if (active_balance) {
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
+ }
/*
* We've kicked active balancing, reset the failure
@@ -4600,7 +6710,7 @@ out_balanced:
out_one_pinned:
/* tune up the balancing interval */
- if (((lb_flags & LBF_ALL_PINNED) &&
+ if (((env.flags & LBF_ALL_PINNED) &&
sd->balance_interval < MAX_PINNED_INTERVAL) ||
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
@@ -4610,60 +6720,135 @@ out:
return ld_moved;
}
+static inline unsigned long
+get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
+{
+ unsigned long interval = sd->balance_interval;
+
+ if (cpu_busy)
+ interval *= sd->busy_factor;
+
+ /* scale ms to jiffies */
+ interval = msecs_to_jiffies(interval);
+ interval = clamp(interval, 1UL, max_load_balance_interval);
+
+ return interval;
+}
+
+static inline void
+update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+{
+ unsigned long interval, next;
+
+ interval = get_sd_balance_interval(sd, cpu_busy);
+ next = sd->last_balance + interval;
+
+ if (time_after(*next_balance, next))
+ *next_balance = next;
+}
+
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
-void idle_balance(int this_cpu, struct rq *this_rq)
+static int idle_balance(struct rq *this_rq)
{
+ unsigned long next_balance = jiffies + HZ;
+ int this_cpu = this_rq->cpu;
struct sched_domain *sd;
int pulled_task = 0;
- unsigned long next_balance = jiffies + HZ;
+ u64 curr_cost = 0;
- this_rq->idle_stamp = this_rq->clock;
+ idle_enter_fair(this_rq);
- if (this_rq->avg_idle < sysctl_sched_migration_cost)
- return;
+ /*
+ * We must set idle_stamp _before_ calling idle_balance(), such that we
+ * measure the duration of idle_balance() as idle time.
+ */
+ this_rq->idle_stamp = rq_clock(this_rq);
+
+ if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(this_rq->sd);
+ if (sd)
+ update_next_balance(sd, 0, &next_balance);
+ rcu_read_unlock();
+
+ goto out;
+ }
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
*/
raw_spin_unlock(&this_rq->lock);
- update_shares(this_cpu);
+ update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
- unsigned long interval;
- int balance = 1;
+ int continue_balancing = 1;
+ u64 t0, domain_cost;
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
+ if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
+ update_next_balance(sd, 0, &next_balance);
+ break;
+ }
+
if (sd->flags & SD_BALANCE_NEWIDLE) {
- /* If we've pulled tasks over stop searching: */
+ t0 = sched_clock_cpu(this_cpu);
+
pulled_task = load_balance(this_cpu, this_rq,
- sd, CPU_NEWLY_IDLE, &balance);
+ sd, CPU_NEWLY_IDLE,
+ &continue_balancing);
+
+ domain_cost = sched_clock_cpu(this_cpu) - t0;
+ if (domain_cost > sd->max_newidle_lb_cost)
+ sd->max_newidle_lb_cost = domain_cost;
+
+ curr_cost += domain_cost;
}
- interval = msecs_to_jiffies(sd->balance_interval);
- if (time_after(next_balance, sd->last_balance + interval))
- next_balance = sd->last_balance + interval;
- if (pulled_task) {
- this_rq->idle_stamp = 0;
+ update_next_balance(sd, 0, &next_balance);
+
+ /*
+ * Stop searching for tasks to pull if there are
+ * now runnable tasks on this rq.
+ */
+ if (pulled_task || this_rq->nr_running > 0)
break;
- }
}
rcu_read_unlock();
raw_spin_lock(&this_rq->lock);
- if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
- /*
- * We are going idle. next_balance may be set based on
- * a busy processor. So reset next_balance.
- */
+ if (curr_cost > this_rq->max_idle_balance_cost)
+ this_rq->max_idle_balance_cost = curr_cost;
+
+ /*
+ * While browsing the domains, we released the rq lock, a task could
+ * have been enqueued in the meantime. Since we're not going idle,
+ * pretend we pulled a task.
+ */
+ if (this_rq->cfs.h_nr_running && !pulled_task)
+ pulled_task = 1;
+
+out:
+ /* Move the next balance forward */
+ if (time_after(this_rq->next_balance, next_balance))
this_rq->next_balance = next_balance;
+
+ /* Is there a task of a high priority class? */
+ if (this_rq->nr_running != this_rq->cfs.h_nr_running)
+ pulled_task = -1;
+
+ if (pulled_task) {
+ idle_exit_fair(this_rq);
+ this_rq->idle_stamp = 0;
}
+
+ return pulled_task;
}
/*
@@ -4710,10 +6895,18 @@ static int active_load_balance_cpu_stop(void *data)
}
if (likely(sd)) {
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ };
+
schedstat_inc(sd, alb_count);
- if (move_one_task(target_rq, target_cpu, busiest_rq,
- sd, CPU_IDLE))
+ if (move_one_task(&env))
schedstat_inc(sd, alb_pushed);
else
schedstat_inc(sd, alb_failed);
@@ -4726,7 +6919,12 @@ out_unlock:
return 0;
}
-#ifdef CONFIG_NO_HZ
+static inline int on_null_domain(struct rq *rq)
+{
+ return unlikely(!rcu_dereference_sched(rq->sd));
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
/*
* idle load balancing details
* - When one of the busy CPUs notice that there may be an idle rebalancing
@@ -4739,117 +6937,28 @@ static struct {
unsigned long next_balance; /* in jiffy units */
} nohz ____cacheline_aligned;
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * lowest_flag_domain - Return lowest sched_domain containing flag.
- * @cpu: The cpu whose lowest level of sched domain is to
- * be returned.
- * @flag: The flag to check for the lowest sched_domain
- * for the given cpu.
- *
- * Returns the lowest sched_domain of a cpu which contains the given flag.
- */
-static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
-{
- struct sched_domain *sd;
-
- for_each_domain(cpu, sd)
- if (sd->flags & flag)
- break;
-
- return sd;
-}
-
-/**
- * for_each_flag_domain - Iterates over sched_domains containing the flag.
- * @cpu: The cpu whose domains we're iterating over.
- * @sd: variable holding the value of the power_savings_sd
- * for cpu.
- * @flag: The flag to filter the sched_domains to be iterated.
- *
- * Iterates over all the scheduler domains for a given cpu that has the 'flag'
- * set, starting from the lowest sched_domain to the highest.
- */
-#define for_each_flag_domain(cpu, sd, flag) \
- for (sd = lowest_flag_domain(cpu, flag); \
- (sd && (sd->flags & flag)); sd = sd->parent)
-
-/**
- * find_new_ilb - Finds the optimum idle load balancer for nomination.
- * @cpu: The cpu which is nominating a new idle_load_balancer.
- *
- * Returns: Returns the id of the idle load balancer if it exists,
- * Else, returns >= nr_cpu_ids.
- *
- * This algorithm picks the idle load balancer such that it belongs to a
- * semi-idle powersavings sched_domain. The idea is to try and avoid
- * completely idle packages/cores just for the purpose of idle load balancing
- * when there are other idle cpu's which are better suited for that job.
- */
-static int find_new_ilb(int cpu)
+static inline int find_new_ilb(void)
{
int ilb = cpumask_first(nohz.idle_cpus_mask);
- struct sched_group *ilbg;
- struct sched_domain *sd;
-
- /*
- * Have idle load balancer selection from semi-idle packages only
- * when power-aware load balancing is enabled
- */
- if (!(sched_smt_power_savings || sched_mc_power_savings))
- goto out_done;
-
- /*
- * Optimize for the case when we have no idle CPUs or only one
- * idle CPU. Don't walk the sched_domain hierarchy in such cases
- */
- if (cpumask_weight(nohz.idle_cpus_mask) < 2)
- goto out_done;
-
- rcu_read_lock();
- for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
- ilbg = sd->groups;
-
- do {
- if (ilbg->group_weight !=
- atomic_read(&ilbg->sgp->nr_busy_cpus)) {
- ilb = cpumask_first_and(nohz.idle_cpus_mask,
- sched_group_cpus(ilbg));
- goto unlock;
- }
-
- ilbg = ilbg->next;
- } while (ilbg != sd->groups);
- }
-unlock:
- rcu_read_unlock();
-
-out_done:
if (ilb < nr_cpu_ids && idle_cpu(ilb))
return ilb;
return nr_cpu_ids;
}
-#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
-static inline int find_new_ilb(int call_cpu)
-{
- return nr_cpu_ids;
-}
-#endif
/*
* Kick a CPU to do the nohz balancing, if it is time for it. We pick the
* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
* CPU (if there is one).
*/
-static void nohz_balancer_kick(int cpu)
+static void nohz_balancer_kick(void)
{
int ilb_cpu;
nohz.next_balance++;
- ilb_cpu = find_new_ilb(cpu);
+ ilb_cpu = find_new_ilb();
if (ilb_cpu >= nr_cpu_ids)
return;
@@ -4866,11 +6975,16 @@ static void nohz_balancer_kick(int cpu)
return;
}
-static inline void clear_nohz_tick_stopped(int cpu)
+static inline void nohz_balance_exit_idle(int cpu)
{
if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
+ /*
+ * Completely isolated CPUs don't ever set, so we must test.
+ */
+ if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_dec(&nohz.nr_cpus);
+ }
clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
}
@@ -4880,13 +6994,15 @@ static inline void set_cpu_sd_state_busy(void)
struct sched_domain *sd;
int cpu = smp_processor_id();
- if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
- return;
- clear_bit(NOHZ_IDLE, nohz_flags(cpu));
-
rcu_read_lock();
- for_each_domain(cpu, sd)
- atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
+
+ if (!sd || !sd->nohz_idle)
+ goto unlock;
+ sd->nohz_idle = 0;
+
+ atomic_inc(&sd->groups->sgc->nr_busy_cpus);
+unlock:
rcu_read_unlock();
}
@@ -4895,47 +7011,50 @@ void set_cpu_sd_state_idle(void)
struct sched_domain *sd;
int cpu = smp_processor_id();
- if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
- return;
- set_bit(NOHZ_IDLE, nohz_flags(cpu));
-
rcu_read_lock();
- for_each_domain(cpu, sd)
- atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
+
+ if (!sd || sd->nohz_idle)
+ goto unlock;
+ sd->nohz_idle = 1;
+
+ atomic_dec(&sd->groups->sgc->nr_busy_cpus);
+unlock:
rcu_read_unlock();
}
/*
- * This routine will record that this cpu is going idle with tick stopped.
+ * This routine will record that the cpu is going idle with tick stopped.
* This info will be used in performing idle load balancing in the future.
*/
-void select_nohz_load_balancer(int stop_tick)
+void nohz_balance_enter_idle(int cpu)
{
- int cpu = smp_processor_id();
-
/*
* If this cpu is going down, then nothing needs to be done.
*/
if (!cpu_active(cpu))
return;
- if (stop_tick) {
- if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
- return;
+ if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
+ return;
- cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
- atomic_inc(&nohz.nr_cpus);
- set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
- }
- return;
+ /*
+ * If we're a completely isolated CPU, we don't play.
+ */
+ if (on_null_domain(cpu_rq(cpu)))
+ return;
+
+ cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_inc(&nohz.nr_cpus);
+ set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
-static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
+static int sched_ilb_notifier(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DYING:
- clear_nohz_tick_stopped(smp_processor_id());
+ nohz_balance_exit_idle(smp_processor_id());
return NOTIFY_OK;
default:
return NOTIFY_DONE;
@@ -4945,8 +7064,6 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
static DEFINE_SPINLOCK(balancing);
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
-
/*
* Scale the max load_balance interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
@@ -4960,50 +7077,69 @@ void update_max_interval(void)
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
- * Balancing parameters are set up in arch_init_sched_domains.
+ * Balancing parameters are set up in init_sched_domains.
*/
-static void rebalance_domains(int cpu, enum cpu_idle_type idle)
+static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
- int balance = 1;
- struct rq *rq = cpu_rq(cpu);
+ int continue_balancing = 1;
+ int cpu = rq->cpu;
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
- int need_serialize;
+ int need_serialize, need_decay = 0;
+ u64 max_cost = 0;
- update_shares(cpu);
+ update_blocked_averages(cpu);
rcu_read_lock();
for_each_domain(cpu, sd) {
+ /*
+ * Decay the newidle max times here because this is a regular
+ * visit to all the domains. Decay ~1% per second.
+ */
+ if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
+ sd->max_newidle_lb_cost =
+ (sd->max_newidle_lb_cost * 253) / 256;
+ sd->next_decay_max_lb_cost = jiffies + HZ;
+ need_decay = 1;
+ }
+ max_cost += sd->max_newidle_lb_cost;
+
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
- interval = sd->balance_interval;
- if (idle != CPU_IDLE)
- interval *= sd->busy_factor;
+ /*
+ * Stop the load balance at this level. There is another
+ * CPU in our sched group which is doing load balancing more
+ * actively.
+ */
+ if (!continue_balancing) {
+ if (need_decay)
+ continue;
+ break;
+ }
- /* scale ms to jiffies */
- interval = msecs_to_jiffies(interval);
- interval = clamp(interval, 1UL, max_load_balance_interval);
+ interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
need_serialize = sd->flags & SD_SERIALIZE;
-
if (need_serialize) {
if (!spin_trylock(&balancing))
goto out;
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(cpu, rq, sd, idle, &balance)) {
+ if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
/*
- * We've pulled tasks over so either we're no
- * longer idle.
+ * The LBF_DST_PINNED logic could have changed
+ * env->dst_cpu, so we can't know our idle
+ * state even if we migrated tasks. Update it.
*/
- idle = CPU_NOT_IDLE;
+ idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
}
sd->last_balance = jiffies;
+ interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
}
if (need_serialize)
spin_unlock(&balancing);
@@ -5012,14 +7148,14 @@ out:
next_balance = sd->last_balance + interval;
update_next_balance = 1;
}
-
+ }
+ if (need_decay) {
/*
- * Stop the load balance at this level. There is another
- * CPU in our sched group which is doing load balancing more
- * actively.
+ * Ensure the rq-wide value also decays but keep it at a
+ * reasonable floor to avoid funnies with rq->avg_idle.
*/
- if (!balance)
- break;
+ rq->max_idle_balance_cost =
+ max((u64)sysctl_sched_migration_cost, max_cost);
}
rcu_read_unlock();
@@ -5032,14 +7168,14 @@ out:
rq->next_balance = next_balance;
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
- * In CONFIG_NO_HZ case, the idle balance kickee will do the
+ * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
* rebalancing for all the cpus for whom scheduler ticks are stopped.
*/
-static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
- struct rq *this_rq = cpu_rq(this_cpu);
+ int this_cpu = this_rq->cpu;
struct rq *rq;
int balance_cpu;
@@ -5059,14 +7195,20 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
if (need_resched())
break;
- raw_spin_lock_irq(&this_rq->lock);
- update_rq_clock(this_rq);
- update_cpu_load(this_rq);
- raw_spin_unlock_irq(&this_rq->lock);
+ rq = cpu_rq(balance_cpu);
- rebalance_domains(balance_cpu, CPU_IDLE);
+ /*
+ * If time for next balance is due,
+ * do the balance.
+ */
+ if (time_after_eq(jiffies, rq->next_balance)) {
+ raw_spin_lock_irq(&rq->lock);
+ update_rq_clock(rq);
+ update_idle_cpu_load(rq);
+ raw_spin_unlock_irq(&rq->lock);
+ rebalance_domains(rq, CPU_IDLE);
+ }
- rq = cpu_rq(balance_cpu);
if (time_after(this_rq->next_balance, rq->next_balance))
this_rq->next_balance = rq->next_balance;
}
@@ -5080,16 +7222,18 @@ end:
* of an idle cpu is the system.
* - This rq has more than one task.
* - At any scheduler domain level, this cpu's scheduler group has multiple
- * busy cpu's exceeding the group's power.
+ * busy cpu's exceeding the group's capacity.
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline int nohz_kick_needed(struct rq *rq, int cpu)
+static inline int nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
struct sched_domain *sd;
+ struct sched_group_capacity *sgc;
+ int nr_busy, cpu = rq->cpu;
- if (unlikely(idle_cpu(cpu)))
+ if (unlikely(rq->idle_balance))
return 0;
/*
@@ -5097,7 +7241,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
* busy tick after returning from idle, we will update the busy stats.
*/
set_cpu_sd_state_busy();
- clear_nohz_tick_stopped(cpu);
+ nohz_balance_exit_idle(cpu);
/*
* None are in tickless mode and hence no need for NOHZ idle load
@@ -5113,22 +7257,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
goto need_kick;
rcu_read_lock();
- for_each_domain(cpu, sd) {
- struct sched_group *sg = sd->groups;
- struct sched_group_power *sgp = sg->sgp;
- int nr_busy = atomic_read(&sgp->nr_busy_cpus);
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
- if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
- goto need_kick_unlock;
+ if (sd) {
+ sgc = sd->groups->sgc;
+ nr_busy = atomic_read(&sgc->nr_busy_cpus);
- if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
- && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu))
+ if (nr_busy > 1)
goto need_kick_unlock;
-
- if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
- break;
}
+
+ sd = rcu_dereference(per_cpu(sd_asym, cpu));
+
+ if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
+ sched_domain_span(sd)) < cpu))
+ goto need_kick_unlock;
+
rcu_read_unlock();
return 0;
@@ -5138,7 +7282,7 @@ need_kick:
return 1;
}
#else
-static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
#endif
/*
@@ -5147,38 +7291,34 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
*/
static void run_rebalance_domains(struct softirq_action *h)
{
- int this_cpu = smp_processor_id();
- struct rq *this_rq = cpu_rq(this_cpu);
+ struct rq *this_rq = this_rq();
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
- rebalance_domains(this_cpu, idle);
+ rebalance_domains(this_rq, idle);
/*
* If this cpu has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle cpus whose ticks are
* stopped.
*/
- nohz_idle_balance(this_cpu, idle);
-}
-
-static inline int on_null_domain(int cpu)
-{
- return !rcu_dereference_sched(cpu_rq(cpu)->sd);
+ nohz_idle_balance(this_rq, idle);
}
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/
-void trigger_load_balance(struct rq *rq, int cpu)
+void trigger_load_balance(struct rq *rq)
{
/* Don't need to rebalance while attached to NULL domain */
- if (time_after_eq(jiffies, rq->next_balance) &&
- likely(!on_null_domain(cpu)))
+ if (unlikely(on_null_domain(rq)))
+ return;
+
+ if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
-#ifdef CONFIG_NO_HZ
- if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
- nohz_balancer_kick(cpu);
+#ifdef CONFIG_NO_HZ_COMMON
+ if (nohz_kick_needed(rq))
+ nohz_balancer_kick();
#endif
}
@@ -5190,6 +7330,9 @@ static void rq_online_fair(struct rq *rq)
static void rq_offline_fair(struct rq *rq)
{
update_sysctl();
+
+ /* Ensure any throttled groups are reachable by pick_next_task */
+ unthrottle_offline_cfs_rqs(rq);
}
#endif /* CONFIG_SMP */
@@ -5206,6 +7349,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}
+
+ if (numabalancing_enabled)
+ task_tick_numa(rq, curr);
+
+ update_rq_runnable_avg(rq, 1);
}
/*
@@ -5228,11 +7376,15 @@ static void task_fork_fair(struct task_struct *p)
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
- if (unlikely(task_cpu(p) != this_cpu)) {
- rcu_read_lock();
- __set_task_cpu(p, this_cpu);
- rcu_read_unlock();
- }
+ /*
+ * Not only the cpu but also the task_group of the parent might have
+ * been changed after parent->se.parent,cfs_rq were copied to
+ * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
+ * of child point to valid ones.
+ */
+ rcu_read_lock();
+ __set_task_cpu(p, this_cpu);
+ rcu_read_unlock();
update_curr(cfs_rq);
@@ -5282,15 +7434,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
/*
- * Ensure the task's vruntime is normalized, so that when its
+ * Ensure the task's vruntime is normalized, so that when it's
* switched back to the fair class the enqueue_entity(.flags=0) will
* do the right thing.
*
- * If it was on_rq, then the dequeue_entity(.flags=0) will already
- * have normalized the vruntime, if it was !on_rq, then only when
+ * If it's on_rq, then the dequeue_entity(.flags=0) will already
+ * have normalized the vruntime, if it's !on_rq, then only when
* the task is sleeping will it still have non-normalized vruntime.
*/
- if (!se->on_rq && p->state != TASK_RUNNING) {
+ if (!p->on_rq && p->state != TASK_RUNNING) {
/*
* Fix up our vruntime so that the current sleep doesn't
* cause 'unlimited' sleep bonus.
@@ -5298,6 +7450,18 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
place_entity(cfs_rq, se, 0);
se->vruntime -= cfs_rq->min_vruntime;
}
+
+#ifdef CONFIG_SMP
+ /*
+ * Remove our load from contribution when we leave sched_fair
+ * and ensure we don't carry in an old decay_count if we
+ * switch back.
+ */
+ if (se->avg.decay_count) {
+ __synchronize_entity_decay(se);
+ subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+ }
+#endif
}
/*
@@ -5305,7 +7469,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
*/
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
- if (!p->se.on_rq)
+ struct sched_entity *se = &p->se;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Since the real-depth could have been changed (only FAIR
+ * class maintain depth value), reset depth properly.
+ */
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+#endif
+ if (!se->on_rq)
return;
/*
@@ -5340,16 +7512,22 @@ static void set_curr_task_fair(struct rq *rq)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT;
- INIT_LIST_HEAD(&cfs_rq->tasks);
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
+#ifdef CONFIG_SMP
+ atomic64_set(&cfs_rq->decay_counter, 1);
+ atomic_long_set(&cfs_rq->removed_load, 0);
+#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq;
+
/*
* If the task was not on the rq at the time of this cgroup movement
* it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5375,14 +7553,26 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
* To prevent boost or penalty in the new cfs_rq caused by delta
* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
*/
- if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
+ if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
on_rq = 1;
if (!on_rq)
- p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+ se->vruntime -= cfs_rq_of(se)->min_vruntime;
set_task_rq(p, task_cpu(p));
- if (!on_rq)
- p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+ if (!on_rq) {
+ cfs_rq = cfs_rq_of(se);
+ se->vruntime += cfs_rq->min_vruntime;
+#ifdef CONFIG_SMP
+ /*
+ * migrate_task_rq_fair() will have removed our previous
+ * contribution, but we must synchronize for ongoing future
+ * decay.
+ */
+ se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+#endif
+ }
}
void free_fair_sched_group(struct task_group *tg)
@@ -5467,10 +7657,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
cfs_rq->tg = tg;
cfs_rq->rq = rq;
-#ifdef CONFIG_SMP
- /* allow initial update_cfs_load() to truncate */
- cfs_rq->load_stamp = 1;
-#endif
init_cfs_rq_runtime(cfs_rq);
tg->cfs_rq[cpu] = cfs_rq;
@@ -5480,13 +7666,17 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
if (!se)
return;
- if (!parent)
+ if (!parent) {
se->cfs_rq = &rq->cfs;
- else
+ se->depth = 0;
+ } else {
se->cfs_rq = parent->my_q;
+ se->depth = parent->depth + 1;
+ }
se->my_q = cfs_rq;
- update_load_set(&se->load, 0);
+ /* guarantee group entities always have weight */
+ update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
}
@@ -5517,6 +7707,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
se = tg->se[i];
/* Propagate contribution to hierarchy */
raw_spin_lock_irqsave(&rq->lock, flags);
+
+ /* Possible calls to update_curr() need rq clock */
+ update_rq_clock(rq);
for_each_sched_entity(se)
update_cfs_shares(group_cfs_rq(se));
raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -5550,7 +7743,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
+ rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
return rr_interval;
}
@@ -5572,6 +7765,7 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
+ .migrate_task_rq = migrate_task_rq_fair,
.rq_online = rq_online_fair,
.rq_offline = rq_offline_fair,
@@ -5611,7 +7805,8 @@ __init void init_sched_fair_class(void)
#ifdef CONFIG_SMP
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
+ nohz.next_balance = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
cpu_notifier(sched_ilb_notifier, 0);
#endif
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index e61fd73913d..90284d117fe 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -12,14 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
SCHED_FEAT(START_DEBIT, true)
/*
- * Based on load and program behaviour, see if it makes sense to place
- * a newly woken task on the same cpu as the task that woke it --
- * improve cache locality. Typically used with SYNC wakeups as
- * generated by pipes and the like, see also SYNC_WAKEUPS.
- */
-SCHED_FEAT(AFFINE_WAKEUPS, true)
-
-/*
* Prefer to schedule the task we woke last (assuming it failed
* wakeup-preemption), since its likely going to consume data we
* touched, increases cache locality.
@@ -40,25 +32,23 @@ SCHED_FEAT(LAST_BUDDY, true)
SCHED_FEAT(CACHE_HOT_BUDDY, true)
/*
- * Use arch dependent cpu power functions
+ * Allow wakeup-time preemption of the current task:
*/
-SCHED_FEAT(ARCH_POWER, false)
+SCHED_FEAT(WAKEUP_PREEMPTION, true)
+
+/*
+ * Use arch dependent cpu capacity functions
+ */
+SCHED_FEAT(ARCH_CAPACITY, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, true)
/*
- * Spin-wait on mutex acquisition when the mutex owner is running on
- * another cpu -- assumes that when the owner is running, it will soon
- * release the lock. Decreases scheduling overhead.
- */
-SCHED_FEAT(OWNER_SPIN, true)
-
-/*
- * Decrement CPU power based on time not spent running tasks
+ * Decrement CPU capacity based on time not spent running tasks
*/
-SCHED_FEAT(NONTASK_POWER, true)
+SCHED_FEAT(NONTASK_CAPACITY, true)
/*
* Queue remote wakeups on the target CPU and process them
@@ -68,3 +58,28 @@ SCHED_FEAT(TTWU_QUEUE, true)
SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
+SCHED_FEAT(LB_MIN, false)
+
+/*
+ * Apply the automatic NUMA scheduling policy. Enabled automatically
+ * at runtime if running on a NUMA machine. Can be controlled via
+ * numa_balancing=
+ */
+#ifdef CONFIG_NUMA_BALANCING
+SCHED_FEAT(NUMA, false)
+
+/*
+ * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
+ * higher number of hinting faults are recorded during active load
+ * balancing.
+ */
+SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
+
+/*
+ * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
+ * lower number of hinting faults have been recorded. As this has
+ * the potential to prevent a task ever migrating to a new node
+ * due to CPU overload it is disabled by default.
+ */
+SCHED_FEAT(NUMA_RESIST_LOWER, false)
+#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
new file mode 100644
index 00000000000..cf009fb0bc2
--- /dev/null
+++ b/kernel/sched/idle.c
@@ -0,0 +1,273 @@
+/*
+ * Generic entry point for the idle threads
+ */
+#include <linux/sched.h>
+#include <linux/cpu.h>
+#include <linux/cpuidle.h>
+#include <linux/tick.h>
+#include <linux/mm.h>
+#include <linux/stackprotector.h>
+
+#include <asm/tlb.h>
+
+#include <trace/events/power.h>
+
+#include "sched.h"
+
+static int __read_mostly cpu_idle_force_poll;
+
+void cpu_idle_poll_ctrl(bool enable)
+{
+ if (enable) {
+ cpu_idle_force_poll++;
+ } else {
+ cpu_idle_force_poll--;
+ WARN_ON_ONCE(cpu_idle_force_poll < 0);
+ }
+}
+
+#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
+static int __init cpu_idle_poll_setup(char *__unused)
+{
+ cpu_idle_force_poll = 1;
+ return 1;
+}
+__setup("nohlt", cpu_idle_poll_setup);
+
+static int __init cpu_idle_nopoll_setup(char *__unused)
+{
+ cpu_idle_force_poll = 0;
+ return 1;
+}
+__setup("hlt", cpu_idle_nopoll_setup);
+#endif
+
+static inline int cpu_idle_poll(void)
+{
+ rcu_idle_enter();
+ trace_cpu_idle_rcuidle(0, smp_processor_id());
+ local_irq_enable();
+ while (!tif_need_resched())
+ cpu_relax();
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+ rcu_idle_exit();
+ return 1;
+}
+
+/* Weak implementations for optional arch specific functions */
+void __weak arch_cpu_idle_prepare(void) { }
+void __weak arch_cpu_idle_enter(void) { }
+void __weak arch_cpu_idle_exit(void) { }
+void __weak arch_cpu_idle_dead(void) { }
+void __weak arch_cpu_idle(void)
+{
+ cpu_idle_force_poll = 1;
+ local_irq_enable();
+}
+
+/**
+ * cpuidle_idle_call - the main idle function
+ *
+ * NOTE: no locks or semaphores should be used here
+ *
+ * On archs that support TIF_POLLING_NRFLAG, is called with polling
+ * set, and it returns with polling set. If it ever stops polling, it
+ * must clear the polling bit.
+ */
+static void cpuidle_idle_call(void)
+{
+ struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+ struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+ int next_state, entered_state;
+ bool broadcast;
+
+ /*
+ * Check if the idle task must be rescheduled. If it is the
+ * case, exit the function after re-enabling the local irq.
+ */
+ if (need_resched()) {
+ local_irq_enable();
+ return;
+ }
+
+ /*
+ * During the idle period, stop measuring the disabled irqs
+ * critical sections latencies
+ */
+ stop_critical_timings();
+
+ /*
+ * Tell the RCU framework we are entering an idle section,
+ * so no more rcu read side critical sections and one more
+ * step to the grace period
+ */
+ rcu_idle_enter();
+
+ /*
+ * Ask the cpuidle framework to choose a convenient idle state.
+ * Fall back to the default arch idle method on errors.
+ */
+ next_state = cpuidle_select(drv, dev);
+ if (next_state < 0) {
+use_default:
+ /*
+ * We can't use the cpuidle framework, let's use the default
+ * idle routine.
+ */
+ if (current_clr_polling_and_test())
+ local_irq_enable();
+ else
+ arch_cpu_idle();
+
+ goto exit_idle;
+ }
+
+
+ /*
+ * The idle task must be scheduled, it is pointless to
+ * go to idle, just update no idle residency and get
+ * out of this function
+ */
+ if (current_clr_polling_and_test()) {
+ dev->last_residency = 0;
+ entered_state = next_state;
+ local_irq_enable();
+ goto exit_idle;
+ }
+
+ broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
+
+ /*
+ * Tell the time framework to switch to a broadcast timer
+ * because our local timer will be shutdown. If a local timer
+ * is used from another cpu as a broadcast timer, this call may
+ * fail if it is not available
+ */
+ if (broadcast &&
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
+ goto use_default;
+
+ trace_cpu_idle_rcuidle(next_state, dev->cpu);
+
+ /*
+ * Enter the idle state previously returned by the governor decision.
+ * This function will block until an interrupt occurs and will take
+ * care of re-enabling the local interrupts
+ */
+ entered_state = cpuidle_enter(drv, dev, next_state);
+
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
+
+ if (broadcast)
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+
+ /*
+ * Give the governor an opportunity to reflect on the outcome
+ */
+ cpuidle_reflect(dev, entered_state);
+
+exit_idle:
+ __current_set_polling();
+
+ /*
+ * It is up to the idle functions to reenable local interrupts
+ */
+ if (WARN_ON_ONCE(irqs_disabled()))
+ local_irq_enable();
+
+ rcu_idle_exit();
+ start_critical_timings();
+}
+
+/*
+ * Generic idle loop implementation
+ *
+ * Called with polling cleared.
+ */
+static void cpu_idle_loop(void)
+{
+ while (1) {
+ /*
+ * If the arch has a polling bit, we maintain an invariant:
+ *
+ * Our polling bit is clear if we're not scheduled (i.e. if
+ * rq->curr != rq->idle). This means that, if rq->idle has
+ * the polling bit set, then setting need_resched is
+ * guaranteed to cause the cpu to reschedule.
+ */
+
+ __current_set_polling();
+ tick_nohz_idle_enter();
+
+ while (!need_resched()) {
+ check_pgt_cache();
+ rmb();
+
+ if (cpu_is_offline(smp_processor_id()))
+ arch_cpu_idle_dead();
+
+ local_irq_disable();
+ arch_cpu_idle_enter();
+
+ /*
+ * In poll mode we reenable interrupts and spin.
+ *
+ * Also if we detected in the wakeup from idle
+ * path that the tick broadcast device expired
+ * for us, we don't want to go deep idle as we
+ * know that the IPI is going to arrive right
+ * away
+ */
+ if (cpu_idle_force_poll || tick_check_broadcast_expired())
+ cpu_idle_poll();
+ else
+ cpuidle_idle_call();
+
+ arch_cpu_idle_exit();
+ }
+
+ /*
+ * Since we fell out of the loop above, we know
+ * TIF_NEED_RESCHED must be set, propagate it into
+ * PREEMPT_NEED_RESCHED.
+ *
+ * This is required because for polling idle loops we will
+ * not have had an IPI to fold the state for us.
+ */
+ preempt_set_need_resched();
+ tick_nohz_idle_exit();
+ __current_clr_polling();
+
+ /*
+ * We promise to call sched_ttwu_pending and reschedule
+ * if need_resched is set while polling is set. That
+ * means that clearing polling needs to be visible
+ * before doing these things.
+ */
+ smp_mb__after_atomic();
+
+ sched_ttwu_pending();
+ schedule_preempt_disabled();
+ }
+}
+
+void cpu_startup_entry(enum cpuhp_state state)
+{
+ /*
+ * This #ifdef needs to die, but it's too late in the cycle to
+ * make this generic (arm and sh have never invoked the canary
+ * init for the non boot cpus!). Will be fixed in 3.11
+ */
+#ifdef CONFIG_X86
+ /*
+ * If we're the non-boot CPU, nothing set the stack canary up
+ * for us. The boot CPU already has it initialized but no harm
+ * in doing it again. This is a good place for updating it, as
+ * we wont ever return from this function (so the invalid
+ * canaries already on the stack wont ever trigger).
+ */
+ boot_init_stack_canary();
+#endif
+ arch_cpu_idle_prepare();
+ cpu_idle_loop();
+}
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 91b4c957f28..879f2b75266 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -4,16 +4,17 @@
* idle-task scheduling class.
*
* (NOTE: these are not related to SCHED_IDLE tasks which are
- * handled in sched_fair.c)
+ * handled in sched/fair.c)
*/
#ifdef CONFIG_SMP
static int
-select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
+select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
#endif /* CONFIG_SMP */
+
/*
* Idle tasks are unconditionally rescheduled:
*/
@@ -22,10 +23,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
resched_task(rq->idle);
}
-static struct task_struct *pick_next_task_idle(struct rq *rq)
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev)
{
+ put_prev_task(rq, prev);
+
schedstat_inc(rq, sched_goidle);
- calc_load_account_idle(rq);
return rq->idle;
}
@@ -44,6 +47,8 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
+ idle_exit_fair(rq);
+ rq_last_tick_reset(rq);
}
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
new file mode 100644
index 00000000000..16f5a30f9c8
--- /dev/null
+++ b/kernel/sched/proc.c
@@ -0,0 +1,591 @@
+/*
+ * kernel/sched/proc.c
+ *
+ * Kernel load calculations, forked from sched/core.c
+ */
+
+#include <linux/export.h>
+
+#include "sched.h"
+
+unsigned long this_cpu_load(void)
+{
+ struct rq *this = this_rq();
+ return this->cpu_load[0];
+}
+
+
+/*
+ * Global load-average calculations
+ *
+ * We take a distributed and async approach to calculating the global load-avg
+ * in order to minimize overhead.
+ *
+ * The global load average is an exponentially decaying average of nr_running +
+ * nr_uninterruptible.
+ *
+ * Once every LOAD_FREQ:
+ *
+ * nr_active = 0;
+ * for_each_possible_cpu(cpu)
+ * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ *
+ * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
+ *
+ * Due to a number of reasons the above turns in the mess below:
+ *
+ * - for_each_possible_cpu() is prohibitively expensive on machines with
+ * serious number of cpus, therefore we need to take a distributed approach
+ * to calculating nr_active.
+ *
+ * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
+ * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
+ *
+ * So assuming nr_active := 0 when we start out -- true per definition, we
+ * can simply take per-cpu deltas and fold those into a global accumulate
+ * to obtain the same result. See calc_load_fold_active().
+ *
+ * Furthermore, in order to avoid synchronizing all per-cpu delta folding
+ * across the machine, we assume 10 ticks is sufficient time for every
+ * cpu to have completed this task.
+ *
+ * This places an upper-bound on the IRQ-off latency of the machine. Then
+ * again, being late doesn't loose the delta, just wrecks the sample.
+ *
+ * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
+ * this would add another cross-cpu cacheline miss and atomic operation
+ * to the wakeup path. Instead we increment on whatever cpu the task ran
+ * when it went into uninterruptible state and decrement on whatever cpu
+ * did the wakeup. This means that only the sum of nr_uninterruptible over
+ * all cpus yields the correct result.
+ *
+ * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
+ */
+
+/* Variables and functions for calc_load */
+atomic_long_t calc_load_tasks;
+unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun); /* should be removed */
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
+
+long calc_load_fold_active(struct rq *this_rq)
+{
+ long nr_active, delta = 0;
+
+ nr_active = this_rq->nr_running;
+ nr_active += (long) this_rq->nr_uninterruptible;
+
+ if (nr_active != this_rq->calc_load_active) {
+ delta = nr_active - this_rq->calc_load_active;
+ this_rq->calc_load_active = nr_active;
+ }
+
+ return delta;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+ load *= exp;
+ load += active * (FIXED_1 - exp);
+ load += 1UL << (FSHIFT - 1);
+ return load >> FSHIFT;
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * Handle NO_HZ for the global load-average.
+ *
+ * Since the above described distributed algorithm to compute the global
+ * load-average relies on per-cpu sampling from the tick, it is affected by
+ * NO_HZ.
+ *
+ * The basic idea is to fold the nr_active delta into a global idle-delta upon
+ * entering NO_HZ state such that we can include this as an 'extra' cpu delta
+ * when we read the global state.
+ *
+ * Obviously reality has to ruin such a delightfully simple scheme:
+ *
+ * - When we go NO_HZ idle during the window, we can negate our sample
+ * contribution, causing under-accounting.
+ *
+ * We avoid this by keeping two idle-delta counters and flipping them
+ * when the window starts, thus separating old and new NO_HZ load.
+ *
+ * The only trick is the slight shift in index flip for read vs write.
+ *
+ * 0s 5s 10s 15s
+ * +10 +10 +10 +10
+ * |-|-----------|-|-----------|-|-----------|-|
+ * r:0 0 1 1 0 0 1 1 0
+ * w:0 1 1 0 0 1 1 0 0
+ *
+ * This ensures we'll fold the old idle contribution in this window while
+ * accumlating the new one.
+ *
+ * - When we wake up from NO_HZ idle during the window, we push up our
+ * contribution, since we effectively move our sample point to a known
+ * busy state.
+ *
+ * This is solved by pushing the window forward, and thus skipping the
+ * sample, for this cpu (effectively using the idle-delta for this cpu which
+ * was in effect at the time the window opened). This also solves the issue
+ * of having to deal with a cpu having been in NOHZ idle for multiple
+ * LOAD_FREQ intervals.
+ *
+ * When making the ILB scale, we should try to pull this in as well.
+ */
+static atomic_long_t calc_load_idle[2];
+static int calc_load_idx;
+
+static inline int calc_load_write_idx(void)
+{
+ int idx = calc_load_idx;
+
+ /*
+ * See calc_global_nohz(), if we observe the new index, we also
+ * need to observe the new update time.
+ */
+ smp_rmb();
+
+ /*
+ * If the folding window started, make sure we start writing in the
+ * next idle-delta.
+ */
+ if (!time_before(jiffies, calc_load_update))
+ idx++;
+
+ return idx & 1;
+}
+
+static inline int calc_load_read_idx(void)
+{
+ return calc_load_idx & 1;
+}
+
+void calc_load_enter_idle(void)
+{
+ struct rq *this_rq = this_rq();
+ long delta;
+
+ /*
+ * We're going into NOHZ mode, if there's any pending delta, fold it
+ * into the pending idle delta.
+ */
+ delta = calc_load_fold_active(this_rq);
+ if (delta) {
+ int idx = calc_load_write_idx();
+ atomic_long_add(delta, &calc_load_idle[idx]);
+ }
+}
+
+void calc_load_exit_idle(void)
+{
+ struct rq *this_rq = this_rq();
+
+ /*
+ * If we're still before the sample window, we're done.
+ */
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
+
+ /*
+ * We woke inside or after the sample window, this means we're already
+ * accounted through the nohz accounting, so skip the entire deal and
+ * sync up for the next window.
+ */
+ this_rq->calc_load_update = calc_load_update;
+ if (time_before(jiffies, this_rq->calc_load_update + 10))
+ this_rq->calc_load_update += LOAD_FREQ;
+}
+
+static long calc_load_fold_idle(void)
+{
+ int idx = calc_load_read_idx();
+ long delta = 0;
+
+ if (atomic_long_read(&calc_load_idle[idx]))
+ delta = atomic_long_xchg(&calc_load_idle[idx], 0);
+
+ return delta;
+}
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+ unsigned long result = 1UL << frac_bits;
+
+ if (n) for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
+ }
+
+ return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ * = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ * ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ * = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ * n 1 - x^(n+1)
+ * S_n := \Sum x^i = -------------
+ * i=0 1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+ unsigned long active, unsigned int n)
+{
+
+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(void)
+{
+ long delta, active, n;
+
+ if (!time_before(jiffies, calc_load_update + 10)) {
+ /*
+ * Catch-up, fold however many we are behind still
+ */
+ delta = jiffies - calc_load_update - 10;
+ n = 1 + (delta / LOAD_FREQ);
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+ calc_load_update += n * LOAD_FREQ;
+ }
+
+ /*
+ * Flip the idle index...
+ *
+ * Make sure we first write the new time then flip the index, so that
+ * calc_load_write_idx() will see the new time when it reads the new
+ * index, this avoids a double flip messing things up.
+ */
+ smp_wmb();
+ calc_load_idx++;
+}
+#else /* !CONFIG_NO_HZ_COMMON */
+
+static inline long calc_load_fold_idle(void) { return 0; }
+static inline void calc_global_nohz(void) { }
+
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(unsigned long ticks)
+{
+ long active, delta;
+
+ if (time_before(jiffies, calc_load_update + 10))
+ return;
+
+ /*
+ * Fold the 'old' idle-delta to include all NO_HZ cpus.
+ */
+ delta = calc_load_fold_idle();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+ calc_load_update += LOAD_FREQ;
+
+ /*
+ * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
+ */
+ calc_global_nohz();
+}
+
+/*
+ * Called from update_cpu_load() to periodically update this CPU's
+ * active count.
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+ long delta;
+
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
+
+ delta = calc_load_fold_active(this_rq);
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
+ this_rq->calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * End of global load-average stuff
+ */
+
+/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT 7
+static const unsigned char
+ degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+ degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ {0, 0, 0, 0, 0, 0, 0, 0},
+ {64, 32, 8, 0, 0, 0, 0, 0},
+ {96, 72, 40, 12, 1, 0, 0},
+ {112, 98, 75, 43, 15, 1, 0},
+ {120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+ int j = 0;
+
+ if (!missed_updates)
+ return load;
+
+ if (missed_updates >= degrade_zero_ticks[idx])
+ return 0;
+
+ if (idx == 1)
+ return load >> missed_updates;
+
+ while (missed_updates) {
+ if (missed_updates % 2)
+ load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+ missed_updates >>= 1;
+ j++;
+ }
+ return load;
+}
+
+/*
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
+ */
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+ unsigned long pending_updates)
+{
+ int i, scale;
+
+ this_rq->nr_load_updates++;
+
+ /* Update our load: */
+ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ unsigned long old_load, new_load;
+
+ /* scale is effectively 1 << i now, and >> i divides by scale */
+
+ old_load = this_rq->cpu_load[i];
+ old_load = decay_load_missed(old_load, pending_updates - 1, i);
+ new_load = this_load;
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (new_load > old_load)
+ new_load += scale - 1;
+
+ this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+ }
+
+ sched_avg_update(this_rq);
+}
+
+#ifdef CONFIG_SMP
+static inline unsigned long get_rq_runnable_load(struct rq *rq)
+{
+ return rq->cfs.runnable_load_avg;
+}
+#else
+static inline unsigned long get_rq_runnable_load(struct rq *rq)
+{
+ return rq->load.weight;
+}
+#endif
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+void update_idle_cpu_load(struct rq *this_rq)
+{
+ unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+ unsigned long load = get_rq_runnable_load(this_rq);
+ unsigned long pending_updates;
+
+ /*
+ * bail if there's load or we're actually up-to-date.
+ */
+ if (load || curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ this_rq->last_load_update_tick = curr_jiffies;
+
+ __update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+ struct rq *this_rq = this_rq();
+ unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+ unsigned long pending_updates;
+
+ if (curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ raw_spin_lock(&this_rq->lock);
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ if (pending_updates) {
+ this_rq->last_load_update_tick = curr_jiffies;
+ /*
+ * We were idle, this means load 0, the current load might be
+ * !0 due to remote wakeups and the sort.
+ */
+ __update_cpu_load(this_rq, 0, pending_updates);
+ }
+ raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
+ * Called from scheduler_tick()
+ */
+void update_cpu_load_active(struct rq *this_rq)
+{
+ unsigned long load = get_rq_runnable_load(this_rq);
+ /*
+ * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+ */
+ this_rq->last_load_update_tick = jiffies;
+ __update_cpu_load(this_rq, load, 1);
+
+ calc_load_account_active(this_rq);
+}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f42ae7fb5ec..a49083192c6 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,8 @@
#include <linux/slab.h>
+int sched_rr_timeslice = RR_TIMESLICE;
+
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
struct rt_bandwidth def_rt_bandwidth;
@@ -77,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
#endif
+ /* We start is dequeued state, because no RT tasks are queued */
+ rt_rq->rt_queued = 0;
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
@@ -110,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
return rt_se->rt_rq;
}
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = rt_se->rt_rq;
+
+ return rt_rq->rq;
+}
+
void free_rt_sched_group(struct task_group *tg)
{
int i;
@@ -209,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
return container_of(rt_rq, struct rq, rt);
}
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
struct task_struct *p = rt_task_of(rt_se);
- struct rq *rq = task_rq(p);
+
+ return task_rq(p);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ struct rq *rq = rq_of_rt_se(rt_se);
return &rq->rt;
}
@@ -227,6 +244,14 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
#ifdef CONFIG_SMP
+static int pull_rt_task(struct rq *this_rq);
+
+static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
+{
+ /* Try to pull RT tasks here if we lower this rq's prio */
+ return rq->rt.highest_prio.curr > prev->prio;
+}
+
static inline int rt_overloaded(struct rq *rq)
{
return atomic_read(&rq->rd->rto_count);
@@ -244,8 +269,10 @@ static inline void rt_set_overload(struct rq *rq)
* if we should look at the mask. It would be a shame
* if we looked at the mask, but the mask was not
* updated yet.
+ *
+ * Matched by the barrier in pull_rt_task().
*/
- wmb();
+ smp_wmb();
atomic_inc(&rq->rd->rto_count);
}
@@ -274,13 +301,16 @@ static void update_rt_migration(struct rt_rq *rt_rq)
static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
+ struct task_struct *p;
+
if (!rt_entity_is_task(rt_se))
return;
+ p = rt_task_of(rt_se);
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total++;
- if (rt_se->nr_cpus_allowed > 1)
+ if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory++;
update_rt_migration(rt_rq);
@@ -288,13 +318,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
+ struct task_struct *p;
+
if (!rt_entity_is_task(rt_se))
return;
+ p = rt_task_of(rt_se);
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total--;
- if (rt_se->nr_cpus_allowed > 1)
+ if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory--;
update_rt_migration(rt_rq);
@@ -305,6 +338,15 @@ static inline int has_pushable_tasks(struct rq *rq)
return !plist_head_empty(&rq->rt.pushable_tasks);
}
+static inline void set_post_schedule(struct rq *rq)
+{
+ /*
+ * We detect this state here so that we can avoid taking the RQ
+ * lock again later if there is no need to push
+ */
+ rq->post_schedule = has_pushable_tasks(rq);
+}
+
static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
@@ -349,8 +391,24 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}
+static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
+{
+ return false;
+}
+
+static inline int pull_rt_task(struct rq *this_rq)
+{
+ return 0;
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+}
#endif /* CONFIG_SMP */
+static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
+static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
+
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
return !list_empty(&rt_se->run_list);
@@ -391,20 +449,6 @@ static inline struct task_group *next_task_group(struct task_group *tg)
(iter = next_task_group(iter)) && \
(rt_rq = iter->rt_rq[cpu_of(rq)]);)
-static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
-{
- list_add_rcu(&rt_rq->leaf_rt_rq_list,
- &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
-}
-
-static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
-{
- list_del_rcu(&rt_rq->leaf_rt_rq_list);
-}
-
-#define for_each_leaf_rt_rq(rt_rq, rq) \
- list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
-
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = rt_se->parent)
@@ -426,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
if (rt_rq->rt_nr_running) {
- if (rt_se && !on_rt_rq(rt_se))
+ if (!rt_se)
+ enqueue_top_rt_rq(rt_rq);
+ else if (!on_rt_rq(rt_se))
enqueue_rt_entity(rt_se, false);
+
if (rt_rq->highest_prio.curr < curr->prio)
resched_task(curr);
}
@@ -440,7 +487,9 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
- if (rt_se && on_rt_rq(rt_se))
+ if (!rt_se)
+ dequeue_top_rt_rq(rt_rq);
+ else if (on_rt_rq(rt_se))
dequeue_rt_entity(rt_se);
}
@@ -464,7 +513,7 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
- return cpu_rq(smp_processor_id())->rd->span;
+ return this_rq()->rd->span;
}
#else
static inline const struct cpumask *sched_rt_period_mask(void)
@@ -501,17 +550,6 @@ typedef struct rt_rq *rt_rq_iter_t;
#define for_each_rt_rq(rt_rq, iter, rq) \
for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
-{
-}
-
-static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
-{
-}
-
-#define for_each_leaf_rt_rq(rt_rq, rq) \
- for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = NULL)
@@ -522,12 +560,18 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
- if (rt_rq->rt_nr_running)
- resched_task(rq_of_rt_rq(rt_rq)->curr);
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ if (!rt_rq->rt_nr_running)
+ return;
+
+ enqueue_top_rt_rq(rt_rq);
+ resched_task(rq->curr);
}
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
+ dequeue_top_rt_rq(rt_rq);
}
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -553,6 +597,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
#endif /* CONFIG_RT_GROUP_SCHED */
+bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
+{
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+ return (hrtimer_active(&rt_b->rt_period_timer) ||
+ rt_rq->rt_time < rt_b->rt_runtime);
+}
+
#ifdef CONFIG_SMP
/*
* We ran out of runtime, see if we can borrow some from our neighbours.
@@ -560,7 +612,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
static int do_balance_runtime(struct rt_rq *rt_rq)
{
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
- struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
int i, weight, more = 0;
u64 rt_period;
@@ -685,20 +737,12 @@ balanced:
* runtime - in which case borrowing doesn't make sense.
*/
rt_rq->rt_runtime = RUNTIME_INF;
+ rt_rq->rt_throttled = 0;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
}
-static void disable_runtime(struct rq *rq)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- __disable_runtime(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
static void __enable_runtime(struct rq *rq)
{
rt_rq_iter_t iter;
@@ -723,37 +767,6 @@ static void __enable_runtime(struct rq *rq)
}
}
-static void enable_runtime(struct rq *rq)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- __enable_runtime(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- int cpu = (int)(long)hcpu;
-
- switch (action) {
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- disable_runtime(cpu_rq(cpu));
- return NOTIFY_OK;
-
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- enable_runtime(cpu_rq(cpu));
- return NOTIFY_OK;
-
- default:
- return NOTIFY_DONE;
- }
-}
-
static int balance_runtime(struct rt_rq *rt_rq)
{
int more = 0;
@@ -778,13 +791,23 @@ static inline int balance_runtime(struct rt_rq *rt_rq)
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
- int i, idle = 1;
+ int i, idle = 1, throttled = 0;
const struct cpumask *span;
- if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
- return 1;
-
span = sched_rt_period_mask();
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * FIXME: isolated CPUs should really leave the root task group,
+ * whether they are isolcpus or were isolated via cpusets, lest
+ * the timer run on a CPU which does not service all runqueues,
+ * potentially leaving other CPUs indefinitely throttled. If
+ * isolation is really required, the user will turn the throttle
+ * off to kill the perturbations it causes anyway. Meanwhile,
+ * this maintains functionality for boot and/or troubleshooting.
+ */
+ if (rt_b == &root_task_group.rt_bandwidth)
+ span = cpu_online_mask;
+#endif
for_each_cpu(i, span) {
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
@@ -818,12 +841,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (!rt_rq_throttled(rt_rq))
enqueue = 1;
}
+ if (rt_rq->rt_throttled)
+ throttled = 1;
if (enqueue)
sched_rt_rq_enqueue(rt_rq);
raw_spin_unlock(&rq->lock);
}
+ if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
+ return 1;
+
return idle;
}
@@ -855,8 +883,24 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
return 0;
if (rt_rq->rt_time > runtime) {
- rt_rq->rt_throttled = 1;
- printk_once(KERN_WARNING "sched: RT throttling activated\n");
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+ /*
+ * Don't actually throttle groups that have no runtime assigned
+ * but accrue some time due to boosting.
+ */
+ if (likely(rt_b->rt_runtime)) {
+ rt_rq->rt_throttled = 1;
+ printk_deferred_once("sched: RT throttling activated\n");
+ } else {
+ /*
+ * In case we did anyway, make it go away,
+ * replenishment is a joke, since it will replenish us
+ * with exactly 0 ns.
+ */
+ rt_rq->rt_time = 0;
+ }
+
if (rt_rq_throttled(rt_rq)) {
sched_rt_rq_dequeue(rt_rq);
return 1;
@@ -874,22 +918,22 @@ static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
- struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
u64 delta_exec;
if (curr->sched_class != &rt_sched_class)
return;
- delta_exec = rq->clock_task - curr->se.exec_start;
- if (unlikely((s64)delta_exec < 0))
- delta_exec = 0;
+ delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+ if (unlikely((s64)delta_exec <= 0))
+ return;
- schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
+ schedstat_set(curr->se.statistics.exec_max,
+ max(curr->se.statistics.exec_max, delta_exec));
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
- curr->se.exec_start = rq->clock_task;
+ curr->se.exec_start = rq_clock_task(rq);
cpuacct_charge(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
@@ -898,7 +942,7 @@ static void update_curr_rt(struct rq *rq)
return;
for_each_sched_rt_entity(rt_se) {
- rt_rq = rt_rq_of_se(rt_se);
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
@@ -910,6 +954,38 @@ static void update_curr_rt(struct rq *rq)
}
}
+static void
+dequeue_top_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ BUG_ON(&rq->rt != rt_rq);
+
+ if (!rt_rq->rt_queued)
+ return;
+
+ BUG_ON(!rq->nr_running);
+
+ sub_nr_running(rq, rt_rq->rt_nr_running);
+ rt_rq->rt_queued = 0;
+}
+
+static void
+enqueue_top_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ BUG_ON(&rq->rt != rt_rq);
+
+ if (rt_rq->rt_queued)
+ return;
+ if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
+ return;
+
+ add_nr_running(rq, rt_rq->rt_nr_running);
+ rt_rq->rt_queued = 1;
+}
+
#if defined CONFIG_SMP
static void
@@ -917,6 +993,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Change rq's cpupri only if rt_rq is the top queue.
+ */
+ if (&rq->rt != rt_rq)
+ return;
+#endif
if (rq->online && prio < prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}
@@ -926,6 +1009,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Change rq's cpupri only if rt_rq is the top queue.
+ */
+ if (&rq->rt != rt_rq)
+ return;
+#endif
if (rq->online && rt_rq->highest_prio.curr != prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
@@ -1019,12 +1109,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
#endif /* CONFIG_RT_GROUP_SCHED */
static inline
+unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *group_rq = group_rt_rq(rt_se);
+
+ if (group_rq)
+ return group_rq->rt_nr_running;
+ else
+ return 1;
+}
+
+static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
int prio = rt_se_prio(rt_se);
WARN_ON(!rt_prio(prio));
- rt_rq->rt_nr_running++;
+ rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
inc_rt_prio(rt_rq, prio);
inc_rt_migration(rt_se, rt_rq);
@@ -1036,7 +1137,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
WARN_ON(!rt_rq->rt_nr_running);
- rt_rq->rt_nr_running--;
+ rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
dec_rt_migration(rt_se, rt_rq);
@@ -1059,9 +1160,6 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
return;
- if (!rt_rq->rt_nr_running)
- list_add_leaf_rt_rq(rt_rq);
-
if (head)
list_add(&rt_se->run_list, queue);
else
@@ -1081,8 +1179,6 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
__clear_bit(rt_se_prio(rt_se), array->bitmap);
dec_rt_tasks(rt_se, rt_rq);
- if (!rt_rq->rt_nr_running)
- list_del_leaf_rt_rq(rt_rq);
}
/*
@@ -1098,6 +1194,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
back = rt_se;
}
+ dequeue_top_rt_rq(rt_rq_of_se(back));
+
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
__dequeue_rt_entity(rt_se);
@@ -1106,13 +1204,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
+ struct rq *rq = rq_of_rt_se(rt_se);
+
dequeue_rt_stack(rt_se);
for_each_sched_rt_entity(rt_se)
__enqueue_rt_entity(rt_se, head);
+ enqueue_top_rt_rq(&rq->rt);
}
static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
+ struct rq *rq = rq_of_rt_se(rt_se);
+
dequeue_rt_stack(rt_se);
for_each_sched_rt_entity(rt_se) {
@@ -1121,6 +1224,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
if (rt_rq && rt_rq->rt_nr_running)
__enqueue_rt_entity(rt_se, false);
}
+ enqueue_top_rt_rq(&rq->rt);
}
/*
@@ -1136,10 +1240,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
- if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
-
- inc_nr_running(rq);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1150,8 +1252,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
dequeue_rt_entity(rt_se);
dequeue_pushable_task(rq, p);
-
- dec_nr_running(rq);
}
/*
@@ -1192,15 +1292,12 @@ static void yield_task_rt(struct rq *rq)
static int find_lowest_rq(struct task_struct *task);
static int
-select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
{
struct task_struct *curr;
struct rq *rq;
- int cpu;
-
- cpu = task_cpu(p);
- if (p->rt.nr_cpus_allowed == 1)
+ if (p->nr_cpus_allowed == 1)
goto out;
/* For anything but wake ups, just return the task_cpu */
@@ -1235,9 +1332,8 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
* will have to sort it out.
*/
if (curr && unlikely(rt_task(curr)) &&
- (curr->rt.nr_cpus_allowed < 2 ||
- curr->prio <= p->prio) &&
- (p->rt.nr_cpus_allowed > 1)) {
+ (curr->nr_cpus_allowed < 2 ||
+ curr->prio <= p->prio)) {
int target = find_lowest_rq(p);
if (target != -1)
@@ -1251,10 +1347,10 @@ out:
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
- if (rq->curr->rt.nr_cpus_allowed == 1)
+ if (rq->curr->nr_cpus_allowed == 1)
return;
- if (p->rt.nr_cpus_allowed != 1
+ if (p->nr_cpus_allowed != 1
&& cpupri_find(&rq->rd->cpupri, p, NULL))
return;
@@ -1321,15 +1417,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
struct sched_rt_entity *rt_se;
struct task_struct *p;
- struct rt_rq *rt_rq;
-
- rt_rq = &rq->rt;
-
- if (!rt_rq->rt_nr_running)
- return NULL;
-
- if (rt_rq_throttled(rt_rq))
- return NULL;
+ struct rt_rq *rt_rq = &rq->rt;
do {
rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1338,26 +1426,48 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
} while (rt_rq);
p = rt_task_of(rt_se);
- p->se.exec_start = rq->clock_task;
+ p->se.exec_start = rq_clock_task(rq);
return p;
}
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *
+pick_next_task_rt(struct rq *rq, struct task_struct *prev)
{
- struct task_struct *p = _pick_next_task_rt(rq);
+ struct task_struct *p;
+ struct rt_rq *rt_rq = &rq->rt;
+
+ if (need_pull_rt_task(rq, prev)) {
+ pull_rt_task(rq);
+ /*
+ * pull_rt_task() can drop (and re-acquire) rq->lock; this
+ * means a dl or stop task can slip in, in which case we need
+ * to re-start task selection.
+ */
+ if (unlikely((rq->stop && rq->stop->on_rq) ||
+ rq->dl.dl_nr_running))
+ return RETRY_TASK;
+ }
+
+ /*
+ * We may dequeue prev's rt_rq in put_prev_task().
+ * So, we update time before rt_nr_running check.
+ */
+ if (prev->sched_class == &rt_sched_class)
+ update_curr_rt(rq);
+
+ if (!rt_rq->rt_queued)
+ return NULL;
+
+ put_prev_task(rq, prev);
+
+ p = _pick_next_task_rt(rq);
/* The running task is never eligible for pushing */
if (p)
dequeue_pushable_task(rq, p);
-#ifdef CONFIG_SMP
- /*
- * We detect this state here so that we can avoid taking the RQ
- * lock again later if there is no need to push
- */
- rq->post_schedule = has_pushable_tasks(rq);
-#endif
+ set_post_schedule(rq);
return p;
}
@@ -1370,7 +1480,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
* The previous task needs to be made eligible for pushing
* if it is still active
*/
- if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
+ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
@@ -1382,48 +1492,29 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
- (p->rt.nr_cpus_allowed > 1))
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
return 1;
return 0;
}
-/* Return the second highest RT task, NULL otherwise */
-static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
+/*
+ * Return the highest pushable rq's task, which is suitable to be executed
+ * on the cpu, NULL otherwise
+ */
+static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
{
- struct task_struct *next = NULL;
- struct sched_rt_entity *rt_se;
- struct rt_prio_array *array;
- struct rt_rq *rt_rq;
- int idx;
-
- for_each_leaf_rt_rq(rt_rq, rq) {
- array = &rt_rq->active;
- idx = sched_find_first_bit(array->bitmap);
-next_idx:
- if (idx >= MAX_RT_PRIO)
- continue;
- if (next && next->prio < idx)
- continue;
- list_for_each_entry(rt_se, array->queue + idx, run_list) {
- struct task_struct *p;
+ struct plist_head *head = &rq->rt.pushable_tasks;
+ struct task_struct *p;
- if (!rt_entity_is_task(rt_se))
- continue;
+ if (!has_pushable_tasks(rq))
+ return NULL;
- p = rt_task_of(rt_se);
- if (pick_rt_task(rq, p, cpu)) {
- next = p;
- break;
- }
- }
- if (!next) {
- idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
- goto next_idx;
- }
+ plist_for_each_entry(p, head, pushable_tasks) {
+ if (pick_rt_task(rq, p, cpu))
+ return p;
}
- return next;
+ return NULL;
}
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
@@ -1439,7 +1530,7 @@ static int find_lowest_rq(struct task_struct *task)
if (unlikely(!lowest_mask))
return -1;
- if (task->rt.nr_cpus_allowed == 1)
+ if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1531,7 +1622,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
task_running(rq, task) ||
!task->on_rq)) {
- raw_spin_unlock(&lowest_rq->lock);
+ double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
break;
}
@@ -1561,7 +1652,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
- BUG_ON(p->rt.nr_cpus_allowed <= 1);
+ BUG_ON(p->nr_cpus_allowed <= 1);
BUG_ON(!p->on_rq);
BUG_ON(!rt_task(p));
@@ -1587,11 +1678,6 @@ static int push_rt_task(struct rq *rq)
if (!next_task)
return 0;
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- if (unlikely(task_running(rq, next_task)))
- return 0;
-#endif
-
retry:
if (unlikely(next_task == rq->curr)) {
WARN_ON(1);
@@ -1677,6 +1763,12 @@ static int pull_rt_task(struct rq *this_rq)
if (likely(!rt_overloaded(this_rq)))
return 0;
+ /*
+ * Match the barrier from rt_set_overloaded; this guarantees that if we
+ * see overloaded we must also see the rto_mask bit.
+ */
+ smp_rmb();
+
for_each_cpu(cpu, this_rq->rd->rto_mask) {
if (this_cpu == cpu)
continue;
@@ -1702,12 +1794,10 @@ static int pull_rt_task(struct rq *this_rq)
double_lock_balance(this_rq, src_rq);
/*
- * Are there still pullable RT tasks?
+ * We can pull only a task, which is pushable
+ * on its rq, and no others.
*/
- if (src_rq->rt.rt_nr_running <= 1)
- goto skip;
-
- p = pick_next_highest_task_rt(src_rq, this_cpu);
+ p = pick_highest_pushable_task(src_rq, this_cpu);
/*
* Do we have an RT task that preempts
@@ -1747,13 +1837,6 @@ skip:
return ret;
}
-static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
-{
- /* Try to pull RT tasks here if we lower this rq's prio */
- if (rq->rt.highest_prio.curr > prev->prio)
- pull_rt_task(rq);
-}
-
static void post_schedule_rt(struct rq *rq)
{
push_rt_tasks(rq);
@@ -1768,9 +1851,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
has_pushable_tasks(rq) &&
- p->rt.nr_cpus_allowed > 1 &&
- rt_task(rq->curr) &&
- (rq->curr->rt.nr_cpus_allowed < 2 ||
+ p->nr_cpus_allowed > 1 &&
+ (dl_task(rq->curr) || rt_task(rq->curr)) &&
+ (rq->curr->nr_cpus_allowed < 2 ||
rq->curr->prio <= p->prio))
push_rt_tasks(rq);
}
@@ -1778,44 +1861,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
static void set_cpus_allowed_rt(struct task_struct *p,
const struct cpumask *new_mask)
{
- int weight = cpumask_weight(new_mask);
+ struct rq *rq;
+ int weight;
BUG_ON(!rt_task(p));
- /*
- * Update the migration status of the RQ if we have an RT task
- * which is running AND changing its weight value.
- */
- if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
- struct rq *rq = task_rq(p);
-
- if (!task_current(rq, p)) {
- /*
- * Make sure we dequeue this task from the pushable list
- * before going further. It will either remain off of
- * the list because we are no longer pushable, or it
- * will be requeued.
- */
- if (p->rt.nr_cpus_allowed > 1)
- dequeue_pushable_task(rq, p);
+ if (!p->on_rq)
+ return;
- /*
- * Requeue if our weight is changing and still > 1
- */
- if (weight > 1)
- enqueue_pushable_task(rq, p);
+ weight = cpumask_weight(new_mask);
- }
+ /*
+ * Only update if the process changes its state from whether it
+ * can migrate or not.
+ */
+ if ((p->nr_cpus_allowed > 1) == (weight > 1))
+ return;
- if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
- rq->rt.rt_nr_migratory++;
- } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
- BUG_ON(!rq->rt.rt_nr_migratory);
- rq->rt.rt_nr_migratory--;
- }
+ rq = task_rq(p);
- update_rt_migration(&rq->rt);
+ /*
+ * The process used to be able to migrate OR it can now migrate
+ */
+ if (weight <= 1) {
+ if (!task_current(rq, p))
+ dequeue_pushable_task(rq, p);
+ BUG_ON(!rq->rt.rt_nr_migratory);
+ rq->rt.rt_nr_migratory--;
+ } else {
+ if (!task_current(rq, p))
+ enqueue_pushable_task(rq, p);
+ rq->rt.rt_nr_migratory++;
}
+
+ update_rt_migration(&rq->rt);
}
/* Assumes rq->lock is held */
@@ -1853,11 +1932,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
* we may need to handle the pulling of RT tasks
* now.
*/
- if (p->on_rq && !rq->rt.rt_nr_running)
- pull_rt_task(rq);
+ if (!p->on_rq || rq->rt.rt_nr_running)
+ return;
+
+ if (pull_rt_task(rq))
+ resched_task(rq->curr);
}
-void init_sched_rt_class(void)
+void __init init_sched_rt_class(void)
{
unsigned int i;
@@ -1886,9 +1968,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (p->on_rq && rq->curr != p) {
#ifdef CONFIG_SMP
- if (rq->rt.overloaded && push_rt_task(rq) &&
+ if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
/* Don't resched if we changed runqueues */
- rq != task_rq(p))
+ push_rt_task(rq) && rq != task_rq(p))
check_resched = 0;
#endif /* CONFIG_SMP */
if (check_resched && p->prio < rq->curr->prio)
@@ -1949,7 +2031,11 @@ static void watchdog(struct rq *rq, struct task_struct *p)
if (soft != RLIM_INFINITY) {
unsigned long next;
- p->rt.timeout++;
+ if (p->rt.watchdog_stamp != jiffies) {
+ p->rt.timeout++;
+ p->rt.watchdog_stamp = jiffies;
+ }
+
next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
if (p->rt.timeout > next)
p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
@@ -1958,6 +2044,8 @@ static void watchdog(struct rq *rq, struct task_struct *p)
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
+ struct sched_rt_entity *rt_se = &p->rt;
+
update_curr_rt(rq);
watchdog(rq, p);
@@ -1972,15 +2060,18 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
if (--p->rt.time_slice)
return;
- p->rt.time_slice = DEF_TIMESLICE;
+ p->rt.time_slice = sched_rr_timeslice;
/*
- * Requeue to the end of queue if we are not the only element
- * on the queue:
+ * Requeue to the end of queue if we (and all of our ancestors) are not
+ * the only element on the queue
*/
- if (p->rt.run_list.prev != p->rt.run_list.next) {
- requeue_task_rt(rq, p, 0);
- set_tsk_need_resched(p);
+ for_each_sched_rt_entity(rt_se) {
+ if (rt_se->run_list.prev != rt_se->run_list.next) {
+ requeue_task_rt(rq, p, 0);
+ set_tsk_need_resched(p);
+ return;
+ }
}
}
@@ -1988,7 +2079,7 @@ static void set_curr_task_rt(struct rq *rq)
{
struct task_struct *p = rq->curr;
- p->se.exec_start = rq->clock_task;
+ p->se.exec_start = rq_clock_task(rq);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
@@ -2000,7 +2091,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
* Time slice is 0 for SCHED_FIFO tasks
*/
if (task->policy == SCHED_RR)
- return DEF_TIMESLICE;
+ return sched_rr_timeslice;
else
return 0;
}
@@ -2022,7 +2113,6 @@ const struct sched_class rt_sched_class = {
.set_cpus_allowed = set_cpus_allowed_rt,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
- .pre_schedule = pre_schedule_rt,
.post_schedule = post_schedule_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 98c0c2623db..31cc02ebc54 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,57 +1,90 @@
#include <linux/sched.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/deadline.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
+#include <linux/tick.h>
+#include <linux/slab.h>
#include "cpupri.h"
+#include "cpudeadline.h"
+#include "cpuacct.h"
+
+struct rq;
extern __read_mostly int scheduler_running;
-/*
- * Convert user-nice values [ -20 ... 0 ... 19 ]
- * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
- * and back.
- */
-#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
-#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
-#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
+extern unsigned long calc_load_update;
+extern atomic_long_t calc_load_tasks;
-/*
- * 'User priority' is the nice value converted to something we
- * can work with better when scaling various scheduler parameters,
- * it's a [ 0 ... 39 ] range.
- */
-#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
-#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
-#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
+extern long calc_load_fold_active(struct rq *this_rq);
+extern void update_cpu_load_active(struct rq *this_rq);
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
+/*
+ * Increase resolution of nice-level calculations for 64-bit architectures.
+ * The extra resolution improves shares distribution and load balancing of
+ * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
+ * hierarchies, especially on larger systems. This is not a user-visible change
+ * and does not change the user-interface for setting shares/weights.
+ *
+ * We increase resolution only if we have enough bits to allow this increased
+ * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
+ * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
+ * increased costs.
+ */
+#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
+# define SCHED_LOAD_RESOLUTION 10
+# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
+# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
+#else
+# define SCHED_LOAD_RESOLUTION 0
+# define scale_load(w) (w)
+# define scale_load_down(w) (w)
+#endif
+
+#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
+#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
+
#define NICE_0_LOAD SCHED_LOAD_SCALE
#define NICE_0_SHIFT SCHED_LOAD_SHIFT
/*
+ * Single value that decides SCHED_DEADLINE internal math precision.
+ * 10 -> just above 1us
+ * 9 -> just above 0.5us
+ */
+#define DL_SCALE (10)
+
+/*
* These are the 'tuning knobs' of the scheduler:
- *
- * default timeslice is 100 msecs (used only for SCHED_RR tasks).
- * Timeslices get refilled after they expire.
*/
-#define DEF_TIMESLICE (100 * HZ / 1000)
/*
* single value that denotes runtime == period, ie unlimited time.
*/
#define RUNTIME_INF ((u64)~0ULL)
+static inline int fair_policy(int policy)
+{
+ return policy == SCHED_NORMAL || policy == SCHED_BATCH;
+}
+
static inline int rt_policy(int policy)
{
- if (policy == SCHED_FIFO || policy == SCHED_RR)
- return 1;
- return 0;
+ return policy == SCHED_FIFO || policy == SCHED_RR;
+}
+
+static inline int dl_policy(int policy)
+{
+ return policy == SCHED_DEADLINE;
}
static inline int task_has_rt_policy(struct task_struct *p)
@@ -59,6 +92,25 @@ static inline int task_has_rt_policy(struct task_struct *p)
return rt_policy(p->policy);
}
+static inline int task_has_dl_policy(struct task_struct *p)
+{
+ return dl_policy(p->policy);
+}
+
+static inline bool dl_time_before(u64 a, u64 b)
+{
+ return (s64)(a - b) < 0;
+}
+
+/*
+ * Tells if entity @a should preempt entity @b.
+ */
+static inline bool
+dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
+{
+ return dl_time_before(a->deadline, b->deadline);
+}
+
/*
* This is the priority-queue data structure of the RT scheduling class:
*/
@@ -74,6 +126,47 @@ struct rt_bandwidth {
u64 rt_runtime;
struct hrtimer rt_period_timer;
};
+/*
+ * To keep the bandwidth of -deadline tasks and groups under control
+ * we need some place where:
+ * - store the maximum -deadline bandwidth of the system (the group);
+ * - cache the fraction of that bandwidth that is currently allocated.
+ *
+ * This is all done in the data structure below. It is similar to the
+ * one used for RT-throttling (rt_bandwidth), with the main difference
+ * that, since here we are only interested in admission control, we
+ * do not decrease any runtime while the group "executes", neither we
+ * need a timer to replenish it.
+ *
+ * With respect to SMP, the bandwidth is given on a per-CPU basis,
+ * meaning that:
+ * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
+ * - dl_total_bw array contains, in the i-eth element, the currently
+ * allocated bandwidth on the i-eth CPU.
+ * Moreover, groups consume bandwidth on each CPU, while tasks only
+ * consume bandwidth on the CPU they're running on.
+ * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
+ * that will be shown the next time the proc or cgroup controls will
+ * be red. It on its turn can be changed by writing on its own
+ * control.
+ */
+struct dl_bandwidth {
+ raw_spinlock_t dl_runtime_lock;
+ u64 dl_runtime;
+ u64 dl_period;
+};
+
+static inline int dl_bandwidth_enabled(void)
+{
+ return sysctl_sched_rt_runtime >= 0;
+}
+
+extern struct dl_bw *dl_bw_of(int i);
+
+struct dl_bw {
+ raw_spinlock_t lock;
+ u64 bw, total_bw;
+};
extern struct mutex sched_domains_mutex;
@@ -84,7 +177,7 @@ extern struct mutex sched_domains_mutex;
struct cfs_rq;
struct rt_rq;
-static LIST_HEAD(task_groups);
+extern struct list_head task_groups;
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
@@ -115,7 +208,10 @@ struct task_group {
struct cfs_rq **cfs_rq;
unsigned long shares;
- atomic_t load_weight;
+#ifdef CONFIG_SMP
+ atomic_long_t load_avg;
+ atomic_t runnable_avg;
+#endif
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -154,11 +250,6 @@ struct task_group {
#define MAX_SHARES (1UL << 18)
#endif
-/* Default task group.
- * Every task in system belong to this group at bootup.
- */
-extern struct task_group root_task_group;
-
typedef int (*tg_visitor)(struct task_group *, void *);
extern int walk_tg_tree_from(struct task_group *from,
@@ -187,7 +278,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
-extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
extern void free_rt_sched_group(struct task_group *tg);
@@ -196,6 +287,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
struct sched_rt_entity *rt_se, int cpu,
struct sched_rt_entity *parent);
+extern struct task_group *sched_create_group(struct task_group *parent);
+extern void sched_online_group(struct task_group *tg,
+ struct task_group *parent);
+extern void sched_destroy_group(struct task_group *tg);
+extern void sched_offline_group(struct task_group *tg);
+
+extern void sched_move_task(struct task_struct *tsk);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+#endif
+
#else /* CONFIG_CGROUP_SCHED */
struct cfs_bandwidth { };
@@ -205,7 +308,7 @@ struct cfs_bandwidth { };
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
- unsigned long nr_running, h_nr_running;
+ unsigned int nr_running, h_nr_running;
u64 exec_clock;
u64 min_vruntime;
@@ -216,9 +319,6 @@ struct cfs_rq {
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
- struct list_head tasks;
- struct list_head *balance_iterator;
-
/*
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
@@ -229,26 +329,22 @@ struct cfs_rq {
unsigned int nr_spread_over;
#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
- struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
-
- /*
- * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
- * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
- * (like users, containers etc.)
- *
- * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
- * list is used during load balance.
- */
- int on_list;
- struct list_head leaf_cfs_rq_list;
- struct task_group *tg; /* group that "owns" this runqueue */
-
#ifdef CONFIG_SMP
/*
- * the part of load.weight contributed by tasks
+ * CFS Load tracking
+ * Under CFS, load is tracked on a per-entity basis and aggregated up.
+ * This allows for the description of both thread and group usage (in
+ * the FAIR_GROUP_SCHED case).
*/
- unsigned long task_weight;
+ unsigned long runnable_load_avg, blocked_load_avg;
+ atomic64_t decay_counter;
+ u64 last_decay;
+ atomic_long_t removed_load;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /* Required to track per-cpu representation of a task_group */
+ u32 tg_runnable_contrib;
+ unsigned long tg_load_contrib;
/*
* h_load = weight * f(tg)
@@ -257,26 +353,33 @@ struct cfs_rq {
* this group.
*/
unsigned long h_load;
+ u64 last_h_load_update;
+ struct sched_entity *h_load_next;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
/*
- * Maintaining per-cpu shares distribution for group scheduling
+ * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+ * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+ * (like users, containers etc.)
*
- * load_stamp is the last time we updated the load average
- * load_last is the last time we updated the load average and saw load
- * load_unacc_exec_time is currently unaccounted execution time
+ * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
+ * list is used during load balance.
*/
- u64 load_avg;
- u64 load_period;
- u64 load_stamp, load_last, load_unacc_exec_time;
+ int on_list;
+ struct list_head leaf_cfs_rq_list;
+ struct task_group *tg; /* group that "owns" this runqueue */
- unsigned long load_contribution;
-#endif /* CONFIG_SMP */
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
u64 runtime_expires;
s64 runtime_remaining;
- u64 throttled_timestamp;
+ u64 throttled_clock, throttled_clock_task;
+ u64 throttled_clock_task_time;
int throttled, throttle_count;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
@@ -291,7 +394,7 @@ static inline int rt_bandwidth_enabled(void)
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
struct rt_prio_array active;
- unsigned long rt_nr_running;
+ unsigned int rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct {
int curr; /* highest queued rt task prio */
@@ -306,6 +409,8 @@ struct rt_rq {
int overloaded;
struct plist_head pushable_tasks;
#endif
+ int rt_queued;
+
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
@@ -316,11 +421,45 @@ struct rt_rq {
unsigned long rt_nr_boosted;
struct rq *rq;
- struct list_head leaf_rt_rq_list;
struct task_group *tg;
#endif
};
+/* Deadline class' related fields in a runqueue */
+struct dl_rq {
+ /* runqueue is an rbtree, ordered by deadline */
+ struct rb_root rb_root;
+ struct rb_node *rb_leftmost;
+
+ unsigned long dl_nr_running;
+
+#ifdef CONFIG_SMP
+ /*
+ * Deadline values of the currently executing and the
+ * earliest ready task on this rq. Caching these facilitates
+ * the decision wether or not a ready but not running task
+ * should migrate somewhere else.
+ */
+ struct {
+ u64 curr;
+ u64 next;
+ } earliest_dl;
+
+ unsigned long dl_nr_migratory;
+ int overloaded;
+
+ /*
+ * Tasks on this rq that can be pushed away. They are kept in
+ * an rb-tree, ordered by tasks' deadlines, with caching
+ * of the leftmost (earliest deadline) element.
+ */
+ struct rb_root pushable_dl_tasks_root;
+ struct rb_node *pushable_dl_tasks_leftmost;
+#else
+ struct dl_bw dl_bw;
+#endif
+};
+
#ifdef CONFIG_SMP
/*
@@ -339,6 +478,15 @@ struct root_domain {
cpumask_var_t online;
/*
+ * The bit corresponding to a CPU gets set here if such CPU has more
+ * than one runnable -deadline task (as it is below for RT tasks).
+ */
+ cpumask_var_t dlo_mask;
+ atomic_t dlo_count;
+ struct dl_bw dl_bw;
+ struct cpudl cpudl;
+
+ /*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
*/
@@ -365,14 +513,21 @@ struct rq {
* nr_running and cpu_load should be in the same cacheline because
* remote CPUs use both these fields when doing load calculation.
*/
- unsigned long nr_running;
+ unsigned int nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned int nr_numa_running;
+ unsigned int nr_preferred_running;
+#endif
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned long last_load_update_tick;
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
u64 nohz_stamp;
unsigned long nohz_flags;
#endif
+#ifdef CONFIG_NO_HZ_FULL
+ unsigned long last_sched_tick;
+#endif
int skip_clock_update;
/* capture load from *all* tasks on this cpu: */
@@ -382,14 +537,14 @@ struct rq {
struct cfs_rq cfs;
struct rt_rq rt;
+ struct dl_rq dl;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
- struct list_head leaf_rt_rq_list;
-#endif
+
+ struct sched_avg avg;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
* This is part of a global counter where only the total sum
@@ -412,7 +567,7 @@ struct rq {
struct root_domain *rd;
struct sched_domain *sd;
- unsigned long cpu_power;
+ unsigned long cpu_capacity;
unsigned char idle_balance;
/* For active balancing */
@@ -424,10 +579,15 @@ struct rq {
int cpu;
int online;
+ struct list_head cfs_tasks;
+
u64 rt_avg;
u64 age_stamp;
u64 idle_stamp;
u64 avg_idle;
+
+ /* This is used to determine avg_idle's max value */
+ u64 max_idle_balance_cost;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -462,7 +622,6 @@ struct rq {
unsigned int yld_count;
/* schedule() stats */
- unsigned int sched_switch;
unsigned int sched_count;
unsigned int sched_goidle;
@@ -493,8 +652,26 @@ DECLARE_PER_CPU(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() (&__raw_get_cpu_var(runqueues))
+static inline u64 rq_clock(struct rq *rq)
+{
+ return rq->clock;
+}
+
+static inline u64 rq_clock_task(struct rq *rq)
+{
+ return rq->clock_task;
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+extern void sched_setnuma(struct task_struct *p, int node);
+extern int migrate_task_to(struct task_struct *p, int cpu);
+extern int migrate_swap(struct task_struct *, struct task_struct *);
+#endif /* CONFIG_NUMA_BALANCING */
+
#ifdef CONFIG_SMP
+extern void sched_ttwu_pending(void);
+
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
lockdep_is_held(&sched_domains_mutex))
@@ -534,8 +711,87 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
return hsd;
}
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+ struct sched_domain *sd;
+
+ for_each_domain(cpu, sd) {
+ if (sd->flags & flag)
+ break;
+ }
+
+ return sd;
+}
+
DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(struct sched_domain *, sd_numa);
+DECLARE_PER_CPU(struct sched_domain *, sd_busy);
+DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+
+struct sched_group_capacity {
+ atomic_t ref;
+ /*
+ * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
+ * for a single CPU.
+ */
+ unsigned int capacity, capacity_orig;
+ unsigned long next_update;
+ int imbalance; /* XXX unrelated to capacity but shared group state */
+ /*
+ * Number of busy cpus in this group.
+ */
+ atomic_t nr_busy_cpus;
+
+ unsigned long cpumask[0]; /* iteration mask */
+};
+
+struct sched_group {
+ struct sched_group *next; /* Must be a circular list */
+ atomic_t ref;
+
+ unsigned int group_weight;
+ struct sched_group_capacity *sgc;
+
+ /*
+ * The CPUs this group covers.
+ *
+ * NOTE: this field is variable length. (Allocated dynamically
+ * by attaching extra space to the end of the structure,
+ * depending on how many CPUs the kernel has booted up with)
+ */
+ unsigned long cpumask[0];
+};
+
+static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+{
+ return to_cpumask(sg->cpumask);
+}
+
+/*
+ * cpumask masking which cpus in the group are allowed to iterate up the domain
+ * tree.
+ */
+static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+{
+ return to_cpumask(sg->sgc->cpumask);
+}
+
+/**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
+ */
+static inline unsigned int group_first_cpu(struct sched_group *group)
+{
+ return cpumask_first(sched_group_cpus(group));
+}
+
+extern int group_balance_cpu(struct sched_group *sg);
+
+#else
+
+static inline void sched_ttwu_pending(void) { }
#endif /* CONFIG_SMP */
@@ -547,22 +803,19 @@ DECLARE_PER_CPU(int, sd_llc_id);
/*
* Return the group to which this tasks belongs.
*
- * We use task_subsys_state_check() and extend the RCU verification with
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- * task it moves into the cgroup. Therefore by holding either of those locks,
- * we pin the task to the current cgroup.
+ * We cannot use task_css() and friends because the cgroup subsystem
+ * changes that value before the cgroup_subsys::attach() method is called,
+ * therefore we cannot pin it and might observe the wrong value.
+ *
+ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
+ * core changes this before calling sched_move_task().
+ *
+ * Instead we use a 'copy' which is updated from sched_move_task() while
+ * holding both task_struct::pi_lock and rq::lock.
*/
static inline struct task_group *task_group(struct task_struct *p)
{
- struct task_group *tg;
- struct cgroup_subsys_state *css;
-
- css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&task_rq(p)->lock));
- tg = container_of(css, struct task_group, css);
-
- return autogroup_task_group(p, tg);
+ return p->sched_task_group;
}
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -604,6 +857,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
*/
smp_wmb();
task_thread_info(p)->cpu = cpu;
+ p->wake_cpu = cpu;
#endif
}
@@ -611,7 +865,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
* Tunables that become constants when CONFIG_SCHED_DEBUG is off:
*/
#ifdef CONFIG_SCHED_DEBUG
-# include <linux/jump_label.h>
+# include <linux/static_key.h>
# define const_debug __read_mostly
#else
# define const_debug const
@@ -630,18 +884,18 @@ enum {
#undef SCHED_FEAT
#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
-static __always_inline bool static_branch__true(struct jump_label_key *key)
+static __always_inline bool static_branch__true(struct static_key *key)
{
- return likely(static_branch(key)); /* Not out of line branch. */
+ return static_key_true(key); /* Not out of line branch. */
}
-static __always_inline bool static_branch__false(struct jump_label_key *key)
+static __always_inline bool static_branch__false(struct static_key *key)
{
- return unlikely(static_branch(key)); /* Out of line branch. */
+ return static_key_false(key); /* Out of line branch. */
}
#define SCHED_FEAT(name, enabled) \
-static __always_inline bool static_branch_##name(struct jump_label_key *key) \
+static __always_inline bool static_branch_##name(struct static_key *key) \
{ \
return static_branch__##enabled(key); \
}
@@ -650,12 +904,24 @@ static __always_inline bool static_branch_##name(struct jump_label_key *key) \
#undef SCHED_FEAT
-extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR];
+extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
+#ifdef CONFIG_NUMA_BALANCING
+#define sched_feat_numa(x) sched_feat(x)
+#ifdef CONFIG_SCHED_DEBUG
+#define numabalancing_enabled sched_feat_numa(NUMA)
+#else
+extern bool numabalancing_enabled;
+#endif /* CONFIG_SCHED_DEBUG */
+#else
+#define sched_feat_numa(x) (0)
+#define numabalancing_enabled (0)
+#endif /* CONFIG_NUMA_BALANCING */
+
static inline u64 global_rt_period(void)
{
return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
@@ -669,8 +935,6 @@ static inline u64 global_rt_runtime(void)
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
-
-
static inline int task_current(struct rq *rq, struct task_struct *p)
{
return rq->curr == p;
@@ -692,6 +956,9 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
#ifndef finish_arch_switch
# define finish_arch_switch(prev) do { } while (0)
#endif
+#ifndef finish_arch_post_lock_switch
+# define finish_arch_post_lock_switch() do { } while (0)
+#endif
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
@@ -742,11 +1009,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
*/
next->on_cpu = 1;
#endif
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- raw_spin_unlock_irq(&rq->lock);
-#else
raw_spin_unlock(&rq->lock);
-#endif
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
@@ -760,30 +1023,16 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
smp_wmb();
prev->on_cpu = 0;
#endif
-#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
local_irq_enable();
-#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
-
-static inline void update_load_add(struct load_weight *lw, unsigned long inc)
-{
- lw->weight += inc;
- lw->inv_weight = 0;
-}
-
-static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
-{
- lw->weight -= dec;
- lw->inv_weight = 0;
-}
-
-static inline void update_load_set(struct load_weight *lw, unsigned long w)
-{
- lw->weight = w;
- lw->inv_weight = 0;
-}
+/*
+ * wake flags
+ */
+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
+#define WF_FORK 0x02 /* child wakeup after fork */
+#define WF_MIGRATED 0x4 /* internal use, task got migrated */
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -838,20 +1087,85 @@ static const u32 prio_to_wmult[40] = {
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
-/* Time spent by the tasks of the cpu accounting group executing in ... */
-enum cpuacct_stat_index {
- CPUACCT_STAT_USER, /* ... user mode */
- CPUACCT_STAT_SYSTEM, /* ... kernel mode */
+#define ENQUEUE_WAKEUP 1
+#define ENQUEUE_HEAD 2
+#ifdef CONFIG_SMP
+#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
+#else
+#define ENQUEUE_WAKING 0
+#endif
+#define ENQUEUE_REPLENISH 8
+
+#define DEQUEUE_SLEEP 1
+
+#define RETRY_TASK ((void *)-1UL)
+
+struct sched_class {
+ const struct sched_class *next;
+
+ void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+ void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
+ void (*yield_task) (struct rq *rq);
+ bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
+
+ void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
+
+ /*
+ * It is the responsibility of the pick_next_task() method that will
+ * return the next task to call put_prev_task() on the @prev task or
+ * something equivalent.
+ *
+ * May return RETRY_TASK when it finds a higher prio class has runnable
+ * tasks.
+ */
+ struct task_struct * (*pick_next_task) (struct rq *rq,
+ struct task_struct *prev);
+ void (*put_prev_task) (struct rq *rq, struct task_struct *p);
+
+#ifdef CONFIG_SMP
+ int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
+ void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
+
+ void (*post_schedule) (struct rq *this_rq);
+ void (*task_waking) (struct task_struct *task);
+ void (*task_woken) (struct rq *this_rq, struct task_struct *task);
- CPUACCT_STAT_NSTATS,
+ void (*set_cpus_allowed)(struct task_struct *p,
+ const struct cpumask *newmask);
+
+ void (*rq_online)(struct rq *rq);
+ void (*rq_offline)(struct rq *rq);
+#endif
+
+ void (*set_curr_task) (struct rq *rq);
+ void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
+ void (*task_fork) (struct task_struct *p);
+ void (*task_dead) (struct task_struct *p);
+
+ void (*switched_from) (struct rq *this_rq, struct task_struct *task);
+ void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+ void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
+ int oldprio);
+
+ unsigned int (*get_rr_interval) (struct rq *rq,
+ struct task_struct *task);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ void (*task_move_group) (struct task_struct *p, int on_rq);
+#endif
};
+static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
+{
+ prev->sched_class->put_prev_task(rq, prev);
+}
#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
extern const struct sched_class stop_sched_class;
+extern const struct sched_class dl_sched_class;
extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;
@@ -859,24 +1173,28 @@ extern const struct sched_class idle_sched_class;
#ifdef CONFIG_SMP
-extern void trigger_load_balance(struct rq *rq, int cpu);
-extern void idle_balance(int this_cpu, struct rq *this_rq);
+extern void update_group_capacity(struct sched_domain *sd, int cpu);
-#else /* CONFIG_SMP */
+extern void trigger_load_balance(struct rq *rq);
-static inline void idle_balance(int cpu, struct rq *rq)
-{
-}
+extern void idle_enter_fair(struct rq *this_rq);
+extern void idle_exit_fair(struct rq *this_rq);
+
+#else
+
+static inline void idle_enter_fair(struct rq *rq) { }
+static inline void idle_exit_fair(struct rq *rq) { }
#endif
extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
extern void update_max_interval(void);
-extern void update_group_power(struct sched_domain *sd, int cpu);
-extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
+
+extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
+extern void init_sched_dl_class(void);
extern void resched_task(struct task_struct *p);
extern void resched_cpu(int cpu);
@@ -884,52 +1202,43 @@ extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
-extern void update_cpu_load(struct rq *this_rq);
+extern struct dl_bandwidth def_dl_bandwidth;
+extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
+extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
-#ifdef CONFIG_CGROUP_CPUACCT
-#include <linux/cgroup.h>
-/* track cpu usage of a group of tasks and its child groups */
-struct cpuacct {
- struct cgroup_subsys_state css;
- /* cpuusage holds pointer to a u64-type object on every cpu */
- u64 __percpu *cpuusage;
- struct kernel_cpustat __percpu *cpustat;
-};
+unsigned long to_ratio(u64 period, u64 runtime);
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
-{
- return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
- struct cpuacct, css);
-}
+extern void update_idle_cpu_load(struct rq *this_rq);
-/* return cpu accounting group to which this task belongs */
-static inline struct cpuacct *task_ca(struct task_struct *tsk)
-{
- return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
- struct cpuacct, css);
-}
+extern void init_task_runnable_average(struct task_struct *p);
-static inline struct cpuacct *parent_ca(struct cpuacct *ca)
+static inline void add_nr_running(struct rq *rq, unsigned count)
{
- if (!ca || !ca->css.cgroup->parent)
- return NULL;
- return cgroup_ca(ca->css.cgroup->parent);
-}
+ unsigned prev_nr = rq->nr_running;
-extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-#else
-static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+ rq->nr_running = prev_nr + count;
+
+#ifdef CONFIG_NO_HZ_FULL
+ if (prev_nr < 2 && rq->nr_running >= 2) {
+ if (tick_nohz_full_cpu(rq->cpu)) {
+ /* Order rq->nr_running write against the IPI */
+ smp_wmb();
+ smp_send_reschedule(rq->cpu);
+ }
+ }
#endif
+}
-static inline void inc_nr_running(struct rq *rq)
+static inline void sub_nr_running(struct rq *rq, unsigned count)
{
- rq->nr_running++;
+ rq->nr_running -= count;
}
-static inline void dec_nr_running(struct rq *rq)
+static inline void rq_last_tick_reset(struct rq *rq)
{
- rq->nr_running--;
+#ifdef CONFIG_NO_HZ_FULL
+ rq->last_sched_tick = jiffies;
+#endif
}
extern void update_rq_clock(struct rq *rq);
@@ -948,8 +1257,6 @@ static inline u64 sched_avg_period(void)
return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}
-void calc_load_account_idle(struct rq *this_rq);
-
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -1067,6 +1374,33 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
+static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
+{
+ if (l1 > l2)
+ swap(l1, l2);
+
+ spin_lock(l1);
+ spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
+static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)
+{
+ if (l1 > l2)
+ swap(l1, l2);
+
+ spin_lock_irq(l1);
+ spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
+static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
+{
+ if (l1 > l2)
+ swap(l1, l2);
+
+ raw_spin_lock(l1);
+ raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
/*
* double_rq_lock - safely lock two runqueues
*
@@ -1151,16 +1485,65 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
-extern void unthrottle_offline_cfs_rqs(struct rq *rq);
+extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
-extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
+extern void cfs_bandwidth_usage_inc(void);
+extern void cfs_bandwidth_usage_dec(void);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
enum rq_nohz_flag_bits {
NOHZ_TICK_STOPPED,
NOHZ_BALANCE_KICK,
- NOHZ_IDLE,
};
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+DECLARE_PER_CPU(u64, cpu_hardirq_time);
+DECLARE_PER_CPU(u64, cpu_softirq_time);
+
+#ifndef CONFIG_64BIT
+DECLARE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
+{
+ __this_cpu_inc(irq_time_seq.sequence);
+ smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+ smp_wmb();
+ __this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+ u64 irq_time;
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+ irq_time = per_cpu(cpu_softirq_time, cpu) +
+ per_cpu(cpu_hardirq_time, cpu);
+ } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+ return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+ return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+#endif /* CONFIG_64BIT */
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 2a581ba8e19..a476bea17fb 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -21,20 +21,23 @@ static int show_schedstat(struct seq_file *seq, void *v)
if (mask_str == NULL)
return -ENOMEM;
- seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
- seq_printf(seq, "timestamp %lu\n", jiffies);
- for_each_online_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
+ if (v == (void *)1) {
+ seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+ seq_printf(seq, "timestamp %lu\n", jiffies);
+ } else {
+ struct rq *rq;
#ifdef CONFIG_SMP
struct sched_domain *sd;
int dcount = 0;
#endif
+ cpu = (unsigned long)(v - 2);
+ rq = cpu_rq(cpu);
/* runqueue-specific stats */
seq_printf(seq,
- "cpu%d %u %u %u %u %u %u %llu %llu %lu",
+ "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
cpu, rq->yld_count,
- rq->sched_switch, rq->sched_count, rq->sched_goidle,
+ rq->sched_count, rq->sched_goidle,
rq->ttwu_count, rq->ttwu_local,
rq->rq_cpu_time,
rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
@@ -77,30 +80,61 @@ static int show_schedstat(struct seq_file *seq, void *v)
return 0;
}
-static int schedstat_open(struct inode *inode, struct file *file)
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *schedstat_start(struct seq_file *file, loff_t *offset)
{
- unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
- char *buf = kmalloc(size, GFP_KERNEL);
- struct seq_file *m;
- int res;
+ unsigned long n = *offset;
- if (!buf)
- return -ENOMEM;
- res = single_open(file, show_schedstat, NULL);
- if (!res) {
- m = file->private_data;
- m->buf = buf;
- m->size = size;
- } else
- kfree(buf);
- return res;
+ if (n == 0)
+ return (void *) 1;
+
+ n--;
+
+ if (n > 0)
+ n = cpumask_next(n - 1, cpu_online_mask);
+ else
+ n = cpumask_first(cpu_online_mask);
+
+ *offset = n + 1;
+
+ if (n < nr_cpu_ids)
+ return (void *)(unsigned long)(n + 2);
+ return NULL;
+}
+
+static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ (*offset)++;
+ return schedstat_start(file, offset);
+}
+
+static void schedstat_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations schedstat_sops = {
+ .start = schedstat_start,
+ .next = schedstat_next,
+ .stop = schedstat_stop,
+ .show = show_schedstat,
+};
+
+static int schedstat_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &schedstat_sops);
}
static const struct file_operations proc_schedstat_operations = {
.open = schedstat_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = seq_release,
};
static int __init proc_schedstat_init(void)
@@ -108,4 +142,4 @@ static int __init proc_schedstat_init(void)
proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
return 0;
}
-module_init(proc_schedstat_init);
+subsys_initcall(proc_schedstat_init);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 2ef90a51ec5..4ab70433965 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
* from dequeue_task() to account for possible rq->clock skew across cpus. The
* delta taken on each cpu would annul the skew.
*/
-static inline void sched_info_dequeued(struct task_struct *t)
+static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
{
- unsigned long long now = task_rq(t)->clock, delta = 0;
+ unsigned long long now = rq_clock(rq), delta = 0;
if (unlikely(sched_info_on()))
if (t->sched_info.last_queued)
@@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
sched_info_reset_dequeued(t);
t->sched_info.run_delay += delta;
- rq_sched_info_dequeued(task_rq(t), delta);
+ rq_sched_info_dequeued(rq, delta);
}
/*
@@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t)
* long it was waiting to run. We also note when it began so that we
* can keep stats on how long its timeslice is.
*/
-static void sched_info_arrive(struct task_struct *t)
+static void sched_info_arrive(struct rq *rq, struct task_struct *t)
{
- unsigned long long now = task_rq(t)->clock, delta = 0;
+ unsigned long long now = rq_clock(rq), delta = 0;
if (t->sched_info.last_queued)
delta = now - t->sched_info.last_queued;
@@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t)
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
- rq_sched_info_arrive(task_rq(t), delta);
+ rq_sched_info_arrive(rq, delta);
}
/*
@@ -96,29 +96,30 @@ static void sched_info_arrive(struct task_struct *t)
* the timestamp if it is already not set. It's assumed that
* sched_info_dequeued() will clear that stamp when appropriate.
*/
-static inline void sched_info_queued(struct task_struct *t)
+static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
{
if (unlikely(sched_info_on()))
if (!t->sched_info.last_queued)
- t->sched_info.last_queued = task_rq(t)->clock;
+ t->sched_info.last_queued = rq_clock(rq);
}
/*
- * Called when a process ceases being the active-running process, either
- * voluntarily or involuntarily. Now we can calculate how long we ran.
+ * Called when a process ceases being the active-running process involuntarily
+ * due, typically, to expiring its time slice (this may also be called when
+ * switching to the idle task). Now we can calculate how long we ran.
* Also, if the process is still in the TASK_RUNNING state, call
* sched_info_queued() to mark that it has now again started waiting on
* the runqueue.
*/
-static inline void sched_info_depart(struct task_struct *t)
+static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
{
- unsigned long long delta = task_rq(t)->clock -
+ unsigned long long delta = rq_clock(rq) -
t->sched_info.last_arrival;
- rq_sched_info_depart(task_rq(t), delta);
+ rq_sched_info_depart(rq, delta);
if (t->state == TASK_RUNNING)
- sched_info_queued(t);
+ sched_info_queued(rq, t);
}
/*
@@ -127,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t)
* the idle task.) We are only called when prev != next.
*/
static inline void
-__sched_info_switch(struct task_struct *prev, struct task_struct *next)
+__sched_info_switch(struct rq *rq,
+ struct task_struct *prev, struct task_struct *next)
{
- struct rq *rq = task_rq(prev);
-
/*
* prev now departs the cpu. It's not interesting to record
* stats about how efficient we were at scheduling the idle
* process, however.
*/
if (prev != rq->idle)
- sched_info_depart(prev);
+ sched_info_depart(rq, prev);
if (next != rq->idle)
- sched_info_arrive(next);
+ sched_info_arrive(rq, next);
}
static inline void
-sched_info_switch(struct task_struct *prev, struct task_struct *next)
+sched_info_switch(struct rq *rq,
+ struct task_struct *prev, struct task_struct *next)
{
if (unlikely(sched_info_on()))
- __sched_info_switch(prev, next);
+ __sched_info_switch(rq, prev, next);
}
#else
-#define sched_info_queued(t) do { } while (0)
+#define sched_info_queued(rq, t) do { } while (0)
#define sched_info_reset_dequeued(t) do { } while (0)
-#define sched_info_dequeued(t) do { } while (0)
-#define sched_info_switch(t, next) do { } while (0)
+#define sched_info_dequeued(rq, t) do { } while (0)
+#define sched_info_depart(rq, t) do { } while (0)
+#define sched_info_arrive(rq, next) do { } while (0)
+#define sched_info_switch(rq, t, next) do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
/*
@@ -162,6 +165,39 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
*/
/**
+ * cputimer_running - return true if cputimer is running
+ *
+ * @tsk: Pointer to target task.
+ */
+static inline bool cputimer_running(struct task_struct *tsk)
+
+{
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+
+ if (!cputimer->running)
+ return false;
+
+ /*
+ * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
+ * in __exit_signal(), we won't account to the signal struct further
+ * cputime consumed by that task, even though the task can still be
+ * ticking after __exit_signal().
+ *
+ * In order to keep a consistent behaviour between thread group cputime
+ * and thread group cputimer accounting, lets also ignore the cputime
+ * elapsing after __exit_signal() in any thread group timer running.
+ *
+ * This makes sure that POSIX CPU clocks and timers are synchronized, so
+ * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
+ * clock delta is behind the expiring timer value.
+ */
+ if (unlikely(!tsk->sighand))
+ return false;
+
+ return true;
+}
+
+/**
* account_group_user_time - Maintain utime for a thread group.
*
* @tsk: Pointer to task structure.
@@ -176,7 +212,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
- if (!cputimer->running)
+ if (!cputimer_running(tsk))
return;
raw_spin_lock(&cputimer->lock);
@@ -199,7 +235,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
- if (!cputimer->running)
+ if (!cputimer_running(tsk))
return;
raw_spin_lock(&cputimer->lock);
@@ -222,7 +258,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
- if (!cputimer->running)
+ if (!cputimer_running(tsk))
return;
raw_spin_lock(&cputimer->lock);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7b386e86fd2..bfe0edadbfb 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -11,7 +11,7 @@
#ifdef CONFIG_SMP
static int
-select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
+select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* stop tasks as never migrate */
}
@@ -23,26 +23,31 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
/* we're never preempted */
}
-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *
+pick_next_task_stop(struct rq *rq, struct task_struct *prev)
{
struct task_struct *stop = rq->stop;
- if (stop && stop->on_rq)
- return stop;
+ if (!stop || !stop->on_rq)
+ return NULL;
- return NULL;
+ put_prev_task(rq, prev);
+
+ stop->se.exec_start = rq_clock_task(rq);
+
+ return stop;
}
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
- inc_nr_running(rq);
+ add_nr_running(rq, 1);
}
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
- dec_nr_running(rq);
+ sub_nr_running(rq, 1);
}
static void yield_task_stop(struct rq *rq)
@@ -52,6 +57,21 @@ static void yield_task_stop(struct rq *rq)
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
{
+ struct task_struct *curr = rq->curr;
+ u64 delta_exec;
+
+ delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+ if (unlikely((s64)delta_exec < 0))
+ delta_exec = 0;
+
+ schedstat_set(curr->se.statistics.exec_max,
+ max(curr->se.statistics.exec_max, delta_exec));
+
+ curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
+ curr->se.exec_start = rq_clock_task(rq);
+ cpuacct_charge(curr, delta_exec);
}
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
@@ -60,6 +80,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
static void set_curr_task_stop(struct rq *rq)
{
+ struct task_struct *stop = rq->stop;
+
+ stop->se.exec_start = rq_clock_task(rq);
}
static void switched_to_stop(struct rq *rq, struct task_struct *p)
@@ -83,7 +106,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
* Simple, special scheduling class for the per-CPU stop tasks:
*/
const struct sched_class stop_sched_class = {
- .next = &rt_sched_class,
+ .next = &dl_sched_class,
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
new file mode 100644
index 00000000000..0ffa20ae657
--- /dev/null
+++ b/kernel/sched/wait.c
@@ -0,0 +1,504 @@
+/*
+ * Generic waiting primitives.
+ *
+ * (C) 2004 Nadia Yvette Chambers, Oracle
+ */
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/wait.h>
+#include <linux/hash.h>
+
+void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
+{
+ spin_lock_init(&q->lock);
+ lockdep_set_class_and_name(&q->lock, key, name);
+ INIT_LIST_HEAD(&q->task_list);
+}
+
+EXPORT_SYMBOL(__init_waitqueue_head);
+
+void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ __add_wait_queue(q, wait);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue);
+
+void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ wait->flags |= WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ __add_wait_queue_tail(q, wait);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue_exclusive);
+
+void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __remove_wait_queue(q, wait);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(remove_wait_queue);
+
+
+/*
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, int wake_flags, void *key)
+{
+ wait_queue_t *curr, *next;
+
+ list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
+ unsigned flags = curr->flags;
+
+ if (curr->func(curr, mode, wake_flags, key) &&
+ (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ break;
+ }
+}
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, 0, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(__wake_up);
+
+/*
+ * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
+ */
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
+{
+ __wake_up_common(q, mode, nr, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked);
+
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+{
+ __wake_up_common(q, mode, 1, 0, key);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
+
+/**
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
+ *
+ * The sync wakeup differs that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronized'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ unsigned long flags;
+ int wake_flags = 1; /* XXX WF_SYNC */
+
+ if (unlikely(!q))
+ return;
+
+ if (unlikely(nr_exclusive != 1))
+ wake_flags = 0;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+
+/*
+ * __wake_up_sync - see __wake_up_sync_key()
+ */
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+ __wake_up_sync_key(q, mode, nr_exclusive, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+
+/*
+ * Note: we use "set_current_state()" _after_ the wait-queue add,
+ * because we need a memory barrier there on SMP, so that any
+ * wake-function that tests for the wait-queue being active
+ * will be guaranteed to see waitqueue addition _or_ subsequent
+ * tests in this thread will see the wakeup having taken place.
+ *
+ * The spin_unlock() itself is semi-permeable and only protects
+ * one way (it only protects stuff inside the critical region and
+ * stops them from bleeding out - it would still allow subsequent
+ * loads to move into the critical region).
+ */
+void
+prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+ unsigned long flags;
+
+ wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ if (list_empty(&wait->task_list))
+ __add_wait_queue(q, wait);
+ set_current_state(state);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait);
+
+void
+prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+ unsigned long flags;
+
+ wait->flags |= WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ if (list_empty(&wait->task_list))
+ __add_wait_queue_tail(q, wait);
+ set_current_state(state);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait_exclusive);
+
+long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+ unsigned long flags;
+
+ if (signal_pending_state(state, current))
+ return -ERESTARTSYS;
+
+ wait->private = current;
+ wait->func = autoremove_wake_function;
+
+ spin_lock_irqsave(&q->lock, flags);
+ if (list_empty(&wait->task_list)) {
+ if (wait->flags & WQ_FLAG_EXCLUSIVE)
+ __add_wait_queue_tail(q, wait);
+ else
+ __add_wait_queue(q, wait);
+ }
+ set_current_state(state);
+ spin_unlock_irqrestore(&q->lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(prepare_to_wait_event);
+
+/**
+ * finish_wait - clean up after waiting in a queue
+ * @q: waitqueue waited on
+ * @wait: wait descriptor
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ */
+void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+ /*
+ * We can check for list emptiness outside the lock
+ * IFF:
+ * - we use the "careful" check that verifies both
+ * the next and prev pointers, so that there cannot
+ * be any half-pending updates in progress on other
+ * CPU's that we haven't seen yet (and that might
+ * still change the stack area.
+ * and
+ * - all other users take the lock (ie we can only
+ * have _one_ other CPU that looks at or modifies
+ * the list).
+ */
+ if (!list_empty_careful(&wait->task_list)) {
+ spin_lock_irqsave(&q->lock, flags);
+ list_del_init(&wait->task_list);
+ spin_unlock_irqrestore(&q->lock, flags);
+ }
+}
+EXPORT_SYMBOL(finish_wait);
+
+/**
+ * abort_exclusive_wait - abort exclusive waiting in a queue
+ * @q: waitqueue waited on
+ * @wait: wait descriptor
+ * @mode: runstate of the waiter to be woken
+ * @key: key to identify a wait bit queue or %NULL
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ *
+ * Wakes up the next waiter if the caller is concurrently
+ * woken up through the queue.
+ *
+ * This prevents waiter starvation where an exclusive waiter
+ * aborts and is woken up concurrently and no one wakes up
+ * the next waiter.
+ */
+void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
+ unsigned int mode, void *key)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+ spin_lock_irqsave(&q->lock, flags);
+ if (!list_empty(&wait->task_list))
+ list_del_init(&wait->task_list);
+ else if (waitqueue_active(q))
+ __wake_up_locked_key(q, mode, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(abort_exclusive_wait);
+
+int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ int ret = default_wake_function(wait, mode, sync, key);
+
+ if (ret)
+ list_del_init(&wait->task_list);
+ return ret;
+}
+EXPORT_SYMBOL(autoremove_wake_function);
+
+int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
+{
+ struct wait_bit_key *key = arg;
+ struct wait_bit_queue *wait_bit
+ = container_of(wait, struct wait_bit_queue, wait);
+
+ if (wait_bit->key.flags != key->flags ||
+ wait_bit->key.bit_nr != key->bit_nr ||
+ test_bit(key->bit_nr, key->flags))
+ return 0;
+ else
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+EXPORT_SYMBOL(wake_bit_function);
+
+/*
+ * To allow interruptible waiting and asynchronous (i.e. nonblocking)
+ * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
+ * permitted return codes. Nonzero return codes halt waiting and return.
+ */
+int __sched
+__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
+ int (*action)(void *), unsigned mode)
+{
+ int ret = 0;
+
+ do {
+ prepare_to_wait(wq, &q->wait, mode);
+ if (test_bit(q->key.bit_nr, q->key.flags))
+ ret = (*action)(q->key.flags);
+ } while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
+ finish_wait(wq, &q->wait);
+ return ret;
+}
+EXPORT_SYMBOL(__wait_on_bit);
+
+int __sched out_of_line_wait_on_bit(void *word, int bit,
+ int (*action)(void *), unsigned mode)
+{
+ wait_queue_head_t *wq = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wait, word, bit);
+
+ return __wait_on_bit(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit);
+
+int __sched
+__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
+ int (*action)(void *), unsigned mode)
+{
+ do {
+ int ret;
+
+ prepare_to_wait_exclusive(wq, &q->wait, mode);
+ if (!test_bit(q->key.bit_nr, q->key.flags))
+ continue;
+ ret = action(q->key.flags);
+ if (!ret)
+ continue;
+ abort_exclusive_wait(wq, &q->wait, mode, &q->key);
+ return ret;
+ } while (test_and_set_bit(q->key.bit_nr, q->key.flags));
+ finish_wait(wq, &q->wait);
+ return 0;
+}
+EXPORT_SYMBOL(__wait_on_bit_lock);
+
+int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
+ int (*action)(void *), unsigned mode)
+{
+ wait_queue_head_t *wq = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wait, word, bit);
+
+ return __wait_on_bit_lock(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
+
+void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
+{
+ struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
+ if (waitqueue_active(wq))
+ __wake_up(wq, TASK_NORMAL, 1, &key);
+}
+EXPORT_SYMBOL(__wake_up_bit);
+
+/**
+ * wake_up_bit - wake up a waiter on a bit
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ *
+ * There is a standard hashed waitqueue table for generic use. This
+ * is the part of the hashtable's accessor API that wakes up waiters
+ * on a bit. For instance, if one were to have waiters on a bitflag,
+ * one would call wake_up_bit() after clearing the bit.
+ *
+ * In order for this to function properly, as it uses waitqueue_active()
+ * internally, some kind of memory barrier must be done prior to calling
+ * this. Typically, this will be smp_mb__after_atomic(), but in some
+ * cases where bitflags are manipulated non-atomically under a lock, one
+ * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
+ * because spin_unlock() does not guarantee a memory barrier.
+ */
+void wake_up_bit(void *word, int bit)
+{
+ __wake_up_bit(bit_waitqueue(word, bit), word, bit);
+}
+EXPORT_SYMBOL(wake_up_bit);
+
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
+{
+ const int shift = BITS_PER_LONG == 32 ? 5 : 6;
+ const struct zone *zone = page_zone(virt_to_page(word));
+ unsigned long val = (unsigned long)word << shift | bit;
+
+ return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
+}
+EXPORT_SYMBOL(bit_waitqueue);
+
+/*
+ * Manipulate the atomic_t address to produce a better bit waitqueue table hash
+ * index (we're keying off bit -1, but that would produce a horrible hash
+ * value).
+ */
+static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
+{
+ if (BITS_PER_LONG == 64) {
+ unsigned long q = (unsigned long)p;
+ return bit_waitqueue((void *)(q & ~1), q & 1);
+ }
+ return bit_waitqueue(p, 0);
+}
+
+static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync,
+ void *arg)
+{
+ struct wait_bit_key *key = arg;
+ struct wait_bit_queue *wait_bit
+ = container_of(wait, struct wait_bit_queue, wait);
+ atomic_t *val = key->flags;
+
+ if (wait_bit->key.flags != key->flags ||
+ wait_bit->key.bit_nr != key->bit_nr ||
+ atomic_read(val) != 0)
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+
+/*
+ * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
+ * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero
+ * return codes halt waiting and return.
+ */
+static __sched
+int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,
+ int (*action)(atomic_t *), unsigned mode)
+{
+ atomic_t *val;
+ int ret = 0;
+
+ do {
+ prepare_to_wait(wq, &q->wait, mode);
+ val = q->key.flags;
+ if (atomic_read(val) == 0)
+ break;
+ ret = (*action)(val);
+ } while (!ret && atomic_read(val) != 0);
+ finish_wait(wq, &q->wait);
+ return ret;
+}
+
+#define DEFINE_WAIT_ATOMIC_T(name, p) \
+ struct wait_bit_queue name = { \
+ .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \
+ .wait = { \
+ .private = current, \
+ .func = wake_atomic_t_function, \
+ .task_list = \
+ LIST_HEAD_INIT((name).wait.task_list), \
+ }, \
+ }
+
+__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
+ unsigned mode)
+{
+ wait_queue_head_t *wq = atomic_t_waitqueue(p);
+ DEFINE_WAIT_ATOMIC_T(wait, p);
+
+ return __wait_on_atomic_t(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
+
+/**
+ * wake_up_atomic_t - Wake up a waiter on a atomic_t
+ * @p: The atomic_t being waited on, a kernel virtual address
+ *
+ * Wake up anyone waiting for the atomic_t to go to zero.
+ *
+ * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
+ * check is done by the waiter's wake function, not the by the waker itself).
+ */
+void wake_up_atomic_t(atomic_t *p)
+{
+ __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
+}
+EXPORT_SYMBOL(wake_up_atomic_t);