diff options
Diffstat (limited to 'kernel/sched')
| -rw-r--r-- | kernel/sched/Makefile | 9 | ||||
| -rw-r--r-- | kernel/sched/auto_group.c | 14 | ||||
| -rw-r--r-- | kernel/sched/clock.c | 136 | ||||
| -rw-r--r-- | kernel/sched/completion.c | 299 | ||||
| -rw-r--r-- | kernel/sched/core.c | 5003 | ||||
| -rw-r--r-- | kernel/sched/cpuacct.c | 283 | ||||
| -rw-r--r-- | kernel/sched/cpuacct.h | 17 | ||||
| -rw-r--r-- | kernel/sched/cpudeadline.c | 229 | ||||
| -rw-r--r-- | kernel/sched/cpudeadline.h | 33 | ||||
| -rw-r--r-- | kernel/sched/cpupri.c | 32 | ||||
| -rw-r--r-- | kernel/sched/cpupri.h | 2 | ||||
| -rw-r--r-- | kernel/sched/cputime.c | 838 | ||||
| -rw-r--r-- | kernel/sched/deadline.c | 1676 | ||||
| -rw-r--r-- | kernel/sched/debug.c | 255 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 5041 | ||||
| -rw-r--r-- | kernel/sched/features.h | 53 | ||||
| -rw-r--r-- | kernel/sched/idle.c | 273 | ||||
| -rw-r--r-- | kernel/sched/idle_task.c | 13 | ||||
| -rw-r--r-- | kernel/sched/proc.c | 591 | ||||
| -rw-r--r-- | kernel/sched/rt.c | 554 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 725 | ||||
| -rw-r--r-- | kernel/sched/stats.c | 80 | ||||
| -rw-r--r-- | kernel/sched/stats.h | 90 | ||||
| -rw-r--r-- | kernel/sched/stop_task.c | 39 | ||||
| -rw-r--r-- | kernel/sched/wait.c | 504 |
25 files changed, 12251 insertions, 4538 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 9a7dd35102a..ab32b7b0db5 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,10 +11,11 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o -obj-$(CONFIG_SMP) += cpupri.o +obj-y += core.o proc.o clock.o cputime.o +obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o +obj-y += wait.o completion.o idle.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o - - +obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index e8a1f83ee0e..e73efba9830 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref) ag->tg->rt_se = NULL; ag->tg->rt_rq = NULL; #endif + sched_offline_group(ag->tg); sched_destroy_group(ag->tg); } @@ -95,6 +96,7 @@ static inline struct autogroup *autogroup_create(void) #endif tg->autogroup = ag; + sched_online_group(tg, &root_task_group); return ag; out_free: @@ -195,20 +197,20 @@ __setup("noautogroup", setup_autogroup); #ifdef CONFIG_PROC_FS -int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) +int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) { static unsigned long next = INITIAL_JIFFIES; struct autogroup *ag; int err; - if (*nice < -20 || *nice > 19) + if (nice < MIN_NICE || nice > MAX_NICE) return -EINVAL; - err = security_task_setnice(current, *nice); + err = security_task_setnice(current, nice); if (err) return err; - if (*nice < 0 && !can_nice(current, *nice)) + if (nice < 0 && !can_nice(current, nice)) return -EPERM; /* this is a heavy operation taking global locks.. */ @@ -219,9 +221,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) ag = autogroup_task_get(p); down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); + err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); if (!err) - ag->nice = *nice; + ag->nice = nice; up_write(&ag->lock); autogroup_kref_put(ag); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c685e31492d..3ef6451e972 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -26,9 +26,10 @@ * at 0 on boot (but people really shouldn't rely on that). * * cpu_clock(i) -- can be used from any context, including NMI. - * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) * local_clock() -- is cpu_clock() on the current cpu. * + * sched_clock_cpu(i) + * * How: * * The implementation either uses sched_clock() when @@ -50,15 +51,6 @@ * Furthermore, explicit sleep and wakeup hooks allow us to account for time * that is otherwise invisible (TSC gets stopped). * - * - * Notes: - * - * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things - * like cpufreq interrupts that can change the base clock (TSC) multiplier - * and cause funny jumps in time -- although the filtering provided by - * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it - * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on - * sched_clock(). */ #include <linux/spinlock.h> #include <linux/hardirq.h> @@ -66,13 +58,16 @@ #include <linux/percpu.h> #include <linux/ktime.h> #include <linux/sched.h> +#include <linux/static_key.h> +#include <linux/workqueue.h> +#include <linux/compiler.h> /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. * Architectures and sub-architectures can override this. */ -unsigned long long __attribute__((weak)) sched_clock(void) +unsigned long long __weak sched_clock(void) { return (unsigned long long)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); @@ -82,7 +77,52 @@ EXPORT_SYMBOL_GPL(sched_clock); __read_mostly int sched_clock_running; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -__read_mostly int sched_clock_stable; +static struct static_key __sched_clock_stable = STATIC_KEY_INIT; +static int __sched_clock_stable_early; + +int sched_clock_stable(void) +{ + return static_key_false(&__sched_clock_stable); +} + +static void __set_sched_clock_stable(void) +{ + if (!sched_clock_stable()) + static_key_slow_inc(&__sched_clock_stable); +} + +void set_sched_clock_stable(void) +{ + __sched_clock_stable_early = 1; + + smp_mb(); /* matches sched_clock_init() */ + + if (!sched_clock_running) + return; + + __set_sched_clock_stable(); +} + +static void __clear_sched_clock_stable(struct work_struct *work) +{ + /* XXX worry about clock continuity */ + if (sched_clock_stable()) + static_key_slow_dec(&__sched_clock_stable); +} + +static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); + +void clear_sched_clock_stable(void) +{ + __sched_clock_stable_early = 0; + + smp_mb(); /* matches sched_clock_init() */ + + if (!sched_clock_running) + return; + + schedule_work(&sched_clock_work); +} struct sched_clock_data { u64 tick_raw; @@ -116,6 +156,20 @@ void sched_clock_init(void) } sched_clock_running = 1; + + /* + * Ensure that it is impossible to not do a static_key update. + * + * Either {set,clear}_sched_clock_stable() must see sched_clock_running + * and do the update, or we must see their __sched_clock_stable_early + * and do the update, or both. + */ + smp_mb(); /* matches {set,clear}_sched_clock_stable() */ + + if (__sched_clock_stable_early) + __set_sched_clock_stable(); + else + __clear_sched_clock_stable(NULL); } /* @@ -176,10 +230,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd) u64 this_clock, remote_clock; u64 *ptr, old_val, val; +#if BITS_PER_LONG != 64 +again: + /* + * Careful here: The local and the remote clock values need to + * be read out atomic as we need to compare the values and + * then update either the local or the remote side. So the + * cmpxchg64 below only protects one readout. + * + * We must reread via sched_clock_local() in the retry case on + * 32bit as an NMI could use sched_clock_local() via the + * tracer and hit between the readout of + * the low32bit and the high 32bit portion. + */ + this_clock = sched_clock_local(my_scd); + /* + * We must enforce atomic readout on 32bit, otherwise the + * update on the remote cpu can hit inbetween the readout of + * the low32bit and the high 32bit portion. + */ + remote_clock = cmpxchg64(&scd->clock, 0, 0); +#else + /* + * On 64bit the read of [my]scd->clock is atomic versus the + * update, so we can avoid the above 32bit dance. + */ sched_clock_local(my_scd); again: this_clock = my_scd->clock; remote_clock = scd->clock; +#endif /* * Use the opportunity that we have both locks @@ -216,20 +296,20 @@ u64 sched_clock_cpu(int cpu) struct sched_clock_data *scd; u64 clock; - WARN_ON_ONCE(!irqs_disabled()); - - if (sched_clock_stable) + if (sched_clock_stable()) return sched_clock(); if (unlikely(!sched_clock_running)) return 0ull; + preempt_disable_notrace(); scd = cpu_sdc(cpu); if (cpu != smp_processor_id()) clock = sched_clock_remote(scd); else clock = sched_clock_local(scd); + preempt_enable_notrace(); return clock; } @@ -239,7 +319,7 @@ void sched_clock_tick(void) struct sched_clock_data *scd; u64 now, now_gtod; - if (sched_clock_stable) + if (sched_clock_stable()) return; if (unlikely(!sched_clock_running)) @@ -290,14 +370,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); */ u64 cpu_clock(int cpu) { - u64 clock; - unsigned long flags; + if (!sched_clock_stable()) + return sched_clock_cpu(cpu); - local_irq_save(flags); - clock = sched_clock_cpu(cpu); - local_irq_restore(flags); - - return clock; + return sched_clock(); } /* @@ -309,14 +385,10 @@ u64 cpu_clock(int cpu) */ u64 local_clock(void) { - u64 clock; - unsigned long flags; - - local_irq_save(flags); - clock = sched_clock_cpu(smp_processor_id()); - local_irq_restore(flags); + if (!sched_clock_stable()) + return sched_clock_cpu(raw_smp_processor_id()); - return clock; + return sched_clock(); } #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ @@ -336,12 +408,12 @@ u64 sched_clock_cpu(int cpu) u64 cpu_clock(int cpu) { - return sched_clock_cpu(cpu); + return sched_clock(); } u64 local_clock(void) { - return sched_clock_cpu(0); + return sched_clock(); } #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c new file mode 100644 index 00000000000..a63f4dc2790 --- /dev/null +++ b/kernel/sched/completion.c @@ -0,0 +1,299 @@ +/* + * Generic wait-for-completion handler; + * + * It differs from semaphores in that their default case is the opposite, + * wait_for_completion default blocks whereas semaphore default non-block. The + * interface also makes it easy to 'complete' multiple waiting threads, + * something which isn't entirely natural for semaphores. + * + * But more importantly, the primitive documents the usage. Semaphores would + * typically be used for exclusion which gives rise to priority inversion. + * Waiting for completion is a typically sync point, but not an exclusion point. + */ + +#include <linux/sched.h> +#include <linux/completion.h> + +/** + * complete: - signals a single thread waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void complete(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done++; + __wake_up_locked(&x->wait, TASK_NORMAL, 1); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete); + +/** + * complete_all: - signals all threads waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void complete_all(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += UINT_MAX/2; + __wake_up_locked(&x->wait, TASK_NORMAL, 0); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_all); + +static inline long __sched +do_wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) +{ + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + __add_wait_queue_tail_exclusive(&x->wait, &wait); + do { + if (signal_pending_state(state, current)) { + timeout = -ERESTARTSYS; + break; + } + __set_current_state(state); + spin_unlock_irq(&x->wait.lock); + timeout = action(timeout); + spin_lock_irq(&x->wait.lock); + } while (!x->done && timeout); + __remove_wait_queue(&x->wait, &wait); + if (!x->done) + return timeout; + } + x->done--; + return timeout ?: 1; +} + +static inline long __sched +__wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) +{ + might_sleep(); + + spin_lock_irq(&x->wait.lock); + timeout = do_wait_for_common(x, action, timeout, state); + spin_unlock_irq(&x->wait.lock); + return timeout; +} + +static long __sched +wait_for_common(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, schedule_timeout, timeout, state); +} + +static long __sched +wait_for_common_io(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, io_schedule_timeout, timeout, state); +} + +/** + * wait_for_completion: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ +void __sched wait_for_completion(struct completion *x) +{ + wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion); + +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + * + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_timeout); + +/** + * wait_for_completion_io: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. The caller is accounted as waiting + * for IO. + */ +void __sched wait_for_completion_io(struct completion *x) +{ + wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io); + +/** + * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. The caller is accounted as waiting for IO. + * + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io_timeout); + +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x: holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + * + * Return: -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_interruptible(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_interruptible); + +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + * + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_interruptible_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); + +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. + * + * Return: -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_killable(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_killable); + +/** + * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be + * signaled or for a specified timeout to expire. It can be + * interrupted by a kill signal. The timeout is in jiffies. + * + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_killable_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL(wait_for_completion_killable_timeout); + +/** + * try_wait_for_completion - try to decrement a completion without blocking + * @x: completion structure + * + * Return: 0 if a decrement cannot be done without blocking + * 1 if a decrement succeeded. + * + * If a completion is being used as a counting completion, + * attempt to decrement the counter without blocking. This + * enables us to avoid waiting if the resource the completion + * is protecting is not available. + */ +bool try_wait_for_completion(struct completion *x) +{ + unsigned long flags; + int ret = 1; + + spin_lock_irqsave(&x->wait.lock, flags); + if (!x->done) + ret = 0; + else + x->done--; + spin_unlock_irqrestore(&x->wait.lock, flags); + return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + * completion_done - Test to see if a completion has any waiters + * @x: completion structure + * + * Return: 0 if there are waiters (wait_for_completion() in progress) + * 1 if there are no waiters. + * + */ +bool completion_done(struct completion *x) +{ + unsigned long flags; + int ret = 1; + + spin_lock_irqsave(&x->wait.lock, flags); + if (!x->done) + ret = 0; + spin_unlock_irqrestore(&x->wait.lock, flags); + return ret; +} +EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5255c9d2e05..bc1638b3344 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -71,7 +71,11 @@ #include <linux/ftrace.h> #include <linux/slab.h> #include <linux/init_task.h> +#include <linux/binfmts.h> +#include <linux/context_tracking.h> +#include <linux/compiler.h> +#include <asm/switch_to.h> #include <asm/tlb.h> #include <asm/irq_regs.h> #include <asm/mutex.h> @@ -80,11 +84,28 @@ #endif #include "sched.h" -#include "../workqueue_sched.h" +#include "../workqueue_internal.h" +#include "../smpboot.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> +#ifdef smp_mb__before_atomic +void __smp_mb__before_atomic(void) +{ + smp_mb__before_atomic(); +} +EXPORT_SYMBOL(__smp_mb__before_atomic); +#endif + +#ifdef smp_mb__after_atomic +void __smp_mb__after_atomic(void) +{ + smp_mb__after_atomic(); +} +EXPORT_SYMBOL(__smp_mb__after_atomic); +#endif + void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { unsigned long delta; @@ -139,9 +160,8 @@ const_debug unsigned int sysctl_sched_features = #define SCHED_FEAT(name, enabled) \ #name , -static __read_mostly char *sched_feat_names[] = { +static const char * const sched_feat_names[] = { #include "features.h" - NULL }; #undef SCHED_FEAT @@ -162,13 +182,13 @@ static int sched_feat_show(struct seq_file *m, void *v) #ifdef HAVE_JUMP_LABEL -#define jump_label_key__true jump_label_key_enabled -#define jump_label_key__false jump_label_key_disabled +#define jump_label_key__true STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE #define SCHED_FEAT(name, enabled) \ jump_label_key__##enabled , -struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { #include "features.h" }; @@ -176,37 +196,24 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - if (jump_label_enabled(&sched_feat_keys[i])) - jump_label_dec(&sched_feat_keys[i]); + if (static_key_enabled(&sched_feat_keys[i])) + static_key_slow_dec(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - if (!jump_label_enabled(&sched_feat_keys[i])) - jump_label_inc(&sched_feat_keys[i]); + if (!static_key_enabled(&sched_feat_keys[i])) + static_key_slow_inc(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; #endif /* HAVE_JUMP_LABEL */ -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int sched_feat_set(char *cmp) { - char buf[64]; - char *cmp; - int neg = 0; int i; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); + int neg = 0; if (strncmp(cmp, "NO_", 3) == 0) { neg = 1; @@ -226,6 +233,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf, } } + return i; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + i = sched_feat_set(cmp); if (i == __SCHED_FEAT_NR) return -EINVAL; @@ -285,8 +313,6 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; - - /* * __task_rq_lock - lock the rq @p resides on. */ @@ -359,13 +385,6 @@ static struct rq *this_rq_lock(void) #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. - * - * Its all a bit involved since we cannot program an hrt while holding the - * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a - * reschedule event. - * - * When we get rescheduled we reprogram the hrtick_timer outside of the - * rq->lock. */ static void hrtick_clear(struct rq *rq) @@ -393,6 +412,15 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) } #ifdef CONFIG_SMP + +static int __hrtick_restart(struct rq *rq) +{ + struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = hrtimer_get_softexpires(timer); + + return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0); +} + /* * called from hardirq (IPI) context */ @@ -401,7 +429,7 @@ static void __hrtick_start(void *arg) struct rq *rq = arg; raw_spin_lock(&rq->lock); - hrtimer_restart(&rq->hrtick_timer); + __hrtick_restart(rq); rq->hrtick_csd_pending = 0; raw_spin_unlock(&rq->lock); } @@ -419,9 +447,9 @@ void hrtick_start(struct rq *rq, u64 delay) hrtimer_set_expires(timer, time); if (rq == this_rq()) { - hrtimer_restart(timer); + __hrtick_restart(rq); } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); + smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); rq->hrtick_csd_pending = 1; } } @@ -494,37 +522,98 @@ static inline void init_hrtick(void) #endif /* CONFIG_SCHED_HRTICK */ /* + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#define fetch_or(ptr, val) \ +({ typeof(*(ptr)) __old, __val = *(ptr); \ + for (;;) { \ + __old = cmpxchg((ptr), __val, __val | (val)); \ + if (__old == __val) \ + break; \ + __val = __old; \ + } \ + __old; \ +}) + +#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) +/* + * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, + * this avoids any races wrt polling state changes and thereby avoids + * spurious IPIs. + */ +static bool set_nr_and_not_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); +} + +/* + * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. + * + * If this returns true, then the idle task promises to call + * sched_ttwu_pending() and reschedule soon. + */ +static bool set_nr_if_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); + + for (;;) { + if (!(val & _TIF_POLLING_NRFLAG)) + return false; + if (val & _TIF_NEED_RESCHED) + return true; + old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); + if (old == val) + break; + val = old; + } + return true; +} + +#else +static bool set_nr_and_not_polling(struct task_struct *p) +{ + set_tsk_need_resched(p); + return true; +} + +#ifdef CONFIG_SMP +static bool set_nr_if_polling(struct task_struct *p) +{ + return false; +} +#endif +#endif + +/* * resched_task - mark a task 'to be rescheduled now'. * * On UP this means the setting of the need_resched flag, on SMP it * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -#ifdef CONFIG_SMP - -#ifndef tsk_is_polling -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) -#endif - void resched_task(struct task_struct *p) { int cpu; - assert_raw_spin_locked(&task_rq(p)->lock); + lockdep_assert_held(&task_rq(p)->lock); if (test_tsk_need_resched(p)) return; - set_tsk_need_resched(p); - cpu = task_cpu(p); - if (cpu == smp_processor_id()) + + if (cpu == smp_processor_id()) { + set_tsk_need_resched(p); + set_preempt_need_resched(); return; + } - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(p)) + if (set_nr_and_not_polling(p)) smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } void resched_cpu(int cpu) @@ -538,7 +627,8 @@ void resched_cpu(int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_SMP +#ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy cpu for migrating timers * from an idle cpu. This is good for power-savings. @@ -547,12 +637,15 @@ void resched_cpu(int cpu) * selecting an idle cpu will add more delays to the timers than intended * (as that cpu's timer base may not be uptodate wrt jiffies etc). */ -int get_nohz_timer_target(void) +int get_nohz_timer_target(int pinned) { int cpu = smp_processor_id(); int i; struct sched_domain *sd; + if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) + return cpu; + rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { @@ -576,56 +669,87 @@ unlock: * account when the CPU goes back to idle and evaluates the timer * wheel for the next timer event. */ -void wake_up_idle_cpu(int cpu) +static void wake_up_idle_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); if (cpu == smp_processor_id()) return; - /* - * This is safe, as this function is called with the timer - * wheel base lock of (cpu) held. When the CPU is on the way - * to idle and has not yet set rq->curr to idle then it will - * be serialized on the timer wheel base lock and take the new - * timer into account automatically. - */ - if (rq->curr != rq->idle) - return; + if (set_nr_and_not_polling(rq->idle)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); +} - /* - * We can set TIF_RESCHED on the idle task of the other CPU - * lockless. The worst case is that the other CPU runs the - * idle task through an additional NOOP schedule() - */ - set_tsk_need_resched(rq->idle); +static bool wake_up_full_nohz_cpu(int cpu) +{ + if (tick_nohz_full_cpu(cpu)) { + if (cpu != smp_processor_id() || + tick_nohz_tick_stopped()) + smp_send_reschedule(cpu); + return true; + } - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(rq->idle)) - smp_send_reschedule(cpu); + return false; +} + +void wake_up_nohz_cpu(int cpu) +{ + if (!wake_up_full_nohz_cpu(cpu)) + wake_up_idle_cpu(cpu); } static inline bool got_nohz_idle_kick(void) { int cpu = smp_processor_id(); - return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + + if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) + return false; + + if (idle_cpu(cpu) && !need_resched()) + return true; + + /* + * We can't run Idle Load Balance on this CPU for this time so we + * cancel it and clear NOHZ_BALANCE_KICK + */ + clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + return false; } -#else /* CONFIG_NO_HZ */ +#else /* CONFIG_NO_HZ_COMMON */ static inline bool got_nohz_idle_kick(void) { return false; } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ + +#ifdef CONFIG_NO_HZ_FULL +bool sched_can_stop_tick(void) +{ + struct rq *rq; + + rq = this_rq(); + + /* Make sure rq->nr_running update is visible after the IPI */ + smp_rmb(); + + /* More than one running task need preemption */ + if (rq->nr_running > 1) + return false; + + return true; +} +#endif /* CONFIG_NO_HZ_FULL */ void sched_avg_update(struct rq *rq) { s64 period = sched_avg_period(); - while ((s64)(rq->clock - rq->age_stamp) > period) { + while ((s64)(rq_clock(rq) - rq->age_stamp) > period) { /* * Inline assembly required to prevent the compiler * optimising this loop into a divmod call. @@ -637,12 +761,6 @@ void sched_avg_update(struct rq *rq) } } -#else /* !CONFIG_SMP */ -void resched_task(struct task_struct *p) -{ - assert_raw_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); -} #endif /* CONFIG_SMP */ #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ @@ -690,8 +808,6 @@ int tg_nop(struct task_group *tg, void *data) } #endif -void update_cpu_load(struct rq *this_rq); - static void set_load_weight(struct task_struct *p) { int prio = p->static_prio - MAX_RT_PRIO; @@ -713,14 +829,14 @@ static void set_load_weight(struct task_struct *p) static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_queued(p); + sched_info_queued(rq, p); p->sched_class->enqueue_task(rq, p, flags); } static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_dequeued(p); + sched_info_dequeued(rq, p); p->sched_class->dequeue_task(rq, p, flags); } @@ -740,126 +856,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - -/* - * There are no locks covering percpu hardirq/softirq time. - * They are only modified in account_system_vtime, on corresponding CPU - * with interrupts disabled. So, writes are safe. - * They are read and saved off onto struct rq in update_rq_clock(). - * This may result in other CPU reading this CPU's irq time and can - * race with irq/account_system_vtime on this CPU. We would either get old - * or new value with a side effect of accounting a slice of irq time to wrong - * task when irq is in progress while we read rq->clock. That is a worthy - * compromise in place of having locks on each irq in account_system_time. - */ -static DEFINE_PER_CPU(u64, cpu_hardirq_time); -static DEFINE_PER_CPU(u64, cpu_softirq_time); - -static DEFINE_PER_CPU(u64, irq_start_time); -static int sched_clock_irqtime; - -void enable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 1; -} - -void disable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 0; -} - -#ifndef CONFIG_64BIT -static DEFINE_PER_CPU(seqcount_t, irq_time_seq); - -static inline void irq_time_write_begin(void) -{ - __this_cpu_inc(irq_time_seq.sequence); - smp_wmb(); -} - -static inline void irq_time_write_end(void) -{ - smp_wmb(); - __this_cpu_inc(irq_time_seq.sequence); -} - -static inline u64 irq_time_read(int cpu) -{ - u64 irq_time; - unsigned seq; - - do { - seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); - irq_time = per_cpu(cpu_softirq_time, cpu) + - per_cpu(cpu_hardirq_time, cpu); - } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); - - return irq_time; -} -#else /* CONFIG_64BIT */ -static inline void irq_time_write_begin(void) -{ -} - -static inline void irq_time_write_end(void) -{ -} - -static inline u64 irq_time_read(int cpu) -{ - return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); -} -#endif /* CONFIG_64BIT */ - -/* - * Called before incrementing preempt_count on {soft,}irq_enter - * and before decrementing preempt_count on {soft,}irq_exit. - */ -void account_system_vtime(struct task_struct *curr) -{ - unsigned long flags; - s64 delta; - int cpu; - - if (!sched_clock_irqtime) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); - __this_cpu_add(irq_start_time, delta); - - irq_time_write_begin(); - /* - * We do not account for softirq time from ksoftirqd here. - * We want to continue accounting softirq time to ksoftirqd thread - * in that case, so as not to confuse scheduler with a special task - * that do not consume any time, but still wants to run. - */ - if (hardirq_count()) - __this_cpu_add(cpu_hardirq_time, delta); - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) - __this_cpu_add(cpu_softirq_time, delta); - - irq_time_write_end(); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(account_system_vtime); - -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - -#ifdef CONFIG_PARAVIRT -static inline u64 steal_ticks(u64 steal) -{ - if (unlikely(steal > NSEC_PER_SEC)) - return div_u64(steal, TICK_NSEC); - - return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -} -#endif - static void update_rq_clock_task(struct rq *rq, s64 delta) { /* @@ -894,20 +890,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) delta -= irq_delta; #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - if (static_branch((¶virt_steal_rq_enabled))) { - u64 st; - + if (static_key_false((¶virt_steal_rq_enabled))) { steal = paravirt_steal_clock(cpu_of(rq)); steal -= rq->prev_steal_time_rq; if (unlikely(steal > delta)) steal = delta; - st = steal_ticks(steal); - steal = st * TICK_NSEC; - rq->prev_steal_time_rq += steal; - delta -= steal; } #endif @@ -915,48 +905,11 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->clock_task += delta; #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) + if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) sched_rt_avg_update(rq, irq_delta + steal); #endif } -#ifdef CONFIG_IRQ_TIME_ACCOUNTING -static int irqtime_account_hi_update(void) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - unsigned long flags; - u64 latest_ns; - int ret = 0; - - local_irq_save(flags); - latest_ns = this_cpu_read(cpu_hardirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) - ret = 1; - local_irq_restore(flags); - return ret; -} - -static int irqtime_account_si_update(void) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - unsigned long flags; - u64 latest_ns; - int ret = 0; - - local_irq_save(flags); - latest_ns = this_cpu_read(cpu_softirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) - ret = 1; - local_irq_restore(flags); - return ret; -} - -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ - -#define sched_clock_irqtime (0) - -#endif - void sched_set_stop_task(int cpu, struct task_struct *stop) { struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; @@ -1006,7 +959,9 @@ static inline int normal_prio(struct task_struct *p) { int prio; - if (task_has_rt_policy(p)) + if (task_has_dl_policy(p)) + prio = MAX_DL_PRIO-1; + else if (task_has_rt_policy(p)) prio = MAX_RT_PRIO-1 - p->rt_priority; else prio = __normal_prio(p); @@ -1036,6 +991,8 @@ static int effective_prio(struct task_struct *p) /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. + * + * Return: 1 if the task is currently executing. 0 otherwise. */ inline int task_curr(const struct task_struct *p) { @@ -1050,7 +1007,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, if (prev_class->switched_from) prev_class->switched_from(rq, p); p->sched_class->switched_to(rq, p); - } else if (oldprio != p->prio) + } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); } @@ -1088,7 +1045,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * ttwu() will sort out the placement. */ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); + !(task_preempt_count(p) & PREEMPT_ACTIVE)); #ifdef CONFIG_LOCKDEP /* @@ -1096,7 +1053,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. * * sched_move_task() holds both and thus holding either pins the cgroup, - * see set_task_rq(). + * see task_group(). * * Furthermore, all task_rq users should acquire both locks, see * task_rq_lock(). @@ -1109,6 +1066,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { + if (p->sched_class->migrate_task_rq) + p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); } @@ -1116,6 +1075,108 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } +static void __migrate_swap_task(struct task_struct *p, int cpu) +{ + if (p->on_rq) { + struct rq *src_rq, *dst_rq; + + src_rq = task_rq(p); + dst_rq = cpu_rq(cpu); + + deactivate_task(src_rq, p, 0); + set_task_cpu(p, cpu); + activate_task(dst_rq, p, 0); + check_preempt_curr(dst_rq, p, 0); + } else { + /* + * Task isn't running anymore; make it appear like we migrated + * it before it went to sleep. This means on wakeup we make the + * previous cpu our targer instead of where it really is. + */ + p->wake_cpu = cpu; + } +} + +struct migration_swap_arg { + struct task_struct *src_task, *dst_task; + int src_cpu, dst_cpu; +}; + +static int migrate_swap_stop(void *data) +{ + struct migration_swap_arg *arg = data; + struct rq *src_rq, *dst_rq; + int ret = -EAGAIN; + + src_rq = cpu_rq(arg->src_cpu); + dst_rq = cpu_rq(arg->dst_cpu); + + double_raw_lock(&arg->src_task->pi_lock, + &arg->dst_task->pi_lock); + double_rq_lock(src_rq, dst_rq); + if (task_cpu(arg->dst_task) != arg->dst_cpu) + goto unlock; + + if (task_cpu(arg->src_task) != arg->src_cpu) + goto unlock; + + if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) + goto unlock; + + if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) + goto unlock; + + __migrate_swap_task(arg->src_task, arg->dst_cpu); + __migrate_swap_task(arg->dst_task, arg->src_cpu); + + ret = 0; + +unlock: + double_rq_unlock(src_rq, dst_rq); + raw_spin_unlock(&arg->dst_task->pi_lock); + raw_spin_unlock(&arg->src_task->pi_lock); + + return ret; +} + +/* + * Cross migrate two tasks + */ +int migrate_swap(struct task_struct *cur, struct task_struct *p) +{ + struct migration_swap_arg arg; + int ret = -EINVAL; + + arg = (struct migration_swap_arg){ + .src_task = cur, + .src_cpu = task_cpu(cur), + .dst_task = p, + .dst_cpu = task_cpu(p), + }; + + if (arg.src_cpu == arg.dst_cpu) + goto out; + + /* + * These three tests are all lockless; this is OK since all of them + * will be re-checked with proper locks held further down the line. + */ + if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) + goto out; + + if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) + goto out; + + if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) + goto out; + + trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); + ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); + +out: + return ret; +} + struct migration_arg { struct task_struct *task; int dest_cpu; @@ -1263,29 +1324,69 @@ EXPORT_SYMBOL_GPL(kick_process); */ static int select_fallback_rq(int cpu, struct task_struct *p) { + int nid = cpu_to_node(cpu); + const struct cpumask *nodemask = NULL; + enum { cpuset, possible, fail } state = cpuset; int dest_cpu; - const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); - - /* Look for allowed, online CPU in same node. */ - for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) - if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) - return dest_cpu; - /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); - if (dest_cpu < nr_cpu_ids) - return dest_cpu; - - /* No more Mr. Nice Guy. */ - dest_cpu = cpuset_cpus_allowed_fallback(p); /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. + * If the node that the cpu is on has been offlined, cpu_to_node() + * will return -1. There is no cpu on the node, and we should + * select the cpu on the other node. */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", - task_pid_nr(p), p->comm, cpu); + if (nid != -1) { + nodemask = cpumask_of_node(nid); + + /* Look for allowed, online CPU in same node. */ + for_each_cpu(dest_cpu, nodemask) { + if (!cpu_online(dest_cpu)) + continue; + if (!cpu_active(dest_cpu)) + continue; + if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + return dest_cpu; + } + } + + for (;;) { + /* Any allowed, online CPU? */ + for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { + if (!cpu_online(dest_cpu)) + continue; + if (!cpu_active(dest_cpu)) + continue; + goto out; + } + + switch (state) { + case cpuset: + /* No more Mr. Nice Guy. */ + cpuset_cpus_allowed_fallback(p); + state = possible; + break; + + case possible: + do_set_cpus_allowed(p, cpu_possible_mask); + state = fail; + break; + + case fail: + BUG(); + break; + } + } + +out: + if (state != cpuset) { + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk_deferred("process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); + } } return dest_cpu; @@ -1295,9 +1396,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ static inline -int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { - int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1379,8 +1480,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { - trace_sched_wakeup(p, true); check_preempt_curr(rq, p, wake_flags); + trace_sched_wakeup(p, true); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -1388,13 +1489,14 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) p->sched_class->task_woken(rq, p); if (rq->idle_stamp) { - u64 delta = rq->clock - rq->idle_stamp; - u64 max = 2*sysctl_sched_migration_cost; + u64 delta = rq_clock(rq) - rq->idle_stamp; + u64 max = 2*rq->max_idle_balance_cost; - if (delta > max) + update_avg(&rq->avg_idle, delta); + + if (rq->avg_idle > max) rq->avg_idle = max; - else - update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; } #endif @@ -1425,6 +1527,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) rq = __task_rq_lock(p); if (p->on_rq) { + /* check_preempt_curr() may use rq clock */ + update_rq_clock(rq); ttwu_do_wakeup(rq, p, wake_flags); ret = 1; } @@ -1434,13 +1538,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -static void sched_ttwu_pending(void) +void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); struct task_struct *p; + unsigned long flags; - raw_spin_lock(&rq->lock); + if (!llist) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); while (llist) { p = llist_entry(llist, struct task_struct, wake_entry); @@ -1448,12 +1556,21 @@ static void sched_ttwu_pending(void) ttwu_do_activate(rq, p, 0); } - raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&rq->lock, flags); } void scheduler_ipi(void) { - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) + /* + * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting + * TIF_NEED_RESCHED remotely (for the first time) will also send + * this IPI. + */ + preempt_fold_need_resched(); + + if (llist_empty(&this_rq()->wake_list) + && !tick_nohz_full_cpu(smp_processor_id()) + && !got_nohz_idle_kick()) return; /* @@ -1470,12 +1587,13 @@ void scheduler_ipi(void) * somewhat pessimize the simple resched case. */ irq_enter(); + tick_nohz_full_check(); sched_ttwu_pending(); /* * Check if someone kicked us for doing the nohz idle load balance. */ - if (unlikely(got_nohz_idle_kick() && !need_resched())) { + if (unlikely(got_nohz_idle_kick())) { this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); } @@ -1484,30 +1602,17 @@ void scheduler_ipi(void) static void ttwu_queue_remote(struct task_struct *p, int cpu) { - if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) - smp_send_reschedule(cpu); -} - -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW -static int ttwu_activate_remote(struct task_struct *p, int wake_flags) -{ - struct rq *rq; - int ret = 0; + struct rq *rq = cpu_rq(cpu); - rq = __task_rq_lock(p); - if (p->on_cpu) { - ttwu_activate(rq, p, ENQUEUE_WAKEUP); - ttwu_do_wakeup(rq, p, wake_flags); - ret = 1; + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { + if (!set_nr_if_polling(rq->idle)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } - __task_rq_unlock(rq); - - return ret; - } -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ -static inline int ttwu_share_cache(int this_cpu, int that_cpu) +bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } @@ -1518,7 +1623,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) struct rq *rq = cpu_rq(cpu); #if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { + if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { sched_clock_cpu(cpu); /* sync clocks x-cpu */ ttwu_queue_remote(p, cpu); return; @@ -1542,7 +1647,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) * the simpler "current->state = TASK_RUNNING" to mark yourself * runnable without the overhead of this. * - * Returns %true if @p was woken up, %false if it was already running + * Return: %true if @p was woken up, %false if it was already running. * or @state didn't match @p's state. */ static int @@ -1551,7 +1656,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) unsigned long flags; int cpu, success = 0; - smp_wmb(); + /* + * If we are going to wake up a thread waiting for CONDITION we + * need to ensure that CONDITION=1 done by the caller can not be + * reordered with p->state check below. This pairs with mb() in + * set_current_state() the waiting thread does. + */ + smp_mb__before_spinlock(); raw_spin_lock_irqsave(&p->pi_lock, flags); if (!(p->state & state)) goto out; @@ -1567,21 +1678,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * If the owning (remote) cpu is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. */ - while (p->on_cpu) { -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - /* - * In case the architecture enables interrupts in - * context_switch(), we cannot busy wait, since that - * would lead to deadlocks when an interrupt hits and - * tries to wake up @prev. So bail and do a complete - * remote wakeup. - */ - if (ttwu_activate_remote(p, wake_flags)) - goto stat; -#else + while (p->on_cpu) cpu_relax(); -#endif - } /* * Pairs with the smp_wmb() in finish_lock_switch(). */ @@ -1593,7 +1691,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (p->sched_class->task_waking) p->sched_class->task_waking(p); - cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); @@ -1621,8 +1719,10 @@ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - BUG_ON(rq != this_rq()); - BUG_ON(p == current); + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + lockdep_assert_held(&rq->lock); if (!raw_spin_trylock(&p->pi_lock)) { @@ -1648,15 +1748,17 @@ out: * @p: The process to be woken up. * * Attempt to wake up the nominated process and move it to the set of runnable - * processes. Returns 1 if the process was woken up, 0 if it was already - * running. + * processes. + * + * Return: 1 if the process was woken up, 0 if it was already running. * * It may be assumed that this function implies a write memory barrier before * changing the task state if and only if any tasks are woken up. */ int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_ALL, 0); + WARN_ON(task_is_stopped_or_traced(p)); + return try_to_wake_up(p, TASK_NORMAL, 0); } EXPORT_SYMBOL(wake_up_process); @@ -1671,7 +1773,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) * * __sched_fork() is basic setup used by init_idle() too: */ -static void __sched_fork(struct task_struct *p) +static void __sched_fork(unsigned long clone_flags, struct task_struct *p) { p->on_rq = 0; @@ -1687,22 +1789,94 @@ static void __sched_fork(struct task_struct *p) memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif + RB_CLEAR_NODE(&p->dl.rb_node); + hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + p->dl.dl_runtime = p->dl.runtime = 0; + p->dl.dl_deadline = p->dl.deadline = 0; + p->dl.dl_period = 0; + p->dl.flags = 0; + INIT_LIST_HEAD(&p->rt.run_list); #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif + +#ifdef CONFIG_NUMA_BALANCING + if (p->mm && atomic_read(&p->mm->mm_users) == 1) { + p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + p->mm->numa_scan_seq = 0; + } + + if (clone_flags & CLONE_VM) + p->numa_preferred_nid = current->numa_preferred_nid; + else + p->numa_preferred_nid = -1; + + p->node_stamp = 0ULL; + p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; + p->numa_work.next = &p->numa_work; + p->numa_faults_memory = NULL; + p->numa_faults_buffer_memory = NULL; + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; + + INIT_LIST_HEAD(&p->numa_entry); + p->numa_group = NULL; +#endif /* CONFIG_NUMA_BALANCING */ +} + +#ifdef CONFIG_NUMA_BALANCING +#ifdef CONFIG_SCHED_DEBUG +void set_numabalancing_state(bool enabled) +{ + if (enabled) + sched_feat_set("NUMA"); + else + sched_feat_set("NO_NUMA"); +} +#else +__read_mostly bool numabalancing_enabled; + +void set_numabalancing_state(bool enabled) +{ + numabalancing_enabled = enabled; } +#endif /* CONFIG_SCHED_DEBUG */ + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_numa_balancing(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = numabalancing_enabled; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_numabalancing_state(state); + return err; +} +#endif +#endif /* * fork()/clone()-time setup: */ -void sched_fork(struct task_struct *p) +int sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; int cpu = get_cpu(); - __sched_fork(p); + __sched_fork(clone_flags, p); /* * We mark the process as running here. This guarantees that * nobody will actually run it, and a signal or other external @@ -1719,7 +1893,7 @@ void sched_fork(struct task_struct *p) * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { - if (task_has_rt_policy(p)) { + if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->policy = SCHED_NORMAL; p->static_prio = NICE_TO_PRIO(0); p->rt_priority = 0; @@ -1736,8 +1910,14 @@ void sched_fork(struct task_struct *p) p->sched_reset_on_fork = 0; } - if (!rt_prio(p->prio)) + if (dl_prio(p->prio)) { + put_cpu(); + return -EAGAIN; + } else if (rt_prio(p->prio)) { + p->sched_class = &rt_sched_class; + } else { p->sched_class = &fair_sched_class; + } if (p->sched_class->task_fork) p->sched_class->task_fork(p); @@ -1760,18 +1940,128 @@ void sched_fork(struct task_struct *p) #if defined(CONFIG_SMP) p->on_cpu = 0; #endif -#ifdef CONFIG_PREEMPT_COUNT - /* Want to start with kernel preemption disabled. */ - task_thread_info(p)->preempt_count = 1; -#endif + init_task_preempt_count(p); #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); + RB_CLEAR_NODE(&p->pushable_dl_tasks); #endif put_cpu(); + return 0; +} + +unsigned long to_ratio(u64 period, u64 runtime) +{ + if (runtime == RUNTIME_INF) + return 1ULL << 20; + + /* + * Doing this here saves a lot of checks in all + * the calling paths, and returning zero seems + * safe for them anyway. + */ + if (period == 0) + return 0; + + return div64_u64(runtime << 20, period); +} + +#ifdef CONFIG_SMP +inline struct dl_bw *dl_bw_of(int i) +{ + return &cpu_rq(i)->rd->dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ + struct root_domain *rd = cpu_rq(i)->rd; + int cpus = 0; + + for_each_cpu_and(i, rd->span, cpu_active_mask) + cpus++; + + return cpus; +} +#else +inline struct dl_bw *dl_bw_of(int i) +{ + return &cpu_rq(i)->dl.dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ + return 1; +} +#endif + +static inline +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw -= tsk_bw; +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw += tsk_bw; +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; } /* + * We must be sure that accepting a new task (or allowing changing the + * parameters of an existing one) is consistent with the bandwidth + * constraints. If yes, this function also accordingly updates the currently + * allocated bandwidth to reflect the new situation. + * + * This function is called while holding p's rq->lock. + */ +static int dl_overflow(struct task_struct *p, int policy, + const struct sched_attr *attr) +{ + + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + u64 period = attr->sched_period ?: attr->sched_deadline; + u64 runtime = attr->sched_runtime; + u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; + int cpus, err = -1; + + if (new_bw == p->dl.dl_bw) + return 0; + + /* + * Either if a task, enters, leave, or stays -deadline but changes + * its parameters, we may need to update accordingly the total + * allocated bandwidth of the container. + */ + raw_spin_lock(&dl_b->lock); + cpus = dl_bw_cpus(task_cpu(p)); + if (dl_policy(policy) && !task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, 0, new_bw)) { + __dl_add(dl_b, new_bw); + err = 0; + } else if (dl_policy(policy) && task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + __dl_clear(dl_b, p->dl.dl_bw); + __dl_add(dl_b, new_bw); + err = 0; + } else if (!dl_policy(policy) && task_has_dl_policy(p)) { + __dl_clear(dl_b, p->dl.dl_bw); + err = 0; + } + raw_spin_unlock(&dl_b->lock); + + return err; +} + +extern void init_dl_bw(struct dl_bw *dl_b); + +/* * wake_up_new_task - wake up a newly created task for the first time. * * This function will do some initial scheduler statistics housekeeping @@ -1790,9 +2080,11 @@ void wake_up_new_task(struct task_struct *p) * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug */ - set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); + set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif + /* Initialize new task's runnable average */ + init_task_runnable_average(p); rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = 1; @@ -1832,9 +2124,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister); static void fire_sched_in_preempt_notifiers(struct task_struct *curr) { struct preempt_notifier *notifier; - struct hlist_node *node; - hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) notifier->ops->sched_in(notifier, raw_smp_processor_id()); } @@ -1843,9 +2134,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, struct task_struct *next) { struct preempt_notifier *notifier; - struct hlist_node *node; - hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) notifier->ops->sched_out(notifier, next); } @@ -1880,12 +2170,12 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - sched_info_switch(prev, next); + trace_sched_switch(prev, next); + sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); - trace_sched_switch(prev, next); } /** @@ -1923,21 +2213,19 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) * Manfred Spraul <manfred@colorfullife.com> */ prev_state = prev->state; + vtime_task_switch(prev); finish_arch_switch(prev); -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_disable(); -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ perf_event_task_sched_in(prev, current); -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); - trace_sched_stat_sleeptime(current, rq->clock); + finish_arch_post_lock_switch(); fire_sched_in_preempt_notifiers(current); if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); + /* * Remove function-return probe instances associated with this * task and put them back on the free list. @@ -1945,17 +2233,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) kprobe_flush_task(prev); put_task_struct(prev); } + + tick_nohz_task_switch(current); } #ifdef CONFIG_SMP -/* assumes rq->lock is held */ -static inline void pre_schedule(struct rq *rq, struct task_struct *prev) -{ - if (prev->sched_class->pre_schedule) - prev->sched_class->pre_schedule(rq, prev); -} - /* rq->lock is NOT held, but preemption is disabled */ static inline void post_schedule(struct rq *rq) { @@ -1973,10 +2256,6 @@ static inline void post_schedule(struct rq *rq) #else -static inline void pre_schedule(struct rq *rq, struct task_struct *p) -{ -} - static inline void post_schedule(struct rq *rq) { } @@ -1987,7 +2266,7 @@ static inline void post_schedule(struct rq *rq) * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. */ -asmlinkage void schedule_tail(struct task_struct *prev) +asmlinkage __visible void schedule_tail(struct task_struct *prev) __releases(rq->lock) { struct rq *rq = this_rq(); @@ -2050,6 +2329,7 @@ context_switch(struct rq *rq, struct task_struct *prev, spin_release(&rq->lock.dep_map, 1, _THIS_IP_); #endif + context_tracking_task_switch(prev, next); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); @@ -2063,11 +2343,10 @@ context_switch(struct rq *rq, struct task_struct *prev, } /* - * nr_running, nr_uninterruptible and nr_context_switches: + * nr_running and nr_context_switches: * * externally visible scheduler statistics: current number of runnable - * threads, current number of uninterruptible-sleeping threads, total - * number of context switches performed since bootup. + * threads, total number of context switches performed since bootup. */ unsigned long nr_running(void) { @@ -2079,23 +2358,6 @@ unsigned long nr_running(void) return sum; } -unsigned long nr_uninterruptible(void) -{ - unsigned long i, sum = 0; - - for_each_possible_cpu(i) - sum += cpu_rq(i)->nr_uninterruptible; - - /* - * Since we read the counters lockless, it might be slightly - * inaccurate. Do not allow it to go below zero though: - */ - if (unlikely((long)sum < 0)) - sum = 0; - - return sum; -} - unsigned long long nr_context_switches(void) { int i; @@ -2123,385 +2385,6 @@ unsigned long nr_iowait_cpu(int cpu) return atomic_read(&this->nr_iowait); } -unsigned long this_cpu_load(void) -{ - struct rq *this = this_rq(); - return this->cpu_load[0]; -} - - -/* Variables and functions for calc_load */ -static atomic_long_t calc_load_tasks; -static unsigned long calc_load_update; -unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); - -static long calc_load_fold_active(struct rq *this_rq) -{ - long nr_active, delta = 0; - - nr_active = this_rq->nr_running; - nr_active += (long) this_rq->nr_uninterruptible; - - if (nr_active != this_rq->calc_load_active) { - delta = nr_active - this_rq->calc_load_active; - this_rq->calc_load_active = nr_active; - } - - return delta; -} - -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - load *= exp; - load += active * (FIXED_1 - exp); - load += 1UL << (FSHIFT - 1); - return load >> FSHIFT; -} - -#ifdef CONFIG_NO_HZ -/* - * For NO_HZ we delay the active fold to the next LOAD_FREQ update. - * - * When making the ILB scale, we should try to pull this in as well. - */ -static atomic_long_t calc_load_tasks_idle; - -void calc_load_account_idle(struct rq *this_rq) -{ - long delta; - - delta = calc_load_fold_active(this_rq); - if (delta) - atomic_long_add(delta, &calc_load_tasks_idle); -} - -static long calc_load_fold_idle(void) -{ - long delta = 0; - - /* - * Its got a race, we don't care... - */ - if (atomic_long_read(&calc_load_tasks_idle)) - delta = atomic_long_xchg(&calc_load_tasks_idle, 0); - - return delta; -} - -/** - * fixed_power_int - compute: x^n, in O(log n) time - * - * @x: base of the power - * @frac_bits: fractional bits of @x - * @n: power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. - */ -static unsigned long -fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -{ - unsigned long result = 1UL << frac_bits; - - if (n) for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; - } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; - } - - return result; -} - -/* - * a1 = a0 * e + a * (1 - e) - * - * a2 = a1 * e + a * (1 - e) - * = (a0 * e + a * (1 - e)) * e + a * (1 - e) - * = a0 * e^2 + a * (1 - e) * (1 + e) - * - * a3 = a2 * e + a * (1 - e) - * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) - * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) - * - * ... - * - * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] - * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) - * = a0 * e^n + a * (1 - e^n) - * - * [1] application of the geometric series: - * - * n 1 - x^(n+1) - * S_n := \Sum x^i = ------------- - * i=0 1 - x - */ -static unsigned long -calc_load_n(unsigned long load, unsigned long exp, - unsigned long active, unsigned int n) -{ - - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -} - -/* - * NO_HZ can leave us missing all per-cpu ticks calling - * calc_load_account_active(), but since an idle CPU folds its delta into - * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold - * in the pending idle delta if our idle period crossed a load cycle boundary. - * - * Once we've updated the global active value, we need to apply the exponential - * weights adjusted to the number of cycles missed. - */ -static void calc_global_nohz(unsigned long ticks) -{ - long delta, active, n; - - if (time_before(jiffies, calc_load_update)) - return; - - /* - * If we crossed a calc_load_update boundary, make sure to fold - * any pending idle changes, the respective CPUs might have - * missed the tick driven calc_load_account_active() update - * due to NO_HZ. - */ - delta = calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - /* - * If we were idle for multiple load cycles, apply them. - */ - if (ticks >= LOAD_FREQ) { - n = ticks / LOAD_FREQ; - - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; - - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - - calc_load_update += n * LOAD_FREQ; - } - - /* - * Its possible the remainder of the above division also crosses - * a LOAD_FREQ period, the regular check in calc_global_load() - * which comes after this will take care of that. - * - * Consider us being 11 ticks before a cycle completion, and us - * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will - * age us 4 cycles, and the test in calc_global_load() will - * pick up the final one. - */ -} -#else -void calc_load_account_idle(struct rq *this_rq) -{ -} - -static inline long calc_load_fold_idle(void) -{ - return 0; -} - -static void calc_global_nohz(unsigned long ticks) -{ -} -#endif - -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} - -/* - * calc_load - update the avenrun load estimates 10 ticks after the - * CPUs have updated calc_load_tasks. - */ -void calc_global_load(unsigned long ticks) -{ - long active; - - calc_global_nohz(ticks); - - if (time_before(jiffies, calc_load_update + 10)) - return; - - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; - - avenrun[0] = calc_load(avenrun[0], EXP_1, active); - avenrun[1] = calc_load(avenrun[1], EXP_5, active); - avenrun[2] = calc_load(avenrun[2], EXP_15, active); - - calc_load_update += LOAD_FREQ; -} - -/* - * Called from update_cpu_load() to periodically update this CPU's - * active count. - */ -static void calc_load_account_active(struct rq *this_rq) -{ - long delta; - - if (time_before(jiffies, this_rq->calc_load_update)) - return; - - delta = calc_load_fold_active(this_rq); - delta += calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - this_rq->calc_load_update += LOAD_FREQ; -} - -/* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load - * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load - * - * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load - * - * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. - */ -#define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; - -/* - * Update cpu_load for any missed ticks, due to tickless idle. The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. - */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ - int j = 0; - - if (!missed_updates) - return load; - - if (missed_updates >= degrade_zero_ticks[idx]) - return 0; - - if (idx == 1) - return load >> missed_updates; - - while (missed_updates) { - if (missed_updates % 2) - load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - - missed_updates >>= 1; - j++; - } - return load; -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. - */ -void update_cpu_load(struct rq *this_rq) -{ - unsigned long this_load = this_rq->load.weight; - unsigned long curr_jiffies = jiffies; - unsigned long pending_updates; - int i, scale; - - this_rq->nr_load_updates++; - - /* Avoid repeated calls on same jiffy, when moving in and out of idle */ - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - /* Update our load: */ - this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ - for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - old_load = decay_load_missed(old_load, pending_updates - 1, i); - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale - 1; - - this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; - } - - sched_avg_update(this_rq); -} - -static void update_cpu_load_active(struct rq *this_rq) -{ - update_cpu_load(this_rq); - - calc_load_account_active(this_rq); -} - #ifdef CONFIG_SMP /* @@ -2515,7 +2398,7 @@ void sched_exec(void) int dest_cpu; raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); if (dest_cpu == smp_processor_id()) goto unlock; @@ -2550,7 +2433,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) if (task_current(rq, p)) { update_rq_clock(rq); - ns = rq->clock_task - p->se.exec_start; + ns = rq_clock_task(rq) - p->se.exec_start; if ((s64)ns < 0) ns = 0; } @@ -2582,404 +2465,26 @@ unsigned long long task_sched_runtime(struct task_struct *p) struct rq *rq; u64 ns = 0; - rq = task_rq_lock(p, &flags); - ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); - task_rq_unlock(rq, p, &flags); - - return ns; -} - -#ifdef CONFIG_CGROUP_CPUACCT -struct cgroup_subsys cpuacct_subsys; -struct cpuacct root_cpuacct; -#endif - -static inline void task_group_account_field(struct task_struct *p, int index, - u64 tmp) -{ -#ifdef CONFIG_CGROUP_CPUACCT - struct kernel_cpustat *kcpustat; - struct cpuacct *ca; -#endif +#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) /* - * Since all updates are sure to touch the root cgroup, we - * get ourselves ahead and touch it first. If the root cgroup - * is the only cgroup, then nothing else should be necessary. + * 64-bit doesn't need locks to atomically read a 64bit value. + * So we have a optimization chance when the task's delta_exec is 0. + * Reading ->on_cpu is racy, but this is ok. * + * If we race with it leaving cpu, we'll take a lock. So we're correct. + * If we race with it entering cpu, unaccounted time is 0. This is + * indistinguishable from the read occurring a few cycles earlier. */ - __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; - -#ifdef CONFIG_CGROUP_CPUACCT - if (unlikely(!cpuacct_subsys.active)) - return; - - rcu_read_lock(); - ca = task_ca(p); - while (ca && (ca != &root_cpuacct)) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += tmp; - ca = parent_ca(ca); - } - rcu_read_unlock(); + if (!p->on_cpu) + return p->se.sum_exec_runtime; #endif -} - - -/* - * Account user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -void account_user_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ - int index; - - /* Add user time to process. */ - p->utime += cputime; - p->utimescaled += cputime_scaled; - account_group_user_time(p, cputime); - - index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; - - /* Add user time to cpustat. */ - task_group_account_field(p, index, (__force u64) cputime); - - /* Account for user time used */ - acct_update_integrals(p); -} - -/* - * Account guest cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in virtual machine since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -static void account_guest_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - /* Add guest time to process. */ - p->utime += cputime; - p->utimescaled += cputime_scaled; - account_group_user_time(p, cputime); - p->gtime += cputime; - - /* Add guest time to cpustat. */ - if (TASK_NICE(p) > 0) { - cpustat[CPUTIME_NICE] += (__force u64) cputime; - cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; - } else { - cpustat[CPUTIME_USER] += (__force u64) cputime; - cpustat[CPUTIME_GUEST] += (__force u64) cputime; - } -} - -/* - * Account system cpu time to a process and desired cpustat field - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * @target_cputime64: pointer to cpustat field that has to be updated - */ -static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled, int index) -{ - /* Add system time to process. */ - p->stime += cputime; - p->stimescaled += cputime_scaled; - account_group_system_time(p, cputime); - - /* Add system time to cpustat. */ - task_group_account_field(p, index, (__force u64) cputime); - - /* Account for system time used */ - acct_update_integrals(p); -} - -/* - * Account system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime, cputime_t cputime_scaled) -{ - int index; - - if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime, cputime_scaled); - return; - } - - if (hardirq_count() - hardirq_offset) - index = CPUTIME_IRQ; - else if (in_serving_softirq()) - index = CPUTIME_SOFTIRQ; - else - index = CPUTIME_SYSTEM; - - __account_system_time(p, cputime, cputime_scaled, index); -} - -/* - * Account for involuntary wait time. - * @cputime: the cpu time spent in involuntary wait - */ -void account_steal_time(cputime_t cputime) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - cpustat[CPUTIME_STEAL] += (__force u64) cputime; -} - -/* - * Account for idle time. - * @cputime: the cpu time spent in idle wait - */ -void account_idle_time(cputime_t cputime) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - struct rq *rq = this_rq(); - - if (atomic_read(&rq->nr_iowait) > 0) - cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; - else - cpustat[CPUTIME_IDLE] += (__force u64) cputime; -} - -static __always_inline bool steal_account_process_tick(void) -{ -#ifdef CONFIG_PARAVIRT - if (static_branch(¶virt_steal_enabled)) { - u64 steal, st = 0; - - steal = paravirt_steal_clock(smp_processor_id()); - steal -= this_rq()->prev_steal_time; - - st = steal_ticks(steal); - this_rq()->prev_steal_time += st * TICK_NSEC; - - account_steal_time(st); - return st; - } -#endif - return false; -} - -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING -/* - * Account a tick to a process and cpustat - * @p: the process that the cpu time gets accounted to - * @user_tick: is the tick from userspace - * @rq: the pointer to rq - * - * Tick demultiplexing follows the order - * - pending hardirq update - * - pending softirq update - * - user_time - * - idle_time - * - system time - * - check for guest_time - * - else account as system_time - * - * Check for hardirq is done both for system and user time as there is - * no timer going off while we are on hardirq and hence we may never get an - * opportunity to update it solely in system time. - * p->stime and friends are only updated on system time and not on irq - * softirq as those do not count in task exec_runtime any more. - */ -static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) -{ - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - u64 *cpustat = kcpustat_this_cpu->cpustat; - - if (steal_account_process_tick()) - return; - - if (irqtime_account_hi_update()) { - cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; - } else if (irqtime_account_si_update()) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; - } else if (this_cpu_ksoftirqd() == p) { - /* - * ksoftirqd time do not get accounted in cpu_softirq_time. - * So, we have to handle it separately here. - * Also, p->stime needs to be updated for ksoftirqd. - */ - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SOFTIRQ); - } else if (user_tick) { - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); - } else if (p == rq->idle) { - account_idle_time(cputime_one_jiffy); - } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); - } else { - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SYSTEM); - } -} - -static void irqtime_account_idle_ticks(int ticks) -{ - int i; - struct rq *rq = this_rq(); - - for (i = 0; i < ticks; i++) - irqtime_account_process_tick(current, 0, rq); -} -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ -static void irqtime_account_idle_ticks(int ticks) {} -static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) {} -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - -/* - * Account a single tick of cpu time. - * @p: the process that the cpu time gets accounted to - * @user_tick: indicates if the tick is a user or a system tick - */ -void account_process_tick(struct task_struct *p, int user_tick) -{ - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - struct rq *rq = this_rq(); - - if (sched_clock_irqtime) { - irqtime_account_process_tick(p, user_tick, rq); - return; - } - - if (steal_account_process_tick()) - return; - - if (user_tick) - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); - else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, - one_jiffy_scaled); - else - account_idle_time(cputime_one_jiffy); -} - -/* - * Account multiple ticks of steal time. - * @p: the process from which the cpu time has been stolen - * @ticks: number of stolen ticks - */ -void account_steal_ticks(unsigned long ticks) -{ - account_steal_time(jiffies_to_cputime(ticks)); -} - -/* - * Account multiple ticks of idle time. - * @ticks: number of stolen ticks - */ -void account_idle_ticks(unsigned long ticks) -{ - - if (sched_clock_irqtime) { - irqtime_account_idle_ticks(ticks); - return; - } - - account_idle_time(jiffies_to_cputime(ticks)); -} - -#endif - -/* - * Use precise platform statistics if available: - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - *ut = p->utime; - *st = p->stime; -} - -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime; - - thread_group_cputime(p, &cputime); - - *ut = cputime.utime; - *st = cputime.stime; -} -#else -#ifndef nsecs_to_cputime -# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) -#endif - -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - cputime_t rtime, utime = p->utime, total = utime + p->stime; - - /* - * Use CFS's precise accounting: - */ - rtime = nsecs_to_cputime(p->se.sum_exec_runtime); - - if (total) { - u64 temp = (__force u64) rtime; - - temp *= (__force u64) utime; - do_div(temp, (__force u32) total); - utime = (__force cputime_t) temp; - } else - utime = rtime; - - /* - * Compare with previous values, to keep monotonicity: - */ - p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); - - *ut = p->prev_utime; - *st = p->prev_stime; -} - -/* - * Must be called with siglock held. - */ -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct signal_struct *sig = p->signal; - struct task_cputime cputime; - cputime_t rtime, utime, total; - - thread_group_cputime(p, &cputime); - - total = cputime.utime + cputime.stime; - rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - - if (total) { - u64 temp = (__force u64) rtime; - - temp *= (__force u64) cputime.utime; - do_div(temp, (__force u32) total); - utime = (__force cputime_t) temp; - } else - utime = rtime; - - sig->prev_utime = max(sig->prev_utime, utime); - sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); + rq = task_rq_lock(p, &flags); + ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); + task_rq_unlock(rq, p, &flags); - *ut = sig->prev_utime; - *st = sig->prev_stime; + return ns; } -#endif /* * This function gets called by the timer code, with HZ frequency. @@ -2995,18 +2500,47 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); update_rq_clock(rq); - update_cpu_load_active(rq); curr->sched_class->task_tick(rq, curr, 0); + update_cpu_load_active(rq); raw_spin_unlock(&rq->lock); perf_event_task_tick(); #ifdef CONFIG_SMP rq->idle_balance = idle_cpu(cpu); - trigger_load_balance(rq, cpu); + trigger_load_balance(rq); #endif + rq_last_tick_reset(rq); } +#ifdef CONFIG_NO_HZ_FULL +/** + * scheduler_tick_max_deferment + * + * Keep at least one tick per second when a single + * active task is running because the scheduler doesn't + * yet completely support full dynticks environment. + * + * This makes sure that uptime, CFS vruntime, load + * balancing, etc... continue to move forward, even + * with a very low granularity. + * + * Return: Maximum deferment in nanoseconds. + */ +u64 scheduler_tick_max_deferment(void) +{ + struct rq *rq = this_rq(); + unsigned long next, now = ACCESS_ONCE(jiffies); + + next = rq->last_sched_tick + HZ; + + if (time_before_eq(next, now)) + return 0; + + return jiffies_to_nsecs(next - now); +} +#endif + notrace unsigned long get_parent_ip(unsigned long addr) { if (in_lock_functions(addr)) { @@ -3020,7 +2554,7 @@ notrace unsigned long get_parent_ip(unsigned long addr) #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) -void __kprobes add_preempt_count(int val) +void preempt_count_add(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -3029,7 +2563,7 @@ void __kprobes add_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; #endif - preempt_count() += val; + __preempt_count_add(val); #ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? @@ -3037,12 +2571,18 @@ void __kprobes add_preempt_count(int val) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + if (preempt_count() == val) { + unsigned long ip = get_parent_ip(CALLER_ADDR1); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } } -EXPORT_SYMBOL(add_preempt_count); +EXPORT_SYMBOL(preempt_count_add); +NOKPROBE_SYMBOL(preempt_count_add); -void __kprobes sub_preempt_count(int val) +void preempt_count_sub(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -3060,9 +2600,10 @@ void __kprobes sub_preempt_count(int val) if (preempt_count() == val) trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); - preempt_count() -= val; + __preempt_count_sub(val); } -EXPORT_SYMBOL(sub_preempt_count); +EXPORT_SYMBOL(preempt_count_sub); +NOKPROBE_SYMBOL(preempt_count_sub); #endif @@ -3071,8 +2612,6 @@ EXPORT_SYMBOL(sub_preempt_count); */ static noinline void __schedule_bug(struct task_struct *prev) { - struct pt_regs *regs = get_irq_regs(); - if (oops_in_progress) return; @@ -3083,11 +2622,15 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); - - if (regs) - show_regs(regs); - else - dump_stack(); +#ifdef CONFIG_DEBUG_PREEMPT + if (in_atomic_preempt_off()) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } /* @@ -3097,10 +2640,10 @@ static inline void schedule_debug(struct task_struct *prev) { /* * Test if we are atomic. Since do_exit() needs to call into - * schedule() atomically, we ignore that path for now. - * Otherwise, whine if we are scheduling when we should not be. + * schedule() atomically, we ignore that path. Otherwise whine + * if we are scheduling when we should not. */ - if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) + if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) __schedule_bug(prev); rcu_sleep_check(); @@ -3109,36 +2652,40 @@ static inline void schedule_debug(struct task_struct *prev) schedstat_inc(this_rq(), sched_count); } -static void put_prev_task(struct rq *rq, struct task_struct *prev) -{ - if (prev->on_rq || rq->skip_clock_update < 0) - update_rq_clock(rq); - prev->sched_class->put_prev_task(rq, prev); -} - /* * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq) +pick_next_task(struct rq *rq, struct task_struct *prev) { - const struct sched_class *class; + const struct sched_class *class = &fair_sched_class; struct task_struct *p; /* * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ - if (likely(rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq); - if (likely(p)) - return p; + if (likely(prev->sched_class == class && + rq->nr_running == rq->cfs.h_nr_running)) { + p = fair_sched_class.pick_next_task(rq, prev); + if (unlikely(p == RETRY_TASK)) + goto again; + + /* assumes fair_sched_class->next == idle_sched_class */ + if (unlikely(!p)) + p = idle_sched_class.pick_next_task(rq, prev); + + return p; } +again: for_each_class(class) { - p = class->pick_next_task(rq); - if (p) + p = class->pick_next_task(rq, prev); + if (p) { + if (unlikely(p == RETRY_TASK)) + goto again; return p; + } } BUG(); /* the idle class will always have a runnable task */ @@ -3146,6 +2693,40 @@ pick_next_task(struct rq *rq) /* * __schedule() is the main scheduler function. + * + * The main means of driving the scheduler and thus entering this function are: + * + * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. + * + * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return + * paths. For example, see arch/x86/entry_64.S. + * + * To drive preemption between tasks, the scheduler sets the flag in timer + * interrupt handler scheduler_tick(). + * + * 3. Wakeups don't really cause entry into schedule(). They add a + * task to the run-queue and that's it. + * + * Now, if the new task added to the run-queue preempts the current + * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets + * called on the nearest possible occasion: + * + * - If the kernel is preemptible (CONFIG_PREEMPT=y): + * + * - in syscall or exception context, at the next outmost + * preempt_enable(). (this might be as soon as the wake_up()'s + * spin_unlock()!) + * + * - in IRQ context, return from interrupt-handler to + * preemptible context + * + * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) + * then at the next: + * + * - cond_resched() call + * - explicit schedule() call + * - return from syscall or exception to user-space + * - return from interrupt-handler to user-space */ static void __sched __schedule(void) { @@ -3166,6 +2747,12 @@ need_resched: if (sched_feat(HRTICK)) hrtick_clear(rq); + /* + * Make sure that signal_pending_state()->signal_pending() below + * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) + * done by the caller to avoid the race with signal_wake_up(). + */ + smp_mb__before_spinlock(); raw_spin_lock_irq(&rq->lock); switch_count = &prev->nivcsw; @@ -3192,14 +2779,12 @@ need_resched: switch_count = &prev->nvcsw; } - pre_schedule(rq, prev); - - if (unlikely(!rq->nr_running)) - idle_balance(cpu, rq); + if (prev->on_rq || rq->skip_clock_update < 0) + update_rq_clock(rq); - put_prev_task(rq, prev); - next = pick_next_task(rq); + next = pick_next_task(rq, prev); clear_tsk_need_resched(prev); + clear_preempt_need_resched(); rq->skip_clock_update = 0; if (likely(prev != next)) { @@ -3221,14 +2806,14 @@ need_resched: post_schedule(rq); - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); if (need_resched()) goto need_resched; } static inline void sched_submit_work(struct task_struct *tsk) { - if (!tsk->state) + if (!tsk->state || tsk_is_pi_blocked(tsk)) return; /* * If we are going to sleep and we have plugged IO queued, @@ -3238,7 +2823,7 @@ static inline void sched_submit_work(struct task_struct *tsk) blk_schedule_flush_plug(tsk); } -asmlinkage void __sched schedule(void) +asmlinkage __visible void __sched schedule(void) { struct task_struct *tsk = current; @@ -3247,50 +2832,32 @@ asmlinkage void __sched schedule(void) } EXPORT_SYMBOL(schedule); -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - -static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +#ifdef CONFIG_CONTEXT_TRACKING +asmlinkage __visible void __sched schedule_user(void) { - if (lock->owner != owner) - return false; - /* - * Ensure we emit the owner->on_cpu, dereference _after_ checking - * lock->owner still matches owner, if that fails, owner might - * point to free()d memory, if it still matches, the rcu_read_lock() - * ensures the memory stays valid. + * If we come here after a random call to set_need_resched(), + * or we have been woken up remotely but the IPI has not yet arrived, + * we haven't yet exited the RCU idle mode. Do it here manually until + * we find a better solution. */ - barrier(); - - return owner->on_cpu; + user_exit(); + schedule(); + user_enter(); } +#endif -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. +/** + * schedule_preempt_disabled - called with preemption disabled + * + * Returns with preemption disabled. Note: preempt_count must be 1 */ -int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +void __sched schedule_preempt_disabled(void) { - if (!sched_feat(OWNER_SPIN)) - return 0; - - rcu_read_lock(); - while (owner_running(lock, owner)) { - if (need_resched()) - break; - - arch_mutex_cpu_relax(); - } - rcu_read_unlock(); - - /* - * We break out the loop above on need_resched() and when the - * owner changed, which is a sign for heavy contention. Return - * success only when lock->owner is NULL. - */ - return lock->owner == NULL; + sched_preempt_enable_no_resched(); + schedule(); + preempt_disable(); } -#endif #ifdef CONFIG_PREEMPT /* @@ -3298,21 +2865,19 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ -asmlinkage void __sched notrace preempt_schedule(void) +asmlinkage __visible void __sched notrace preempt_schedule(void) { - struct thread_info *ti = current_thread_info(); - /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. */ - if (likely(ti->preempt_count || irqs_disabled())) + if (likely(!preemptible())) return; do { - add_preempt_count_notrace(PREEMPT_ACTIVE); + __preempt_count_add(PREEMPT_ACTIVE); __schedule(); - sub_preempt_count_notrace(PREEMPT_ACTIVE); + __preempt_count_sub(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -3321,7 +2886,9 @@ asmlinkage void __sched notrace preempt_schedule(void) barrier(); } while (need_resched()); } +NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); +#endif /* CONFIG_PREEMPT */ /* * this is the entry point to schedule() from kernel preemption @@ -3329,19 +2896,21 @@ EXPORT_SYMBOL(preempt_schedule); * Note, that this is called and return with irqs disabled. This will * protect us against recursive calling from irq. */ -asmlinkage void __sched preempt_schedule_irq(void) +asmlinkage __visible void __sched preempt_schedule_irq(void) { - struct thread_info *ti = current_thread_info(); + enum ctx_state prev_state; /* Catch callers which need to be fixed */ - BUG_ON(ti->preempt_count || !irqs_disabled()); + BUG_ON(preempt_count() || !irqs_disabled()); + + prev_state = exception_enter(); do { - add_preempt_count(PREEMPT_ACTIVE); + __preempt_count_add(PREEMPT_ACTIVE); local_irq_enable(); __schedule(); local_irq_disable(); - sub_preempt_count(PREEMPT_ACTIVE); + __preempt_count_sub(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -3349,9 +2918,9 @@ asmlinkage void __sched preempt_schedule_irq(void) */ barrier(); } while (need_resched()); -} -#endif /* CONFIG_PREEMPT */ + exception_exit(prev_state); +} int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) @@ -3360,392 +2929,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, } EXPORT_SYMBOL(default_wake_function); -/* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just - * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. - * - * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns - * zero in this (rare) case, and we handle it by continuing to scan the queue. - */ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int wake_flags, void *key) -{ - wait_queue_t *curr, *next; - - list_for_each_entry_safe(curr, next, &q->task_list, task_list) { - unsigned flags = curr->flags; - - if (curr->func(curr, mode, wake_flags, key) && - (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) - break; - } -} - -/** - * __wake_up - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: is directly passed to the wakeup function - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void __wake_up(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(__wake_up); - -/* - * Same as __wake_up but called with the spinlock in wait_queue_head_t held. - */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) -{ - __wake_up_common(q, mode, 1, 0, NULL); -} -EXPORT_SYMBOL_GPL(__wake_up_locked); - -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) -{ - __wake_up_common(q, mode, 1, 0, key); -} -EXPORT_SYMBOL_GPL(__wake_up_locked_key); - -/** - * __wake_up_sync_key - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: opaque value to be passed to wakeup targets - * - * The sync wakeup differs that the waker knows that it will schedule - * away soon, so while the target thread will be woken up, it will not - * be migrated to another CPU - ie. the two threads are 'synchronized' - * with each other. This can prevent needless bouncing between CPUs. - * - * On UP it can prevent extra preemption. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) -{ - unsigned long flags; - int wake_flags = WF_SYNC; - - if (unlikely(!q)) - return; - - if (unlikely(!nr_exclusive)) - wake_flags = 0; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, wake_flags, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(__wake_up_sync_key); - -/* - * __wake_up_sync - see __wake_up_sync_key() - */ -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) -{ - __wake_up_sync_key(q, mode, nr_exclusive, NULL); -} -EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ - -/** - * complete: - signals a single thread waiting on this completion - * @x: holds the state of this particular completion - * - * This will wake up a single thread waiting on this completion. Threads will be - * awakened in the same order in which they were queued. - * - * See also complete_all(), wait_for_completion() and related routines. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void complete(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done++; - __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete); - -/** - * complete_all: - signals all threads waiting on this completion - * @x: holds the state of this particular completion - * - * This will wake up all threads waiting on this particular completion event. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void complete_all(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done += UINT_MAX/2; - __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete_all); - -static inline long __sched -do_wait_for_common(struct completion *x, long timeout, int state) -{ - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - __add_wait_queue_tail_exclusive(&x->wait, &wait); - do { - if (signal_pending_state(state, current)) { - timeout = -ERESTARTSYS; - break; - } - __set_current_state(state); - spin_unlock_irq(&x->wait.lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&x->wait.lock); - } while (!x->done && timeout); - __remove_wait_queue(&x->wait, &wait); - if (!x->done) - return timeout; - } - x->done--; - return timeout ?: 1; -} - -static long __sched -wait_for_common(struct completion *x, long timeout, int state) -{ - might_sleep(); - - spin_lock_irq(&x->wait.lock); - timeout = do_wait_for_common(x, timeout, state); - spin_unlock_irq(&x->wait.lock); - return timeout; -} - -/** - * wait_for_completion: - waits for completion of a task - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It is NOT - * interruptible and there is no timeout. - * - * See also similar routines (i.e. wait_for_completion_timeout()) with timeout - * and interrupt capability. Also see complete(). - */ -void __sched wait_for_completion(struct completion *x) -{ - wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion); - -/** - * wait_for_completion_timeout: - waits for completion of a task (w/timeout) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. - * - * The return value is 0 if timed out, and positive (at least 1, or number of - * jiffies left till timeout) if completed. - */ -unsigned long __sched -wait_for_completion_timeout(struct completion *x, unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_timeout); - -/** - * wait_for_completion_interruptible: - waits for completion of a task (w/intr) - * @x: holds the state of this particular completion - * - * This waits for completion of a specific task to be signaled. It is - * interruptible. - * - * The return value is -ERESTARTSYS if interrupted, 0 if completed. - */ -int __sched wait_for_completion_interruptible(struct completion *x) -{ - long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); - if (t == -ERESTARTSYS) - return t; - return 0; -} -EXPORT_SYMBOL(wait_for_completion_interruptible); - -/** - * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. It is interruptible. The timeout is in jiffies. - * - * The return value is -ERESTARTSYS if interrupted, 0 if timed out, - * positive (at least 1, or number of jiffies left till timeout) if completed. - */ -long __sched -wait_for_completion_interruptible_timeout(struct completion *x, - unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); - -/** - * wait_for_completion_killable: - waits for completion of a task (killable) - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It can be - * interrupted by a kill signal. - * - * The return value is -ERESTARTSYS if interrupted, 0 if completed. - */ -int __sched wait_for_completion_killable(struct completion *x) -{ - long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); - if (t == -ERESTARTSYS) - return t; - return 0; -} -EXPORT_SYMBOL(wait_for_completion_killable); - -/** - * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be - * signaled or for a specified timeout to expire. It can be - * interrupted by a kill signal. The timeout is in jiffies. - * - * The return value is -ERESTARTSYS if interrupted, 0 if timed out, - * positive (at least 1, or number of jiffies left till timeout) if completed. - */ -long __sched -wait_for_completion_killable_timeout(struct completion *x, - unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_KILLABLE); -} -EXPORT_SYMBOL(wait_for_completion_killable_timeout); - -/** - * try_wait_for_completion - try to decrement a completion without blocking - * @x: completion structure - * - * Returns: 0 if a decrement cannot be done without blocking - * 1 if a decrement succeeded. - * - * If a completion is being used as a counting completion, - * attempt to decrement the counter without blocking. This - * enables us to avoid waiting if the resource the completion - * is protecting is not available. - */ -bool try_wait_for_completion(struct completion *x) -{ - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&x->wait.lock, flags); - if (!x->done) - ret = 0; - else - x->done--; - spin_unlock_irqrestore(&x->wait.lock, flags); - return ret; -} -EXPORT_SYMBOL(try_wait_for_completion); - -/** - * completion_done - Test to see if a completion has any waiters - * @x: completion structure - * - * Returns: 0 if there are waiters (wait_for_completion() in progress) - * 1 if there are no waiters. - * - */ -bool completion_done(struct completion *x) -{ - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&x->wait.lock, flags); - if (!x->done) - ret = 0; - spin_unlock_irqrestore(&x->wait.lock, flags); - return ret; -} -EXPORT_SYMBOL(completion_done); - -static long __sched -sleep_on_common(wait_queue_head_t *q, int state, long timeout) -{ - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); - - __set_current_state(state); - - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, &wait); - spin_unlock(&q->lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&q->lock); - __remove_wait_queue(q, &wait); - spin_unlock_irqrestore(&q->lock, flags); - - return timeout; -} - -void __sched interruptible_sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(interruptible_sleep_on); - -long __sched -interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(interruptible_sleep_on_timeout); - -void __sched sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(sleep_on); - -long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(sleep_on_timeout); - #ifdef CONFIG_RT_MUTEXES /* @@ -3756,19 +2939,39 @@ EXPORT_SYMBOL(sleep_on_timeout); * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). * - * Used by the rt_mutex code to implement priority inheritance logic. + * Used by the rt_mutex code to implement priority inheritance + * logic. Call site only calls if the priority of the task changed. */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, on_rq, running; + int oldprio, on_rq, running, enqueue_flag = 0; struct rq *rq; const struct sched_class *prev_class; - BUG_ON(prio < 0 || prio > MAX_PRIO); + BUG_ON(prio > MAX_PRIO); rq = __task_rq_lock(p); + /* + * Idle task boosting is a nono in general. There is one + * exception, when PREEMPT_RT and NOHZ is active: + * + * The idle task calls get_next_timer_interrupt() and holds + * the timer wheel base->lock on the CPU and another CPU wants + * to access the timer (probably to cancel it). We can safely + * ignore the boosting request, as the idle CPU runs this code + * with interrupts disabled and will complete the lock + * protected section without being interrupted. So there is no + * real need to boost. + */ + if (unlikely(p == rq->idle)) { + WARN_ON(p != rq->curr); + WARN_ON(p->pi_blocked_on); + goto out_unlock; + } + trace_sched_pi_setprio(p, prio); + p->pi_top_task = rt_mutex_get_top_task(p); oldprio = p->prio; prev_class = p->sched_class; on_rq = p->on_rq; @@ -3778,22 +2981,47 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->put_prev_task(rq, p); - if (rt_prio(prio)) + /* + * Boosting condition are: + * 1. -rt task is running and holds mutex A + * --> -dl task blocks on mutex A + * + * 2. -dl task is running and holds mutex A + * --> -dl task blocks on mutex A and could preempt the + * running task + */ + if (dl_prio(prio)) { + if (!dl_prio(p->normal_prio) || (p->pi_top_task && + dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { + p->dl.dl_boosted = 1; + p->dl.dl_throttled = 0; + enqueue_flag = ENQUEUE_REPLENISH; + } else + p->dl.dl_boosted = 0; + p->sched_class = &dl_sched_class; + } else if (rt_prio(prio)) { + if (dl_prio(oldprio)) + p->dl.dl_boosted = 0; + if (oldprio < prio) + enqueue_flag = ENQUEUE_HEAD; p->sched_class = &rt_sched_class; - else + } else { + if (dl_prio(oldprio)) + p->dl.dl_boosted = 0; p->sched_class = &fair_sched_class; + } p->prio = prio; if (running) p->sched_class->set_curr_task(rq); if (on_rq) - enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); + enqueue_task(rq, p, enqueue_flag); check_class_changed(rq, p, prev_class, oldprio); +out_unlock: __task_rq_unlock(rq); } - #endif void set_user_nice(struct task_struct *p, long nice) @@ -3802,7 +3030,7 @@ void set_user_nice(struct task_struct *p, long nice) unsigned long flags; struct rq *rq; - if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) return; /* * We have to be careful, if called from sys_setpriority(), @@ -3813,9 +3041,9 @@ void set_user_nice(struct task_struct *p, long nice) * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected * it wont have any effect on scheduling until the task is - * SCHED_FIFO/SCHED_RR: + * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: */ - if (task_has_rt_policy(p)) { + if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } @@ -3851,7 +3079,7 @@ EXPORT_SYMBOL(set_user_nice); int can_nice(const struct task_struct *p, const int nice) { /* convert nice value [19,-20] to rlimit style value [1,40] */ - int nice_rlim = 20 - nice; + int nice_rlim = nice_to_rlimit(nice); return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || capable(CAP_SYS_NICE)); @@ -3875,17 +3103,10 @@ SYSCALL_DEFINE1(nice, int, increment) * We don't have to worry. Conceptually one call occurs first * and we have a single winner. */ - if (increment < -40) - increment = -40; - if (increment > 40) - increment = 40; - - nice = TASK_NICE(current) + increment; - if (nice < -20) - nice = -20; - if (nice > 19) - nice = 19; + increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); + nice = task_nice(current) + increment; + nice = clamp_val(nice, MIN_NICE, MAX_NICE); if (increment < 0 && !can_nice(current, nice)) return -EPERM; @@ -3903,7 +3124,7 @@ SYSCALL_DEFINE1(nice, int, increment) * task_prio - return the priority value of a given task. * @p: the task in question. * - * This is the priority value as seen by users in /proc. + * Return: The priority value as seen by users in /proc. * RT tasks are offset by -200. Normal tasks are centered * around 0, value goes from -16 to +15. */ @@ -3913,18 +3134,10 @@ int task_prio(const struct task_struct *p) } /** - * task_nice - return the nice value of a given task. - * @p: the task in question. - */ -int task_nice(const struct task_struct *p) -{ - return TASK_NICE(p); -} -EXPORT_SYMBOL(task_nice); - -/** * idle_cpu - is a given cpu idle currently? * @cpu: the processor in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise. */ int idle_cpu(int cpu) { @@ -3947,6 +3160,8 @@ int idle_cpu(int cpu) /** * idle_task - return the idle task for a given cpu. * @cpu: the processor in question. + * + * Return: The idle task for the cpu @cpu. */ struct task_struct *idle_task(int cpu) { @@ -3956,26 +3171,134 @@ struct task_struct *idle_task(int cpu) /** * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. + * + * The task of @pid, if found. %NULL otherwise. */ static struct task_struct *find_process_by_pid(pid_t pid) { return pid ? find_task_by_vpid(pid) : current; } -/* Actually do priority change: must hold rq lock. */ +/* + * This function initializes the sched_dl_entity of a newly becoming + * SCHED_DEADLINE task. + * + * Only the static values are considered here, the actual runtime and the + * absolute deadline will be properly calculated when the task is enqueued + * for the first time with its new policy. + */ static void -__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +__setparam_dl(struct task_struct *p, const struct sched_attr *attr) { + struct sched_dl_entity *dl_se = &p->dl; + + init_dl_task_timer(dl_se); + dl_se->dl_runtime = attr->sched_runtime; + dl_se->dl_deadline = attr->sched_deadline; + dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; + dl_se->flags = attr->sched_flags; + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_throttled = 0; + dl_se->dl_new = 1; + dl_se->dl_yielded = 0; +} + +static void __setscheduler_params(struct task_struct *p, + const struct sched_attr *attr) +{ + int policy = attr->sched_policy; + + if (policy == -1) /* setparam */ + policy = p->policy; + p->policy = policy; - p->rt_priority = prio; + + if (dl_policy(policy)) + __setparam_dl(p, attr); + else if (fair_policy(policy)) + p->static_prio = NICE_TO_PRIO(attr->sched_nice); + + /* + * __sched_setscheduler() ensures attr->sched_priority == 0 when + * !rt_policy. Always setting this ensures that things like + * getparam()/getattr() don't report silly values for !rt tasks. + */ + p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); - /* we are holding p->pi_lock already */ - p->prio = rt_mutex_getprio(p); - if (rt_prio(p->prio)) + set_load_weight(p); +} + +/* Actually do priority change: must hold pi & rq lock. */ +static void __setscheduler(struct rq *rq, struct task_struct *p, + const struct sched_attr *attr) +{ + __setscheduler_params(p, attr); + + /* + * If we get here, there was no pi waiters boosting the + * task. It is safe to use the normal prio. + */ + p->prio = normal_prio(p); + + if (dl_prio(p->prio)) + p->sched_class = &dl_sched_class; + else if (rt_prio(p->prio)) p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; - set_load_weight(p); +} + +static void +__getparam_dl(struct task_struct *p, struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + attr->sched_priority = p->rt_priority; + attr->sched_runtime = dl_se->dl_runtime; + attr->sched_deadline = dl_se->dl_deadline; + attr->sched_period = dl_se->dl_period; + attr->sched_flags = dl_se->flags; +} + +/* + * This function validates the new parameters of a -deadline task. + * We ask for the deadline not being zero, and greater or equal + * than the runtime, as well as the period of being zero or + * greater than deadline. Furthermore, we have to be sure that + * user parameters are above the internal resolution of 1us (we + * check sched_runtime only since it is always the smaller one) and + * below 2^63 ns (we have to check both sched_deadline and + * sched_period, as the latter can be zero). + */ +static bool +__checkparam_dl(const struct sched_attr *attr) +{ + /* deadline != 0 */ + if (attr->sched_deadline == 0) + return false; + + /* + * Since we truncate DL_SCALE bits, make sure we're at least + * that big. + */ + if (attr->sched_runtime < (1ULL << DL_SCALE)) + return false; + + /* + * Since we use the MSB for wrap-around and sign issues, make + * sure it's not set (mind that period can be equal to zero). + */ + if (attr->sched_deadline & (1ULL << 63) || + attr->sched_period & (1ULL << 63)) + return false; + + /* runtime <= deadline <= period (if period != 0) */ + if ((attr->sched_period != 0 && + attr->sched_period < attr->sched_deadline) || + attr->sched_deadline < attr->sched_runtime) + return false; + + return true; } /* @@ -3988,19 +3311,20 @@ static bool check_same_owner(struct task_struct *p) rcu_read_lock(); pcred = __task_cred(p); - if (cred->user->user_ns == pcred->user->user_ns) - match = (cred->euid == pcred->euid || - cred->euid == pcred->uid); - else - match = false; + match = (uid_eq(cred->euid, pcred->euid) || + uid_eq(cred->euid, pcred->uid)); rcu_read_unlock(); return match; } -static int __sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param, bool user) +static int __sched_setscheduler(struct task_struct *p, + const struct sched_attr *attr, + bool user) { + int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : + MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, on_rq, running; + int policy = attr->sched_policy; unsigned long flags; const struct sched_class *prev_class; struct rq *rq; @@ -4014,31 +3338,40 @@ recheck: reset_on_fork = p->sched_reset_on_fork; policy = oldpolicy = p->policy; } else { - reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); - policy &= ~SCHED_RESET_ON_FORK; + reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); - if (policy != SCHED_FIFO && policy != SCHED_RR && + if (policy != SCHED_DEADLINE && + policy != SCHED_FIFO && policy != SCHED_RR && policy != SCHED_NORMAL && policy != SCHED_BATCH && policy != SCHED_IDLE) return -EINVAL; } + if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) + return -EINVAL; + /* * Valid priorities for SCHED_FIFO and SCHED_RR are * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, * SCHED_BATCH and SCHED_IDLE is 0. */ - if (param->sched_priority < 0 || - (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || - (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) + if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || + (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; - if (rt_policy(policy) != (param->sched_priority != 0)) + if ((dl_policy(policy) && !__checkparam_dl(attr)) || + (rt_policy(policy) != (attr->sched_priority != 0))) return -EINVAL; /* * Allow unprivileged RT tasks to decrease priority: */ if (user && !capable(CAP_SYS_NICE)) { + if (fair_policy(policy)) { + if (attr->sched_nice < task_nice(p) && + !can_nice(p, attr->sched_nice)) + return -EPERM; + } + if (rt_policy(policy)) { unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); @@ -4048,17 +3381,26 @@ recheck: return -EPERM; /* can't increase priority */ - if (param->sched_priority > p->rt_priority && - param->sched_priority > rlim_rtprio) + if (attr->sched_priority > p->rt_priority && + attr->sched_priority > rlim_rtprio) return -EPERM; } + /* + * Can't set/change SCHED_DEADLINE policy at all for now + * (safest behavior); in the future we would like to allow + * unprivileged DL tasks to increase their relative deadline + * or reduce their runtime (both ways reducing utilization) + */ + if (dl_policy(policy)) + return -EPERM; + /* * Treat SCHED_IDLE as nice 20. Only allow a switch to * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. */ if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { - if (!can_nice(p, TASK_NICE(p))) + if (!can_nice(p, task_nice(p))) return -EPERM; } @@ -4095,18 +3437,25 @@ recheck: } /* - * If not changing anything there's no need to proceed further: + * If not changing anything there's no need to proceed further, + * but store a possible modification of reset_on_fork. */ - if (unlikely(policy == p->policy && (!rt_policy(policy) || - param->sched_priority == p->rt_priority))) { - - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + if (unlikely(policy == p->policy)) { + if (fair_policy(policy) && attr->sched_nice != task_nice(p)) + goto change; + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) + goto change; + if (dl_policy(policy)) + goto change; + + p->sched_reset_on_fork = reset_on_fork; + task_rq_unlock(rq, p, &flags); return 0; } +change: -#ifdef CONFIG_RT_GROUP_SCHED if (user) { +#ifdef CONFIG_RT_GROUP_SCHED /* * Do not allow realtime tasks into groups that have no runtime * assigned. @@ -4117,8 +3466,24 @@ recheck: task_rq_unlock(rq, p, &flags); return -EPERM; } - } #endif +#ifdef CONFIG_SMP + if (dl_bandwidth_enabled() && dl_policy(policy)) { + cpumask_t *span = rq->rd->span; + + /* + * Don't allow tasks with an affinity mask smaller than + * the entire root_domain to become SCHED_DEADLINE. We + * will also fail if there's no bandwidth available. + */ + if (!cpumask_subset(span, &p->cpus_allowed) || + rq->rd->dl_bw.bw == 0) { + task_rq_unlock(rq, p, &flags); + return -EPERM; + } + } +#endif + } /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { @@ -4126,6 +3491,35 @@ recheck: task_rq_unlock(rq, p, &flags); goto recheck; } + + /* + * If setscheduling to SCHED_DEADLINE (or changing the parameters + * of a SCHED_DEADLINE task) we need to check if enough bandwidth + * is available. + */ + if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { + task_rq_unlock(rq, p, &flags); + return -EBUSY; + } + + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; + + /* + * Special case for priority boosted tasks. + * + * If the new priority is lower or equal (user space view) + * than the current (boosted) priority, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. + */ + if (rt_mutex_check_prio(p, newprio)) { + __setscheduler_params(p, attr); + task_rq_unlock(rq, p, &flags); + return 0; + } + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) @@ -4133,16 +3527,18 @@ recheck: if (running) p->sched_class->put_prev_task(rq, p); - p->sched_reset_on_fork = reset_on_fork; - - oldprio = p->prio; prev_class = p->sched_class; - __setscheduler(rq, p, policy, param->sched_priority); + __setscheduler(rq, p, attr); if (running) p->sched_class->set_curr_task(rq); - if (on_rq) - enqueue_task(rq, p, 0); + if (on_rq) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); + } check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, p, &flags); @@ -4152,21 +3548,49 @@ recheck: return 0; } +static int _sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param, bool check) +{ + struct sched_attr attr = { + .sched_policy = policy, + .sched_priority = param->sched_priority, + .sched_nice = PRIO_TO_NICE(p->static_prio), + }; + + /* + * Fixup the legacy SCHED_RESET_ON_FORK hack + */ + if (policy & SCHED_RESET_ON_FORK) { + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + policy &= ~SCHED_RESET_ON_FORK; + attr.sched_policy = policy; + } + + return __sched_setscheduler(p, &attr, check); +} /** * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. * @p: the task in question. * @policy: new policy. * @param: structure containing the new RT priority. * + * Return: 0 on success. An error code otherwise. + * * NOTE that the task may be already dead. */ int sched_setscheduler(struct task_struct *p, int policy, const struct sched_param *param) { - return __sched_setscheduler(p, policy, param, true); + return _sched_setscheduler(p, policy, param, true); } EXPORT_SYMBOL_GPL(sched_setscheduler); +int sched_setattr(struct task_struct *p, const struct sched_attr *attr) +{ + return __sched_setscheduler(p, attr, true); +} +EXPORT_SYMBOL_GPL(sched_setattr); + /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. * @p: the task in question. @@ -4177,11 +3601,13 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); * current context has permission. For example, this is needed in * stop_machine(): we create temporary high priority worker threads, * but our caller might not have that capability. + * + * Return: 0 on success. An error code otherwise. */ int sched_setscheduler_nocheck(struct task_struct *p, int policy, const struct sched_param *param) { - return __sched_setscheduler(p, policy, param, false); + return _sched_setscheduler(p, policy, param, false); } static int @@ -4206,11 +3632,84 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) return retval; } +/* + * Mimics kernel/events/core.c perf_copy_attr(). + */ +static int sched_copy_attr(struct sched_attr __user *uattr, + struct sched_attr *attr) +{ + u32 size; + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = SCHED_ATTR_SIZE_VER0; + + if (size < SCHED_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(*attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(*attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + size = sizeof(*attr); + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * XXX: do we want to be lenient like existing syscalls; or do we want + * to be strict and return an error on out-of-bounds values? + */ + attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); + + return 0; + +err_size: + put_user(sizeof(*attr), &uattr->size); + return -E2BIG; +} + /** * sys_sched_setscheduler - set/change the scheduler policy and RT priority * @pid: the pid in question. * @policy: new policy. * @param: structure containing the new RT priority. + * + * Return: 0 on success. An error code otherwise. */ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) @@ -4226,6 +3725,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, * sys_sched_setparam - set/change the RT priority of a thread * @pid: the pid in question. * @param: structure containing the new RT priority. + * + * Return: 0 on success. An error code otherwise. */ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) { @@ -4233,8 +3734,44 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) } /** + * sys_sched_setattr - same as above, but with extended sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @flags: for future extension. + */ +SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, flags) +{ + struct sched_attr attr; + struct task_struct *p; + int retval; + + if (!uattr || pid < 0 || flags) + return -EINVAL; + + retval = sched_copy_attr(uattr, &attr); + if (retval) + return retval; + + if ((int)attr.sched_policy < 0) + return -EINVAL; + + rcu_read_lock(); + retval = -ESRCH; + p = find_process_by_pid(pid); + if (p != NULL) + retval = sched_setattr(p, &attr); + rcu_read_unlock(); + + return retval; +} + +/** * sys_sched_getscheduler - get the policy (scheduling class) of a thread * @pid: the pid in question. + * + * Return: On success, the policy of the thread. Otherwise, a negative error + * code. */ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) { @@ -4261,10 +3798,13 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) * sys_sched_getparam - get the RT priority of a thread * @pid: the pid in question. * @param: structure containing the RT priority. + * + * Return: On success, 0 and the RT priority is in @param. Otherwise, an error + * code. */ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) { - struct sched_param lp; + struct sched_param lp = { .sched_priority = 0 }; struct task_struct *p; int retval; @@ -4281,7 +3821,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (retval) goto out_unlock; - lp.sched_priority = p->rt_priority; + if (task_has_rt_policy(p)) + lp.sched_priority = p->rt_priority; rcu_read_unlock(); /* @@ -4296,19 +3837,103 @@ out_unlock: return retval; } +static int sched_read_attr(struct sched_attr __user *uattr, + struct sched_attr *attr, + unsigned int usize) +{ + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, usize)) + return -EFAULT; + + /* + * If we're handed a smaller struct than we know of, + * ensure all the unknown bits are 0 - i.e. old + * user-space does not get uncomplete information. + */ + if (usize < sizeof(*attr)) { + unsigned char *addr; + unsigned char *end; + + addr = (void *)attr + usize; + end = (void *)attr + sizeof(*attr); + + for (; addr < end; addr++) { + if (*addr) + return -EFBIG; + } + + attr->size = usize; + } + + ret = copy_to_user(uattr, attr, attr->size); + if (ret) + return -EFAULT; + + return 0; +} + +/** + * sys_sched_getattr - similar to sched_getparam, but with sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @size: sizeof(attr) for fwd/bwd comp. + * @flags: for future extension. + */ +SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, size, unsigned int, flags) +{ + struct sched_attr attr = { + .size = sizeof(struct sched_attr), + }; + struct task_struct *p; + int retval; + + if (!uattr || pid < 0 || size > PAGE_SIZE || + size < SCHED_ATTR_SIZE_VER0 || flags) + return -EINVAL; + + rcu_read_lock(); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + attr.sched_policy = p->policy; + if (p->sched_reset_on_fork) + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + if (task_has_dl_policy(p)) + __getparam_dl(p, &attr); + else if (task_has_rt_policy(p)) + attr.sched_priority = p->rt_priority; + else + attr.sched_nice = task_nice(p); + + rcu_read_unlock(); + + retval = sched_read_attr(uattr, &attr, size); + return retval; + +out_unlock: + rcu_read_unlock(); + return retval; +} + long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; int retval; - get_online_cpus(); rcu_read_lock(); p = find_process_by_pid(pid); if (!p) { rcu_read_unlock(); - put_online_cpus(); return -ESRCH; } @@ -4316,6 +3941,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) get_task_struct(p); rcu_read_unlock(); + if (p->flags & PF_NO_SETAFFINITY) { + retval = -EINVAL; + goto out_put_task; + } if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { retval = -ENOMEM; goto out_put_task; @@ -4325,15 +3954,39 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) - goto out_unlock; + if (!check_same_owner(p)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); + goto out_unlock; + } + rcu_read_unlock(); + } retval = security_task_setscheduler(p); if (retval) goto out_unlock; + cpuset_cpus_allowed(p, cpus_allowed); cpumask_and(new_mask, in_mask, cpus_allowed); + + /* + * Since bandwidth control happens on root_domain basis, + * if admission test is enabled, we only admit -deadline + * tasks allowed to run on all the CPUs in the task's + * root_domain. + */ +#ifdef CONFIG_SMP + if (task_has_dl_policy(p)) { + const struct cpumask *span = task_rq(p)->rd->span; + + if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { + retval = -EBUSY; + goto out_unlock; + } + } +#endif again: retval = set_cpus_allowed_ptr(p, new_mask); @@ -4355,7 +4008,6 @@ out_free_cpus_allowed: free_cpumask_var(cpus_allowed); out_put_task: put_task_struct(p); - put_online_cpus(); return retval; } @@ -4375,6 +4027,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, * @pid: pid of the process * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to the new cpu mask + * + * Return: 0 on success. An error code otherwise. */ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, unsigned long __user *, user_mask_ptr) @@ -4398,7 +4052,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) unsigned long flags; int retval; - get_online_cpus(); rcu_read_lock(); retval = -ESRCH; @@ -4411,12 +4064,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) goto out_unlock; raw_spin_lock_irqsave(&p->pi_lock, flags); - cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); + cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); - put_online_cpus(); return retval; } @@ -4426,6 +4078,8 @@ out_unlock: * @pid: pid of the process * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask + * + * Return: 0 on success. An error code otherwise. */ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, unsigned long __user *, user_mask_ptr) @@ -4460,6 +4114,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, * * This function yields the current CPU to other tasks. If there are no * other threads running on this CPU then this function will return. + * + * Return: 0. */ SYSCALL_DEFINE0(sched_yield) { @@ -4475,23 +4131,18 @@ SYSCALL_DEFINE0(sched_yield) __release(rq->lock); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); do_raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); schedule(); return 0; } -static inline int should_resched(void) -{ - return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); -} - static void __cond_resched(void) { - add_preempt_count(PREEMPT_ACTIVE); + __preempt_count_add(PREEMPT_ACTIVE); __schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + __preempt_count_sub(PREEMPT_ACTIVE); } int __sched _cond_resched(void) @@ -4549,8 +4200,24 @@ EXPORT_SYMBOL(__cond_resched_softirq); /** * yield - yield the current processor to other threads. * - * This is a shortcut for kernel-space yielding - it marks the - * thread runnable and calls sys_sched_yield(). + * Do not ever use this function, there's a 99% chance you're doing it wrong. + * + * The scheduler is at all times free to pick the calling task as the most + * eligible task to run, if removing the yield() call from your code breaks + * it, its already broken. + * + * Typical broken usage is: + * + * while (!event) + * yield(); + * + * where one assumes that yield() will let 'the other' process run that will + * make event true. If the current task is a SCHED_FIFO task that will never + * happen. Never use yield() as a progress guarantee!! + * + * If you want to use yield() to wait for something, use wait_event(). + * If you want to use yield() to be 'nice' for others, use cond_resched(). + * If you still want to use yield(), do not! */ void __sched yield(void) { @@ -4569,34 +4236,46 @@ EXPORT_SYMBOL(yield); * It's the caller's job to ensure that the target task struct * can't go away on us before we can do any checks. * - * Returns true if we indeed boosted the target task. + * Return: + * true (>0) if we indeed boosted the target task. + * false (0) if we failed to boost the target. + * -ESRCH if there's no task to yield to. */ -bool __sched yield_to(struct task_struct *p, bool preempt) +int __sched yield_to(struct task_struct *p, bool preempt) { struct task_struct *curr = current; struct rq *rq, *p_rq; unsigned long flags; - bool yielded = 0; + int yielded = 0; local_irq_save(flags); rq = this_rq(); again: p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. + */ + if (rq->nr_running == 1 && p_rq->nr_running == 1) { + yielded = -ESRCH; + goto out_irq; + } + double_rq_lock(rq, p_rq); - while (task_rq(p) != p_rq) { + if (task_rq(p) != p_rq) { double_rq_unlock(rq, p_rq); goto again; } if (!curr->sched_class->yield_to_task) - goto out; + goto out_unlock; if (curr->sched_class != p->sched_class) - goto out; + goto out_unlock; if (task_running(p_rq, p) || p->state) - goto out; + goto out_unlock; yielded = curr->sched_class->yield_to_task(rq, p, preempt); if (yielded) { @@ -4607,20 +4286,14 @@ again: */ if (preempt && rq != p_rq) resched_task(p_rq->curr); - } else { - /* - * We might have set it in task_yield_fair(), but are - * not going to schedule(), so don't want to skip - * the next update. - */ - rq->skip_clock_update = 0; } -out: +out_unlock: double_rq_unlock(rq, p_rq); +out_irq: local_irq_restore(flags); - if (yielded) + if (yielded > 0) schedule(); return yielded; @@ -4666,8 +4339,9 @@ long __sched io_schedule_timeout(long timeout) * sys_sched_get_priority_max - return maximum RT priority. * @policy: scheduling class. * - * this syscall returns the maximum rt_priority that can be used - * by a given scheduling class. + * Return: On success, this syscall returns the maximum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned. */ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) { @@ -4678,6 +4352,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) case SCHED_RR: ret = MAX_USER_RT_PRIO-1; break; + case SCHED_DEADLINE: case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: @@ -4691,8 +4366,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) * sys_sched_get_priority_min - return minimum RT priority. * @policy: scheduling class. * - * this syscall returns the minimum rt_priority that can be used - * by a given scheduling class. + * Return: On success, this syscall returns the minimum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned. */ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) { @@ -4703,6 +4379,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) case SCHED_RR: ret = 1; break; + case SCHED_DEADLINE: case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: @@ -4718,6 +4395,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) * * this syscall writes the default timeslice value of a given process * into the user-space timespec buffer. A value of '0' means infinity. + * + * Return: On success, 0 and the timeslice is in @interval. Otherwise, + * an error code. */ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, struct timespec __user *, interval) @@ -4743,7 +4423,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, goto out_unlock; rq = task_rq_lock(p, &flags); - time_slice = p->sched_class->get_rr_interval(rq, p); + time_slice = 0; + if (p->sched_class->get_rr_interval) + time_slice = p->sched_class->get_rr_interval(rq, p); task_rq_unlock(rq, p, &flags); rcu_read_unlock(); @@ -4761,6 +4443,7 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; void sched_show_task(struct task_struct *p) { unsigned long free = 0; + int ppid; unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; @@ -4780,10 +4463,14 @@ void sched_show_task(struct task_struct *p) #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif + rcu_read_lock(); + ppid = task_pid_nr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), + task_pid_nr(p), ppid, (unsigned long)task_thread_info(p)->flags); + print_worker_info(KERN_INFO, p); show_stack(p, NULL); } @@ -4822,7 +4509,7 @@ void show_state_filter(unsigned long state_filter) debug_show_all_locks(); } -void __cpuinit init_idle_bootup_task(struct task_struct *idle) +void init_idle_bootup_task(struct task_struct *idle) { idle->sched_class = &idle_sched_class; } @@ -4835,14 +4522,14 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle) * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void __cpuinit init_idle(struct task_struct *idle, int cpu) +void init_idle(struct task_struct *idle, int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; raw_spin_lock_irqsave(&rq->lock, flags); - __sched_fork(idle); + __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); @@ -4862,19 +4549,21 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; + idle->on_rq = 1; #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif raw_spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ - task_thread_info(idle)->preempt_count = 0; + init_idle_preempt_count(idle, cpu); /* * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); + vtime_init_idle(idle, cpu); #if defined(CONFIG_SMP) sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif @@ -4887,7 +4576,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) p->sched_class->set_cpus_allowed(p, new_mask); cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = cpumask_weight(new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); } /* @@ -4930,11 +4619,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; } - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { - ret = -EINVAL; - goto out; - } - do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ @@ -5006,6 +4690,54 @@ fail: return ret; } +#ifdef CONFIG_NUMA_BALANCING +/* Migrate current task p to target_cpu */ +int migrate_task_to(struct task_struct *p, int target_cpu) +{ + struct migration_arg arg = { p, target_cpu }; + int curr_cpu = task_cpu(p); + + if (curr_cpu == target_cpu) + return 0; + + if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) + return -EINVAL; + + /* TODO: This is not properly updating schedstats */ + + trace_sched_move_numa(p, curr_cpu, target_cpu); + return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); +} + +/* + * Requeue a task on a given node and accurately track the number of NUMA + * tasks on the runqueues + */ +void sched_setnuma(struct task_struct *p, int nid) +{ + struct rq *rq; + unsigned long flags; + bool on_rq, running; + + rq = task_rq_lock(p, &flags); + on_rq = p->on_rq; + running = task_current(rq, p); + + if (on_rq) + dequeue_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + + p->numa_preferred_nid = nid; + + if (running) + p->sched_class->set_curr_task(rq); + if (on_rq) + enqueue_task(rq, p, 0); + task_rq_unlock(rq, p, &flags); +} +#endif + /* * migration_cpu_stop - this will be executed by a highprio stopper thread * and performs thread migration by bumping thread off CPU then @@ -5037,35 +4769,43 @@ void idle_task_exit(void) BUG_ON(cpu_online(smp_processor_id())); - if (mm != &init_mm) + if (mm != &init_mm) { switch_mm(mm, &init_mm, current); + finish_arch_post_lock_switch(); + } mmdrop(mm); } /* - * While a dead CPU has no uninterruptible tasks queued at this point, - * it might still have a nonzero ->nr_uninterruptible counter, because - * for performance reasons the counter is not stricly tracking tasks to - * their home CPUs. So we just add the counter to another CPU's counter, - * to keep the global sum constant after CPU-down: + * Since this CPU is going 'away' for a while, fold any nr_active delta + * we might have. Assumes we're called after migrate_tasks() so that the + * nr_active count is stable. + * + * Also see the comment "Global load-average calculations". */ -static void migrate_nr_uninterruptible(struct rq *rq_src) +static void calc_load_migrate(struct rq *rq) { - struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); - - rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; - rq_src->nr_uninterruptible = 0; + long delta = calc_load_fold_active(rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks); } -/* - * remove the tasks which were accounted by rq from calc_load_tasks. - */ -static void calc_global_load_remove(struct rq *rq) +static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) { - atomic_long_sub(rq->calc_load_active, &calc_load_tasks); - rq->calc_load_active = 0; } +static const struct sched_class fake_sched_class = { + .put_prev_task = put_prev_task_fake, +}; + +static struct task_struct fake_task = { + /* + * Avoid pull_{rt,dl}_task() + */ + .prio = MAX_PRIO + 1, + .sched_class = &fake_sched_class, +}; + /* * Migrate all tasks from the rq, sleeping tasks will be migrated by * try_to_wake_up()->select_task_rq(). @@ -5091,8 +4831,12 @@ static void migrate_tasks(unsigned int dead_cpu) */ rq->stop = NULL; - /* Ensure any throttled groups are reachable by pick_next_task */ - unthrottle_offline_cfs_rqs(rq); + /* + * put_prev_task() and pick_next_task() sched + * class method both need to have an up-to-date + * value of rq->clock[_task] + */ + update_rq_clock(rq); for ( ; ; ) { /* @@ -5102,7 +4846,7 @@ static void migrate_tasks(unsigned int dead_cpu) if (rq->nr_running == 1) break; - next = pick_next_task(rq); + next = pick_next_task(rq, &fake_task); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -5168,57 +4912,69 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) *tablep = NULL; } +static int min_load_idx = 0; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; + static void set_table_entry(struct ctl_table *entry, const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler) + umode_t mode, proc_handler *proc_handler, + bool load_idx) { entry->procname = procname; entry->data = data; entry->maxlen = maxlen; entry->mode = mode; entry->proc_handler = proc_handler; + + if (load_idx) { + entry->extra1 = &min_load_idx; + entry->extra2 = &max_load_idx; + } } static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(13); + struct ctl_table *table = sd_alloc_ctl_entry(14); if (table == NULL) return NULL; set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax); + sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax); + sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[9], "cache_nice_tries", &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[11], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring); - /* &table[12] is terminator */ + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[11], "max_newidle_lb_cost", + &sd->max_newidle_lb_cost, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring, false); + /* &table[13] is terminator */ return table; } -static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) { struct ctl_table *entry, *table; struct sched_domain *sd; @@ -5320,7 +5076,7 @@ static void set_rq_offline(struct rq *rq) * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. */ -static int __cpuinit +static int migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) { int cpu = (long)hcpu; @@ -5356,9 +5112,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) migrate_tasks(cpu); BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); + break; - migrate_nr_uninterruptible(rq); - calc_global_load_remove(rq); + case CPU_DEAD: + calc_load_migrate(rq); break; #endif } @@ -5373,16 +5130,25 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) * happens before everything else. This has to be lower priority than * the notifier in the perf_event subsystem, though. */ -static struct notifier_block __cpuinitdata migration_notifier = { +static struct notifier_block migration_notifier = { .notifier_call = migration_call, .priority = CPU_PRI_MIGRATION, }; -static int __cpuinit sched_cpu_active(struct notifier_block *nfb, +static void __cpuinit set_cpu_rq_start_time(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + rq->age_stamp = sched_clock_cpu(cpu); +} + +static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: + case CPU_STARTING: + set_cpu_rq_start_time(); + return NOTIFY_OK; case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; @@ -5391,16 +5157,34 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb, } } -static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, +static int sched_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { + unsigned long flags; + long cpu = (long)hcpu; + switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - set_cpu_active((long)hcpu, false); + set_cpu_active(cpu, false); + + /* explicitly allow suspend */ + if (!(action & CPU_TASKS_FROZEN)) { + struct dl_bw *dl_b = dl_bw_of(cpu); + bool overflow; + int cpus; + + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(cpu); + overflow = __dl_overflow(dl_b, cpus, 0, 0); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + if (overflow) + return notifier_from_errno(-EBUSY); + } return NOTIFY_OK; - default: - return NOTIFY_DONE; } + + return NOTIFY_DONE; } static int __init migration_init(void) @@ -5429,15 +5213,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ #ifdef CONFIG_SCHED_DEBUG -static __read_mostly int sched_domain_debug_enabled; +static __read_mostly int sched_debug_enabled; -static int __init sched_domain_debug_setup(char *str) +static int __init sched_debug_setup(char *str) { - sched_domain_debug_enabled = 1; + sched_debug_enabled = 1; return 0; } -early_param("sched_debug", sched_domain_debug_setup); +early_param("sched_debug", sched_debug_setup); + +static inline bool sched_debug(void) +{ + return sched_debug_enabled; +} static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) @@ -5477,10 +5266,14 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!group->sgp->power) { + /* + * Even though we initialize ->capacity to something semi-sane, + * we leave capacity_orig unset. This allows us to detect if + * domain iteration is still funny without causing /0 traps. + */ + if (!group->sgc->capacity_orig) { printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not " - "set\n"); + printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); break; } @@ -5490,7 +5283,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (cpumask_intersects(groupmask, sched_group_cpus(group))) { + if (!(sd->flags & SD_OVERLAP) && + cpumask_intersects(groupmask, sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; @@ -5501,9 +5295,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->sgp->power != SCHED_POWER_SCALE) { - printk(KERN_CONT " (cpu_power = %d)", - group->sgp->power); + if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { + printk(KERN_CONT " (cpu_capacity = %d)", + group->sgc->capacity); } group = group->next; @@ -5524,7 +5318,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; - if (!sched_domain_debug_enabled) + if (!sched_debug_enabled) return; if (!sd) { @@ -5545,6 +5339,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) +static inline bool sched_debug(void) +{ + return false; +} #endif /* CONFIG_SCHED_DEBUG */ static int sd_degenerate(struct sched_domain *sd) @@ -5557,8 +5355,9 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | - SD_SHARE_PKG_RESOURCES)) { + SD_SHARE_CPUCAPACITY | + SD_SHARE_PKG_RESOURCES | + SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) return 0; } @@ -5587,8 +5386,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | - SD_SHARE_PKG_RESOURCES); + SD_SHARE_CPUCAPACITY | + SD_SHARE_PKG_RESOURCES | + SD_PREFER_SIBLING | + SD_SHARE_POWERDOMAIN); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -5603,6 +5404,8 @@ static void free_rootdomain(struct rcu_head *rcu) struct root_domain *rd = container_of(rcu, struct root_domain, rcu); cpupri_cleanup(&rd->cpupri); + cpudl_cleanup(&rd->cpudl); + free_cpumask_var(rd->dlo_mask); free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->online); free_cpumask_var(rd->span); @@ -5625,7 +5428,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) cpumask_clear_cpu(rq->cpu, old_rd->span); /* - * If we dont want to free the old_rt yet then + * If we dont want to free the old_rd yet then * set old_rd to NULL to skip the freeing later * in this function: */ @@ -5654,8 +5457,14 @@ static int init_rootdomain(struct root_domain *rd) goto out; if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) goto free_online; + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_dlo_mask; + + init_dl_bw(&rd->dl_bw); + if (cpudl_init(&rd->cpudl) != 0) + goto free_dlo_mask; if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; @@ -5663,6 +5472,8 @@ static int init_rootdomain(struct root_domain *rd) free_rto_mask: free_cpumask_var(rd->rto_mask); +free_dlo_mask: + free_cpumask_var(rd->dlo_mask); free_online: free_cpumask_var(rd->online); free_span: @@ -5700,7 +5511,7 @@ static struct root_domain *alloc_rootdomain(void) return rd; } -static void free_sched_groups(struct sched_group *sg, int free_sgp) +static void free_sched_groups(struct sched_group *sg, int free_sgc) { struct sched_group *tmp, *first; @@ -5711,8 +5522,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp) do { tmp = sg->next; - if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) - kfree(sg->sgp); + if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) + kfree(sg->sgc); kfree(sg); sg = tmp; @@ -5730,7 +5541,7 @@ static void free_sched_domain(struct rcu_head *rcu) if (sd->flags & SD_OVERLAP) { free_sched_groups(sd->groups, 1); } else if (atomic_dec_and_test(&sd->groups->ref)) { - kfree(sd->groups->sgp); + kfree(sd->groups->sgc); kfree(sd->groups); } kfree(sd); @@ -5754,22 +5565,39 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * * Also keep a unique ID per domain (we use the first cpu number in * the cpumask of the domain), this allows us to quickly tell if - * two cpus are in the same cache domain, see ttwu_share_cache(). + * two cpus are in the same cache domain, see cpus_share_cache(). */ DEFINE_PER_CPU(struct sched_domain *, sd_llc); +DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(struct sched_domain *, sd_numa); +DEFINE_PER_CPU(struct sched_domain *, sd_busy); +DEFINE_PER_CPU(struct sched_domain *, sd_asym); static void update_top_cache_domain(int cpu) { struct sched_domain *sd; + struct sched_domain *busy_sd = NULL; int id = cpu; + int size = 1; sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) + if (sd) { id = cpumask_first(sched_domain_span(sd)); + size = cpumask_weight(sched_domain_span(sd)); + busy_sd = sd->parent; /* sd_busy */ + } + rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); + per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; + + sd = lowest_flag_domain(cpu, SD_NUMA); + rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); + + sd = highest_flag_domain(cpu, SD_ASYM_PACKING); + rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); } /* @@ -5792,6 +5620,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp->parent = parent->parent; if (parent->parent) parent->parent->child = tmp; + /* + * Transfer SD_PREFER_SIBLING down in case of a + * degenerate parent; the spans match for this + * so the property transfers. + */ + if (parent->flags & SD_PREFER_SIBLING) + tmp->flags |= SD_PREFER_SIBLING; destroy_sched_domain(parent, cpu); } else tmp = tmp->parent; @@ -5828,105 +5663,6 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); -#ifdef CONFIG_NUMA - -/** - * find_next_best_node - find the next node to include in a sched_domain - * @node: node whose sched_domain we're building - * @used_nodes: nodes already in the sched_domain - * - * Find the next node to include in a given scheduling domain. Simply - * finds the closest node not already in the @used_nodes map. - * - * Should use nodemask_t. - */ -static int find_next_best_node(int node, nodemask_t *used_nodes) -{ - int i, n, val, min_val, best_node = -1; - - min_val = INT_MAX; - - for (i = 0; i < nr_node_ids; i++) { - /* Start at @node */ - n = (node + i) % nr_node_ids; - - if (!nr_cpus_node(n)) - continue; - - /* Skip already used nodes */ - if (node_isset(n, *used_nodes)) - continue; - - /* Simple min distance search */ - val = node_distance(node, n); - - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - if (best_node != -1) - node_set(best_node, *used_nodes); - return best_node; -} - -/** - * sched_domain_node_span - get a cpumask for a node's sched_domain - * @node: node whose cpumask we're constructing - * @span: resulting cpumask - * - * Given a node, construct a good cpumask for its sched_domain to span. It - * should be one that prevents unnecessary balancing, but also spreads tasks - * out optimally. - */ -static void sched_domain_node_span(int node, struct cpumask *span) -{ - nodemask_t used_nodes; - int i; - - cpumask_clear(span); - nodes_clear(used_nodes); - - cpumask_or(span, span, cpumask_of_node(node)); - node_set(node, used_nodes); - - for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { - int next_node = find_next_best_node(node, &used_nodes); - if (next_node < 0) - break; - cpumask_or(span, span, cpumask_of_node(next_node)); - } -} - -static const struct cpumask *cpu_node_mask(int cpu) -{ - lockdep_assert_held(&sched_domains_mutex); - - sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); - - return sched_domains_tmpmask; -} - -static const struct cpumask *cpu_allnodes_mask(int cpu) -{ - return cpu_possible_mask; -} -#endif /* CONFIG_NUMA */ - -static const struct cpumask *cpu_cpu_mask(int cpu) -{ - return cpumask_of_node(cpu_to_node(cpu)); -} - -int sched_smt_power_savings = 0, sched_mc_power_savings = 0; - -struct sd_data { - struct sched_domain **__percpu sd; - struct sched_group **__percpu sg; - struct sched_group_power **__percpu sgp; -}; - struct s_data { struct sched_domain ** __percpu sd; struct root_domain *rd; @@ -5939,19 +5675,43 @@ enum s_alloc { sa_none, }; -struct sched_domain_topology_level; +/* + * Build an iteration mask that can exclude certain CPUs from the upwards + * domain traversal. + * + * Asymmetric node setups can result in situations where the domain tree is of + * unequal depth, make sure to skip domains that already cover the entire + * range. + * + * In that case build_sched_domains() will have terminated the iteration early + * and our sibling sd spans will be empty. Domains should always include the + * cpu they're built on, so check that. + * + */ +static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) +{ + const struct cpumask *span = sched_domain_span(sd); + struct sd_data *sdd = sd->private; + struct sched_domain *sibling; + int i; -typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); + for_each_cpu(i, span) { + sibling = *per_cpu_ptr(sdd->sd, i); + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + continue; -#define SDTL_OVERLAP 0x01 + cpumask_set_cpu(i, sched_group_mask(sg)); + } +} -struct sched_domain_topology_level { - sched_domain_init_f init; - sched_domain_mask_f mask; - int flags; - struct sd_data data; -}; +/* + * Return the canonical balance cpu for this group, this is the first cpu + * of this group that's also in the iteration mask. + */ +int group_balance_cpu(struct sched_group *sg) +{ + return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); +} static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) @@ -5971,6 +5731,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (cpumask_test_cpu(i, covered)) continue; + child = *per_cpu_ptr(sdd->sd, i); + + /* See the comment near build_group_mask(). */ + if (!cpumask_test_cpu(i, sched_domain_span(child))) + continue; + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(cpu)); @@ -5978,8 +5744,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) goto fail; sg_span = sched_group_cpus(sg); - - child = *per_cpu_ptr(sdd->sd, i); if (child->child) { child = child->child; cpumask_copy(sg_span, sched_domain_span(child)); @@ -5988,10 +5752,25 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); - sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); - atomic_inc(&sg->sgp->ref); + sg->sgc = *per_cpu_ptr(sdd->sgc, i); + if (atomic_inc_return(&sg->sgc->ref) == 1) + build_group_mask(sd, sg); - if (cpumask_test_cpu(cpu, sg_span)) + /* + * Initialize sgc->capacity such that even if we mess up the + * domains and no possible iteration will get us here, we won't + * die on a /0 trap. + */ + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->capacity_orig = sg->sgc->capacity; + + /* + * Make sure the first group of this domain contains the + * canonical balance cpu. Otherwise the sched_domain iteration + * breaks. See update_sg_lb_stats(). + */ + if ((!groups && cpumask_test_cpu(cpu, sg_span)) || + group_balance_cpu(sg) == cpu) groups = sg; if (!first) @@ -6021,8 +5800,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) if (sg) { *sg = *per_cpu_ptr(sdd->sg, cpu); - (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); - atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ + (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); + atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ } return cpu; @@ -6031,7 +5810,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) /* * build_sched_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. + * and ->cpu_capacity to 0. * * Assumes the sched_domain tree is fully constructed */ @@ -6047,7 +5826,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) get_group(cpu, sdd, &sd->groups); atomic_inc(&sd->groups->ref); - if (cpu != cpumask_first(sched_domain_span(sd))) + if (cpu != cpumask_first(span)) return 0; lockdep_assert_held(&sched_domains_mutex); @@ -6057,14 +5836,13 @@ build_sched_groups(struct sched_domain *sd, int cpu) for_each_cpu(i, span) { struct sched_group *sg; - int group = get_group(i, sdd, &sg); - int j; + int group, j; if (cpumask_test_cpu(i, covered)) continue; - cpumask_clear(sched_group_cpus(sg)); - sg->sgp->power = 0; + group = get_group(i, sdd, &sg); + cpumask_setall(sched_group_mask(sg)); for_each_cpu(j, span) { if (get_group(j, sdd, NULL) != group) @@ -6086,36 +5864,31 @@ build_sched_groups(struct sched_domain *sd, int cpu) } /* - * Initialize sched groups cpu_power. + * Initialize sched groups cpu_capacity. * - * cpu_power indicates the capacity of sched group, which is used while + * cpu_capacity indicates the capacity of sched group, which is used while * distributing the load between different sched groups in a sched domain. - * Typically cpu_power for all the groups in a sched domain will be same unless - * there are asymmetries in the topology. If there are asymmetries, group - * having more cpu_power will pickup more load compared to the group having - * less cpu_power. + * Typically cpu_capacity for all the groups in a sched domain will be same + * unless there are asymmetries in the topology. If there are asymmetries, + * group having more cpu_capacity will pickup more load compared to the + * group having less cpu_capacity. */ -static void init_sched_groups_power(int cpu, struct sched_domain *sd) +static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; - WARN_ON(!sd || !sg); + WARN_ON(!sg); do { sg->group_weight = cpumask_weight(sched_group_cpus(sg)); sg = sg->next; } while (sg != sd->groups); - if (cpu != group_first_cpu(sg)) + if (cpu != group_balance_cpu(sg)) return; - update_group_power(sd, cpu); - atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); -} - -int __weak arch_sd_sibling_asym_packing(void) -{ - return 0*SD_ASYM_PACKING; + update_group_capacity(sd, cpu); + atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } /* @@ -6123,48 +5896,13 @@ int __weak arch_sd_sibling_asym_packing(void) * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ -#ifdef CONFIG_SCHED_DEBUG -# define SD_INIT_NAME(sd, type) sd->name = #type -#else -# define SD_INIT_NAME(sd, type) do { } while (0) -#endif - -#define SD_INIT_FUNC(type) \ -static noinline struct sched_domain * \ -sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ -{ \ - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ - *sd = SD_##type##_INIT; \ - SD_INIT_NAME(sd, type); \ - sd->private = &tl->data; \ - return sd; \ -} - -SD_INIT_FUNC(CPU) -#ifdef CONFIG_NUMA - SD_INIT_FUNC(ALLNODES) - SD_INIT_FUNC(NODE) -#endif -#ifdef CONFIG_SCHED_SMT - SD_INIT_FUNC(SIBLING) -#endif -#ifdef CONFIG_SCHED_MC - SD_INIT_FUNC(MC) -#endif -#ifdef CONFIG_SCHED_BOOK - SD_INIT_FUNC(BOOK) -#endif - static int default_relax_domain_level = -1; int sched_domain_level_max; static int __init setup_relax_domain_level(char *str) { - unsigned long val; - - val = simple_strtoul(str, NULL, 0); - if (val < sched_domain_level_max) - default_relax_domain_level = val; + if (kstrtoint(str, 0, &default_relax_domain_level)) + pr_warn("Unable to set relax_domain_level\n"); return 1; } @@ -6241,46 +5979,399 @@ static void claim_allocations(int cpu, struct sched_domain *sd) if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) - *per_cpu_ptr(sdd->sgp, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) + *per_cpu_ptr(sdd->sgc, cpu) = NULL; } -#ifdef CONFIG_SCHED_SMT -static const struct cpumask *cpu_smt_mask(int cpu) +#ifdef CONFIG_NUMA +static int sched_domains_numa_levels; +static int *sched_domains_numa_distance; +static struct cpumask ***sched_domains_numa_masks; +static int sched_domains_curr_level; +#endif + +/* + * SD_flags allowed in topology descriptions. + * + * SD_SHARE_CPUCAPACITY - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain + * + * Odd one out: + * SD_ASYM_PACKING - describes SMT quirks + */ +#define TOPOLOGY_SD_FLAGS \ + (SD_SHARE_CPUCAPACITY | \ + SD_SHARE_PKG_RESOURCES | \ + SD_NUMA | \ + SD_ASYM_PACKING | \ + SD_SHARE_POWERDOMAIN) + +static struct sched_domain * +sd_init(struct sched_domain_topology_level *tl, int cpu) { - return topology_thread_cpumask(cpu); -} + struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); + int sd_weight, sd_flags = 0; + +#ifdef CONFIG_NUMA + /* + * Ugly hack to pass state to sd_numa_mask()... + */ + sched_domains_curr_level = tl->numa_level; +#endif + + sd_weight = cpumask_weight(tl->mask(cpu)); + + if (tl->sd_flags) + sd_flags = (*tl->sd_flags)(); + if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, + "wrong sd_flags in topology description\n")) + sd_flags &= ~TOPOLOGY_SD_FLAGS; + + *sd = (struct sched_domain){ + .min_interval = sd_weight, + .max_interval = 2*sd_weight, + .busy_factor = 32, + .imbalance_pct = 125, + + .cache_nice_tries = 0, + .busy_idx = 0, + .idle_idx = 0, + .newidle_idx = 0, + .wake_idx = 0, + .forkexec_idx = 0, + + .flags = 1*SD_LOAD_BALANCE + | 1*SD_BALANCE_NEWIDLE + | 1*SD_BALANCE_EXEC + | 1*SD_BALANCE_FORK + | 0*SD_BALANCE_WAKE + | 1*SD_WAKE_AFFINE + | 0*SD_SHARE_CPUCAPACITY + | 0*SD_SHARE_PKG_RESOURCES + | 0*SD_SERIALIZE + | 0*SD_PREFER_SIBLING + | 0*SD_NUMA + | sd_flags + , + + .last_balance = jiffies, + .balance_interval = sd_weight, + .smt_gain = 0, + .max_newidle_lb_cost = 0, + .next_decay_max_lb_cost = jiffies, +#ifdef CONFIG_SCHED_DEBUG + .name = tl->name, #endif + }; + + /* + * Convert topological properties into behaviour. + */ + + if (sd->flags & SD_SHARE_CPUCAPACITY) { + sd->imbalance_pct = 110; + sd->smt_gain = 1178; /* ~15% */ + + } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->imbalance_pct = 117; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + +#ifdef CONFIG_NUMA + } else if (sd->flags & SD_NUMA) { + sd->cache_nice_tries = 2; + sd->busy_idx = 3; + sd->idle_idx = 2; + + sd->flags |= SD_SERIALIZE; + if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { + sd->flags &= ~(SD_BALANCE_EXEC | + SD_BALANCE_FORK | + SD_WAKE_AFFINE); + } + +#endif + } else { + sd->flags |= SD_PREFER_SIBLING; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + sd->idle_idx = 1; + } + + sd->private = &tl->data; + + return sd; +} /* * Topology list, bottom-up. */ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT - { sd_init_SIBLING, cpu_smt_mask, }, + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, #endif #ifdef CONFIG_SCHED_MC - { sd_init_MC, cpu_coregroup_mask, }, -#endif -#ifdef CONFIG_SCHED_BOOK - { sd_init_BOOK, cpu_book_mask, }, -#endif - { sd_init_CPU, cpu_cpu_mask, }, -#ifdef CONFIG_NUMA - { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, - { sd_init_ALLNODES, cpu_allnodes_mask, }, + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, #endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, { NULL, }, }; -static struct sched_domain_topology_level *sched_domain_topology = default_topology; +struct sched_domain_topology_level *sched_domain_topology = default_topology; + +#define for_each_sd_topology(tl) \ + for (tl = sched_domain_topology; tl->mask; tl++) + +void set_sched_topology(struct sched_domain_topology_level *tl) +{ + sched_domain_topology = tl; +} + +#ifdef CONFIG_NUMA + +static const struct cpumask *sd_numa_mask(int cpu) +{ + return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; +} + +static void sched_numa_warn(const char *str) +{ + static int done = false; + int i,j; + + if (done) + return; + + done = true; + + printk(KERN_WARNING "ERROR: %s\n\n", str); + + for (i = 0; i < nr_node_ids; i++) { + printk(KERN_WARNING " "); + for (j = 0; j < nr_node_ids; j++) + printk(KERN_CONT "%02d ", node_distance(i,j)); + printk(KERN_CONT "\n"); + } + printk(KERN_WARNING "\n"); +} + +static bool find_numa_distance(int distance) +{ + int i; + + if (distance == node_distance(0, 0)) + return true; + + for (i = 0; i < sched_domains_numa_levels; i++) { + if (sched_domains_numa_distance[i] == distance) + return true; + } + + return false; +} + +static void sched_init_numa(void) +{ + int next_distance, curr_distance = node_distance(0, 0); + struct sched_domain_topology_level *tl; + int level = 0; + int i, j, k; + + sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); + if (!sched_domains_numa_distance) + return; + + /* + * O(nr_nodes^2) deduplicating selection sort -- in order to find the + * unique distances in the node_distance() table. + * + * Assumes node_distance(0,j) includes all distances in + * node_distance(i,j) in order to avoid cubic time. + */ + next_distance = curr_distance; + for (i = 0; i < nr_node_ids; i++) { + for (j = 0; j < nr_node_ids; j++) { + for (k = 0; k < nr_node_ids; k++) { + int distance = node_distance(i, k); + + if (distance > curr_distance && + (distance < next_distance || + next_distance == curr_distance)) + next_distance = distance; + + /* + * While not a strong assumption it would be nice to know + * about cases where if node A is connected to B, B is not + * equally connected to A. + */ + if (sched_debug() && node_distance(k, i) != distance) + sched_numa_warn("Node-distance not symmetric"); + + if (sched_debug() && i && !find_numa_distance(distance)) + sched_numa_warn("Node-0 not representative"); + } + if (next_distance != curr_distance) { + sched_domains_numa_distance[level++] = next_distance; + sched_domains_numa_levels = level; + curr_distance = next_distance; + } else break; + } + + /* + * In case of sched_debug() we verify the above assumption. + */ + if (!sched_debug()) + break; + } + /* + * 'level' contains the number of unique distances, excluding the + * identity distance node_distance(i,i). + * + * The sched_domains_numa_distance[] array includes the actual distance + * numbers. + */ + + /* + * Here, we should temporarily reset sched_domains_numa_levels to 0. + * If it fails to allocate memory for array sched_domains_numa_masks[][], + * the array will contain less then 'level' members. This could be + * dangerous when we use it to iterate array sched_domains_numa_masks[][] + * in other functions. + * + * We reset it to 'level' at the end of this function. + */ + sched_domains_numa_levels = 0; + + sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); + if (!sched_domains_numa_masks) + return; + + /* + * Now for each level, construct a mask per node which contains all + * cpus of nodes that are that many hops away from us. + */ + for (i = 0; i < level; i++) { + sched_domains_numa_masks[i] = + kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); + if (!sched_domains_numa_masks[i]) + return; + + for (j = 0; j < nr_node_ids; j++) { + struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!mask) + return; + + sched_domains_numa_masks[i][j] = mask; + + for (k = 0; k < nr_node_ids; k++) { + if (node_distance(j, k) > sched_domains_numa_distance[i]) + continue; + + cpumask_or(mask, mask, cpumask_of_node(k)); + } + } + } + + /* Compute default topology size */ + for (i = 0; sched_domain_topology[i].mask; i++); + + tl = kzalloc((i + level + 1) * + sizeof(struct sched_domain_topology_level), GFP_KERNEL); + if (!tl) + return; + + /* + * Copy the default topology bits.. + */ + for (i = 0; sched_domain_topology[i].mask; i++) + tl[i] = sched_domain_topology[i]; + + /* + * .. and append 'j' levels of NUMA goodness. + */ + for (j = 0; j < level; i++, j++) { + tl[i] = (struct sched_domain_topology_level){ + .mask = sd_numa_mask, + .sd_flags = cpu_numa_flags, + .flags = SDTL_OVERLAP, + .numa_level = j, + SD_INIT_NAME(NUMA) + }; + } + + sched_domain_topology = tl; + + sched_domains_numa_levels = level; +} + +static void sched_domains_numa_masks_set(int cpu) +{ + int i, j; + int node = cpu_to_node(cpu); + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) { + if (node_distance(j, node) <= sched_domains_numa_distance[i]) + cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); + } + } +} + +static void sched_domains_numa_masks_clear(int cpu) +{ + int i, j; + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) + cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + } +} + +/* + * Update sched_domains_numa_masks[level][node] array when new cpus + * are onlined. + */ +static int sched_domains_numa_masks_update(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + sched_domains_numa_masks_set(cpu); + break; + + case CPU_DEAD: + sched_domains_numa_masks_clear(cpu); + break; + + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} +#else +static inline void sched_init_numa(void) +{ +} + +static int sched_domains_numa_masks_update(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + return 0; +} +#endif /* CONFIG_NUMA */ static int __sdt_alloc(const struct cpumask *cpu_map) { struct sched_domain_topology_level *tl; int j; - for (tl = sched_domain_topology; tl->init; tl++) { + for_each_sd_topology(tl) { struct sd_data *sdd = &tl->data; sdd->sd = alloc_percpu(struct sched_domain *); @@ -6291,14 +6382,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sg) return -ENOMEM; - sdd->sgp = alloc_percpu(struct sched_group_power *); - if (!sdd->sgp) + sdd->sgc = alloc_percpu(struct sched_group_capacity *); + if (!sdd->sgc) return -ENOMEM; for_each_cpu(j, cpu_map) { struct sched_domain *sd; struct sched_group *sg; - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); @@ -6312,14 +6403,16 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sg) return -ENOMEM; + sg->next = sg; + *per_cpu_ptr(sdd->sg, j) = sg; - sgp = kzalloc_node(sizeof(struct sched_group_power), + sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); - if (!sgp) + if (!sgc) return -ENOMEM; - *per_cpu_ptr(sdd->sgp, j) = sgp; + *per_cpu_ptr(sdd->sgc, j) = sgc; } } @@ -6331,40 +6424,49 @@ static void __sdt_free(const struct cpumask *cpu_map) struct sched_domain_topology_level *tl; int j; - for (tl = sched_domain_topology; tl->init; tl++) { + for_each_sd_topology(tl) { struct sd_data *sdd = &tl->data; for_each_cpu(j, cpu_map) { - struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); - if (sd && (sd->flags & SD_OVERLAP)) - free_sched_groups(sd->groups, 0); - kfree(*per_cpu_ptr(sdd->sd, j)); - kfree(*per_cpu_ptr(sdd->sg, j)); - kfree(*per_cpu_ptr(sdd->sgp, j)); + struct sched_domain *sd; + + if (sdd->sd) { + sd = *per_cpu_ptr(sdd->sd, j); + if (sd && (sd->flags & SD_OVERLAP)) + free_sched_groups(sd->groups, 0); + kfree(*per_cpu_ptr(sdd->sd, j)); + } + + if (sdd->sg) + kfree(*per_cpu_ptr(sdd->sg, j)); + if (sdd->sgc) + kfree(*per_cpu_ptr(sdd->sgc, j)); } free_percpu(sdd->sd); + sdd->sd = NULL; free_percpu(sdd->sg); - free_percpu(sdd->sgp); + sdd->sg = NULL; + free_percpu(sdd->sgc); + sdd->sgc = NULL; } } struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, - struct s_data *d, const struct cpumask *cpu_map, - struct sched_domain_attr *attr, struct sched_domain *child, - int cpu) + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *child, int cpu) { - struct sched_domain *sd = tl->init(tl, cpu); + struct sched_domain *sd = sd_init(tl, cpu); if (!sd) return child; - set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; + sd->child = child; } - sd->child = child; + set_domain_attribute(sd, attr); return sd; } @@ -6376,7 +6478,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { - enum s_alloc alloc_state = sa_none; + enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; int i, ret = -ENOMEM; @@ -6390,18 +6492,15 @@ static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_topology_level *tl; sd = NULL; - for (tl = sched_domain_topology; tl->init; tl++) { - sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); + for_each_sd_topology(tl) { + sd = build_sched_domain(tl, cpu_map, attr, sd, i); + if (tl == sched_domain_topology) + *per_cpu_ptr(d.sd, i) = sd; if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; } - - while (sd->child) - sd = sd->child; - - *per_cpu_ptr(d.sd, i) = sd; } /* Build the groups for the domains */ @@ -6418,14 +6517,14 @@ static int build_sched_domains(const struct cpumask *cpu_map, } } - /* Calculate CPU power for physical packages and nodes */ + /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { if (!cpumask_test_cpu(i, cpu_map)) continue; for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { claim_allocations(i, sd); - init_sched_groups_power(i, sd); + init_sched_groups_capacity(i, sd); } } @@ -6460,7 +6559,7 @@ static cpumask_var_t fallback_doms; * cpu core maps. It is supposed to return 1 if the topology changed * or 0 if it stayed the same. */ -int __attribute__((weak)) arch_update_cpu_topology(void) +int __weak arch_update_cpu_topology(void) { return 0; } @@ -6505,7 +6604,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) if (!doms_cur) doms_cur = &fallback_doms; cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); - dattr_cur = NULL; err = build_sched_domains(doms_cur[0], NULL); register_sched_domain_sysctl(); @@ -6597,8 +6695,9 @@ match1: ; } + n = ndoms_cur; if (doms_new == NULL) { - ndoms_cur = 0; + n = 0; doms_new = &fallback_doms; cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); @@ -6606,7 +6705,7 @@ match1: /* Build new domains */ for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < ndoms_cur && !new_topology; j++) { + for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; @@ -6630,125 +6729,66 @@ match2: mutex_unlock(&sched_domains_mutex); } -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -static void reinit_sched_domains(void) -{ - get_online_cpus(); - - /* Destroy domains first to force the rebuild */ - partition_sched_domains(0, NULL, NULL); - - rebuild_sched_domains(); - put_online_cpus(); -} - -static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) -{ - unsigned int level = 0; - - if (sscanf(buf, "%u", &level) != 1) - return -EINVAL; - - /* - * level is always be positive so don't check for - * level < POWERSAVINGS_BALANCE_NONE which is 0 - * What happens on 0 or 1 byte write, - * need to check for count as well? - */ - - if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) - return -EINVAL; - - if (smt) - sched_smt_power_savings = level; - else - sched_mc_power_savings = level; - - reinit_sched_domains(); - - return count; -} - -#ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", sched_mc_power_savings); -} -static ssize_t sched_mc_power_savings_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - return sched_power_savings_store(buf, count, 0); -} -static DEVICE_ATTR(sched_mc_power_savings, 0644, - sched_mc_power_savings_show, - sched_mc_power_savings_store); -#endif - -#ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", sched_smt_power_savings); -} -static ssize_t sched_smt_power_savings_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - return sched_power_savings_store(buf, count, 1); -} -static DEVICE_ATTR(sched_smt_power_savings, 0644, - sched_smt_power_savings_show, - sched_smt_power_savings_store); -#endif - -int __init sched_create_sysfs_power_savings_entries(struct device *dev) -{ - int err = 0; - -#ifdef CONFIG_SCHED_SMT - if (smt_capable()) - err = device_create_file(dev, &dev_attr_sched_smt_power_savings); -#endif -#ifdef CONFIG_SCHED_MC - if (!err && mc_capable()) - err = device_create_file(dev, &dev_attr_sched_mc_power_savings); -#endif - return err; -} -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ /* * Update cpusets according to cpu_active mask. If cpusets are * disabled, cpuset_update_active_cpus() becomes a simple wrapper * around partition_sched_domains(). + * + * If we come here as part of a suspend/resume, don't touch cpusets because we + * want to restore it back to its original state upon resume anyway. */ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { + case CPU_ONLINE_FROZEN: + case CPU_DOWN_FAILED_FROZEN: + + /* + * num_cpus_frozen tracks how many CPUs are involved in suspend + * resume sequence. As long as this is not the last online + * operation in the resume sequence, just build a single sched + * domain, ignoring cpusets. + */ + num_cpus_frozen--; + if (likely(num_cpus_frozen)) { + partition_sched_domains(1, NULL, NULL); + break; + } + + /* + * This is the last CPU online operation. So fall through and + * restore the original sched domains by considering the + * cpuset configurations. + */ + case CPU_ONLINE: case CPU_DOWN_FAILED: - cpuset_update_active_cpus(); - return NOTIFY_OK; + cpuset_update_active_cpus(true); + break; default: return NOTIFY_DONE; } + return NOTIFY_OK; } static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { case CPU_DOWN_PREPARE: - cpuset_update_active_cpus(); - return NOTIFY_OK; + cpuset_update_active_cpus(false); + break; + case CPU_DOWN_PREPARE_FROZEN: + num_cpus_frozen++; + partition_sched_domains(1, NULL, NULL); + break; default: return NOTIFY_DONE; } + return NOTIFY_OK; } void __init sched_init_smp(void) @@ -6758,21 +6798,24 @@ void __init sched_init_smp(void) alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); - get_online_cpus(); + sched_init_numa(); + + /* + * There's no userspace yet to cause hotplug operations; hence all the + * cpu masks are stable and all blatant races in the below code cannot + * happen. + */ mutex_lock(&sched_domains_mutex); init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); - put_online_cpus(); + hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); - /* RT runtime code needs to handle some hotplug events */ - hotcpu_notifier(update_runtime, 0); - init_hrtick(); /* Move init over to a non-isolated CPU */ @@ -6782,6 +6825,7 @@ void __init sched_init_smp(void) free_cpumask_var(non_isolated_cpus); init_sched_rt_class(); + init_sched_dl_class(); } #else void __init sched_init_smp(void) @@ -6800,10 +6844,15 @@ int in_sched_functions(unsigned long addr) } #ifdef CONFIG_CGROUP_SCHED +/* + * Default task group. + * Every task in system belongs to this group at bootup. + */ struct task_group root_task_group; +LIST_HEAD(task_groups); #endif -DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); void __init sched_init(void) { @@ -6840,19 +6889,21 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CPUMASK_OFFSTACK for_each_possible_cpu(i) { - per_cpu(load_balance_tmpmask, i) = (void *)ptr; + per_cpu(load_balance_mask, i) = (void *)ptr; ptr += cpumask_size(); } #endif /* CONFIG_CPUMASK_OFFSTACK */ } + init_rt_bandwidth(&def_rt_bandwidth, + global_rt_period(), global_rt_runtime()); + init_dl_bandwidth(&def_dl_bandwidth, + global_rt_period(), global_rt_runtime()); + #ifdef CONFIG_SMP init_defrootdomain(); #endif - init_rt_bandwidth(&def_rt_bandwidth, - global_rt_period(), global_rt_runtime()); - #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&root_task_group.rt_bandwidth, global_rt_period(), global_rt_runtime()); @@ -6866,12 +6917,6 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - root_cpuacct.cpustat = &kernel_cpustat; - root_cpuacct.cpuusage = alloc_percpu(u64); - /* Too early, not expected to fail */ - BUG_ON(!root_cpuacct.cpuusage); -#endif for_each_possible_cpu(i) { struct rq *rq; @@ -6882,6 +6927,7 @@ void __init sched_init(void) rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt, rq); + init_dl_rq(&rq->dl, rq); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); @@ -6910,7 +6956,6 @@ void __init sched_init(void) rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED - INIT_LIST_HEAD(&rq->leaf_rt_rq_list); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif @@ -6922,7 +6967,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_power = SCHED_POWER_SCALE; + rq->cpu_capacity = SCHED_CAPACITY_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; @@ -6931,10 +6976,17 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; + rq->max_idle_balance_cost = sysctl_sched_migration_cost; + + INIT_LIST_HEAD(&rq->cfs_tasks); + rq_attach_root(rq, &def_root_domain); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON rq->nohz_flags = 0; #endif +#ifdef CONFIG_NO_HZ_FULL + rq->last_sched_tick = 0; +#endif #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); @@ -6946,10 +6998,6 @@ void __init sched_init(void) INIT_HLIST_HEAD(&init_task.preempt_notifiers); #endif -#ifdef CONFIG_RT_MUTEXES - plist_head_init(&init_task.pi_waiters); -#endif - /* * The boot idle thread does lazy MMU switching as well: */ @@ -6976,6 +7024,8 @@ void __init sched_init(void) /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); + idle_thread_set_boot_cpu(); + set_cpu_rq_start_time(); #endif init_sched_fair_class(); @@ -6995,7 +7045,8 @@ void __might_sleep(const char *file, int line, int preempt_offset) static unsigned long prev_jiffy; /* ratelimiting */ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ - if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || + if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && + !is_idle_task(current)) || system_state != SYSTEM_RUNNING || oops_in_progress) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) @@ -7013,6 +7064,13 @@ void __might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); +#ifdef CONFIG_DEBUG_PREEMPT + if (!preempt_count_equals(preempt_offset)) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); } EXPORT_SYMBOL(__might_sleep); @@ -7022,13 +7080,16 @@ EXPORT_SYMBOL(__might_sleep); static void normalize_task(struct rq *rq, struct task_struct *p) { const struct sched_class *prev_class = p->sched_class; + struct sched_attr attr = { + .sched_policy = SCHED_NORMAL, + }; int old_prio = p->prio; int on_rq; on_rq = p->on_rq; if (on_rq) dequeue_task(rq, p, 0); - __setscheduler(rq, p, SCHED_NORMAL, 0); + __setscheduler(rq, p, &attr); if (on_rq) { enqueue_task(rq, p, 0); resched_task(rq->curr); @@ -7058,12 +7119,12 @@ void normalize_rt_tasks(void) p->se.statistics.block_start = 0; #endif - if (!rt_task(p)) { + if (!dl_task(p) && !rt_task(p)) { /* * Renice negative nice level userspace * tasks back to 0: */ - if (TASK_NICE(p) < 0 && p->mm) + if (task_nice(p) < 0 && p->mm) set_user_nice(p, 0); continue; } @@ -7098,6 +7159,8 @@ void normalize_rt_tasks(void) * @cpu: the processor in question. * * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + * + * Return: The current task for @cpu. */ struct task_struct *curr_task(int cpu) { @@ -7145,7 +7208,6 @@ static void free_sched_group(struct task_group *tg) struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; - unsigned long flags; tg = kzalloc(sizeof(*tg), GFP_KERNEL); if (!tg) @@ -7157,6 +7219,17 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + return tg; + +err: + free_sched_group(tg); + return ERR_PTR(-ENOMEM); +} + +void sched_online_group(struct task_group *tg, struct task_group *parent) +{ + unsigned long flags; + spin_lock_irqsave(&task_group_lock, flags); list_add_rcu(&tg->list, &task_groups); @@ -7166,12 +7239,6 @@ struct task_group *sched_create_group(struct task_group *parent) INIT_LIST_HEAD(&tg->children); list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); - - return tg; - -err: - free_sched_group(tg); - return ERR_PTR(-ENOMEM); } /* rcu callback to free various structures associated with a task group */ @@ -7184,6 +7251,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp) /* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&tg->rcu, free_sched_group_rcu); +} + +void sched_offline_group(struct task_group *tg) +{ unsigned long flags; int i; @@ -7195,9 +7268,6 @@ void sched_destroy_group(struct task_group *tg) list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); - - /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group_rcu); } /* change task's runqueue when it moves between groups. @@ -7207,6 +7277,7 @@ void sched_destroy_group(struct task_group *tg) */ void sched_move_task(struct task_struct *tsk) { + struct task_group *tg; int on_rq, running; unsigned long flags; struct rq *rq; @@ -7221,6 +7292,12 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); + tg = container_of(task_css_check(tsk, cpu_cgrp_id, + lockdep_is_held(&tsk->sighand->siglock)), + struct task_group, css); + tg = autogroup_task_group(tsk, tg); + tsk->sched_task_group = tg; + #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) tsk->sched_class->task_move_group(tsk, on_rq); @@ -7237,16 +7314,6 @@ void sched_move_task(struct task_struct *tsk) } #endif /* CONFIG_CGROUP_SCHED */ -#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) -static unsigned long to_ratio(u64 period, u64 runtime) -{ - if (runtime == RUNTIME_INF) - return 1ULL << 20; - - return div64_u64(runtime << 20, period); -} -#endif - #ifdef CONFIG_RT_GROUP_SCHED /* * Ensure that the real time constraints are schedulable. @@ -7375,7 +7442,7 @@ unlock: return err; } -int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) { u64 rt_runtime, rt_period; @@ -7387,7 +7454,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } -long sched_group_rt_runtime(struct task_group *tg) +static long sched_group_rt_runtime(struct task_group *tg) { u64 rt_runtime_us; @@ -7399,7 +7466,7 @@ long sched_group_rt_runtime(struct task_group *tg) return rt_runtime_us; } -int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) { u64 rt_runtime, rt_period; @@ -7412,7 +7479,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } -long sched_group_rt_period(struct task_group *tg) +static long sched_group_rt_period(struct task_group *tg) { u64 rt_period_us; @@ -7420,24 +7487,13 @@ long sched_group_rt_period(struct task_group *tg) do_div(rt_period_us, NSEC_PER_USEC); return rt_period_us; } +#endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_RT_GROUP_SCHED static int sched_rt_global_constraints(void) { - u64 runtime, period; int ret = 0; - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - - runtime = global_rt_runtime(); - period = global_rt_period(); - - /* - * Sanity check on the sysctl variables. - */ - if (runtime > period && runtime != RUNTIME_INF) - return -EINVAL; - mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); ret = __rt_schedulable(NULL, 0, 0); @@ -7447,7 +7503,7 @@ static int sched_rt_global_constraints(void) return ret; } -int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { /* Don't accept realtime tasks when there is no way for them to run */ if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) @@ -7460,17 +7516,7 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) static int sched_rt_global_constraints(void) { unsigned long flags; - int i; - - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - - /* - * There's always some RT tasks in the root group - * -- migration, kstopmachine etc.. - */ - if (sysctl_sched_rt_runtime == 0) - return -EBUSY; + int i, ret = 0; raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { @@ -7482,17 +7528,91 @@ static int sched_rt_global_constraints(void) } raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - return 0; + return ret; } #endif /* CONFIG_RT_GROUP_SCHED */ +static int sched_dl_global_constraints(void) +{ + u64 runtime = global_rt_runtime(); + u64 period = global_rt_period(); + u64 new_bw = to_ratio(period, runtime); + int cpu, ret = 0; + unsigned long flags; + + /* + * Here we want to check the bandwidth not being set to some + * value smaller than the currently allocated bandwidth in + * any of the root_domains. + * + * FIXME: Cycling on all the CPUs is overdoing, but simpler than + * cycling on root_domains... Discussion on different/better + * solutions is welcome! + */ + for_each_possible_cpu(cpu) { + struct dl_bw *dl_b = dl_bw_of(cpu); + + raw_spin_lock_irqsave(&dl_b->lock, flags); + if (new_bw < dl_b->total_bw) + ret = -EBUSY; + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + if (ret) + break; + } + + return ret; +} + +static void sched_dl_do_global(void) +{ + u64 new_bw = -1; + int cpu; + unsigned long flags; + + def_dl_bandwidth.dl_period = global_rt_period(); + def_dl_bandwidth.dl_runtime = global_rt_runtime(); + + if (global_rt_runtime() != RUNTIME_INF) + new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + + /* + * FIXME: As above... + */ + for_each_possible_cpu(cpu) { + struct dl_bw *dl_b = dl_bw_of(cpu); + + raw_spin_lock_irqsave(&dl_b->lock, flags); + dl_b->bw = new_bw; + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + } +} + +static int sched_rt_global_validate(void) +{ + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + if ((sysctl_sched_rt_runtime != RUNTIME_INF) && + (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) + return -EINVAL; + + return 0; +} + +static void sched_rt_do_global(void) +{ + def_rt_bandwidth.rt_runtime = global_rt_runtime(); + def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); +} + int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret; int old_period, old_runtime; static DEFINE_MUTEX(mutex); + int ret; mutex_lock(&mutex); old_period = sysctl_sched_rt_period; @@ -7501,41 +7621,68 @@ int sched_rt_handler(struct ctl_table *table, int write, ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { + ret = sched_rt_global_validate(); + if (ret) + goto undo; + ret = sched_rt_global_constraints(); - if (ret) { - sysctl_sched_rt_period = old_period; - sysctl_sched_rt_runtime = old_runtime; - } else { - def_rt_bandwidth.rt_runtime = global_rt_runtime(); - def_rt_bandwidth.rt_period = - ns_to_ktime(global_rt_period()); - } + if (ret) + goto undo; + + ret = sched_dl_global_constraints(); + if (ret) + goto undo; + + sched_rt_do_global(); + sched_dl_do_global(); + } + if (0) { +undo: + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; } mutex_unlock(&mutex); return ret; } +int sched_rr_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + /* make sure that internally we keep jiffies */ + /* also, writing zero resets timeslice to default */ + if (!ret && write) { + sched_rr_timeslice = sched_rr_timeslice <= 0 ? + RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); + } + mutex_unlock(&mutex); + return ret; +} + #ifdef CONFIG_CGROUP_SCHED -/* return corresponding task_group object of a cgroup */ -static inline struct task_group *cgroup_tg(struct cgroup *cgrp) +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) { - return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), - struct task_group, css); + return css ? container_of(css, struct task_group, css) : NULL; } static struct cgroup_subsys_state * -cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) +cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { - struct task_group *tg, *parent; + struct task_group *parent = css_tg(parent_css); + struct task_group *tg; - if (!cgrp->parent) { + if (!parent) { /* This is early initialization for the top cgroup */ return &root_task_group.css; } - parent = cgroup_tg(cgrp->parent); tg = sched_create_group(parent); if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); @@ -7543,22 +7690,38 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) return &tg->css; } -static void -cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + struct task_group *parent = css_tg(css->parent); + + if (parent) + sched_online_group(tg, parent); + return 0; +} + +static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); sched_destroy_group(tg); } -static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + + sched_offline_group(tg); +} + +static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *task; - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, tset) { #ifdef CONFIG_RT_GROUP_SCHED - if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) + if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ @@ -7569,18 +7732,18 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, return 0; } -static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +static void cpu_cgroup_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *task; - cgroup_taskset_for_each(task, cgrp, tset) + cgroup_taskset_for_each(task, tset) sched_move_task(task); } -static void -cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task) +static void cpu_cgroup_exit(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, + struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. @@ -7594,15 +7757,16 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, } #ifdef CONFIG_FAIR_GROUP_SCHED -static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, - u64 shareval) +static int cpu_shares_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 shareval) { - return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); + return sched_group_set_shares(css_tg(css), scale_load(shareval)); } -static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); return (u64) scale_load_down(tg->shares); } @@ -7646,7 +7810,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) runtime_enabled = quota != RUNTIME_INF; runtime_was_enabled = cfs_b->quota != RUNTIME_INF; - account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); + /* + * If we need to toggle cfs_bandwidth_used, off->on must occur + * before making related changes, and on->off must occur afterwards + */ + if (runtime_enabled && !runtime_was_enabled) + cfs_bandwidth_usage_inc(); raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; @@ -7655,8 +7824,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) /* restart the period timer (if active) to handle new period expiry */ if (runtime_enabled && cfs_b->timer_active) { /* force a reprogram */ - cfs_b->timer_active = 0; - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, true); } raw_spin_unlock_irq(&cfs_b->lock); @@ -7672,6 +7840,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) unthrottle_cfs_rq(cfs_rq); raw_spin_unlock_irq(&rq->lock); } + if (runtime_was_enabled && !runtime_enabled) + cfs_bandwidth_usage_dec(); out_unlock: mutex_unlock(&cfs_constraints_mutex); @@ -7724,26 +7894,28 @@ long tg_get_cfs_period(struct task_group *tg) return cfs_period_us; } -static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) +static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) { - return tg_get_cfs_quota(cgroup_tg(cgrp)); + return tg_get_cfs_quota(css_tg(css)); } -static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, - s64 cfs_quota_us) +static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 cfs_quota_us) { - return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); + return tg_set_cfs_quota(css_tg(css), cfs_quota_us); } -static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { - return tg_get_cfs_period(cgroup_tg(cgrp)); + return tg_get_cfs_period(css_tg(css)); } -static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, - u64 cfs_period_us) +static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 cfs_period_us) { - return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); + return tg_set_cfs_period(css_tg(css), cfs_period_us); } struct cfs_schedulable_data { @@ -7824,15 +7996,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) return ret; } -static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) +static int cpu_stats_show(struct seq_file *sf, void *v) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(seq_css(sf)); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; - cb->fill(cb, "nr_periods", cfs_b->nr_periods); - cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); - cb->fill(cb, "throttled_time", cfs_b->throttled_time); + seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); + seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); + seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); return 0; } @@ -7840,26 +8011,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED -static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, - s64 val) +static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, + struct cftype *cft, s64 val) { - return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); + return sched_group_set_rt_runtime(css_tg(css), val); } -static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) +static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return sched_group_rt_runtime(cgroup_tg(cgrp)); + return sched_group_rt_runtime(css_tg(css)); } -static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, - u64 rt_period_us) +static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 rt_period_us) { - return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); + return sched_group_set_rt_period(css_tg(css), rt_period_us); } -static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, + struct cftype *cft) { - return sched_group_rt_period(cgroup_tg(cgrp)); + return sched_group_rt_period(css_tg(css)); } #endif /* CONFIG_RT_GROUP_SCHED */ @@ -7884,7 +8057,7 @@ static struct cftype cpu_files[] = { }, { .name = "stat", - .read_map = cpu_stats_show, + .seq_show = cpu_stats_show, }, #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -7899,247 +8072,25 @@ static struct cftype cpu_files[] = { .write_u64 = cpu_rt_period_write_uint, }, #endif + { } /* terminate */ }; -static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); -} - -struct cgroup_subsys cpu_cgroup_subsys = { - .name = "cpu", - .create = cpu_cgroup_create, - .destroy = cpu_cgroup_destroy, +struct cgroup_subsys cpu_cgrp_subsys = { + .css_alloc = cpu_cgroup_css_alloc, + .css_free = cpu_cgroup_css_free, + .css_online = cpu_cgroup_css_online, + .css_offline = cpu_cgroup_css_offline, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, - .populate = cpu_cgroup_populate, - .subsys_id = cpu_cgroup_subsys_id, + .base_cftypes = cpu_files, .early_init = 1, }; #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - -/* - * CPU accounting code for task groups. - * - * Based on the work by Paul Menage (menage@google.com) and Balbir Singh - * (balbir@in.ibm.com). - */ - -/* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_create( - struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - struct cpuacct *ca; - - if (!cgrp->parent) - return &root_cpuacct.css; - - ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) - goto out; - - ca->cpuusage = alloc_percpu(u64); - if (!ca->cpuusage) - goto out_free_ca; - - ca->cpustat = alloc_percpu(struct kernel_cpustat); - if (!ca->cpustat) - goto out_free_cpuusage; - - return &ca->css; - -out_free_cpuusage: - free_percpu(ca->cpuusage); -out_free_ca: - kfree(ca); -out: - return ERR_PTR(-ENOMEM); -} - -/* destroy an existing cpu accounting group */ -static void -cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - - free_percpu(ca->cpustat); - free_percpu(ca->cpuusage); - kfree(ca); -} - -static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - u64 data; - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit read safe on 32-bit platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - data = *cpuusage; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - data = *cpuusage; -#endif - - return data; -} - -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit write safe on 32-bit platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - *cpuusage = val; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - *cpuusage = val; -#endif -} - -/* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - u64 totalcpuusage = 0; - int i; - - for_each_present_cpu(i) - totalcpuusage += cpuacct_cpuusage_read(ca, i); - - return totalcpuusage; -} - -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, - u64 reset) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int err = 0; - int i; - - if (reset) { - err = -EINVAL; - goto out; - } - - for_each_present_cpu(i) - cpuacct_cpuusage_write(ca, i, 0); - -out: - return err; -} - -static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) -{ - struct cpuacct *ca = cgroup_ca(cgroup); - u64 percpu; - int i; - - for_each_present_cpu(i) { - percpu = cpuacct_cpuusage_read(ca, i); - seq_printf(m, "%llu ", (unsigned long long) percpu); - } - seq_printf(m, "\n"); - return 0; -} - -static const char *cpuacct_stat_desc[] = { - [CPUACCT_STAT_USER] = "user", - [CPUACCT_STAT_SYSTEM] = "system", -}; - -static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int cpu; - s64 val = 0; - - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_USER]; - val += kcpustat->cpustat[CPUTIME_NICE]; - } - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); - - val = 0; - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_SYSTEM]; - val += kcpustat->cpustat[CPUTIME_IRQ]; - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; - } - - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); - - return 0; -} - -static struct cftype files[] = { - { - .name = "usage", - .read_u64 = cpuusage_read, - .write_u64 = cpuusage_write, - }, - { - .name = "usage_percpu", - .read_seq_string = cpuacct_percpu_seq_read, - }, - { - .name = "stat", - .read_map = cpuacct_stats_show, - }, -}; - -static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) +void dump_cpu_task(int cpu) { - return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); + pr_info("Task dump for CPU %d:\n", cpu); + sched_show_task(cpu_curr(cpu)); } - -/* - * charge this task's execution time to its accounting group. - * - * called with rq->lock held. - */ -void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ - struct cpuacct *ca; - int cpu; - - if (unlikely(!cpuacct_subsys.active)) - return; - - cpu = task_cpu(tsk); - - rcu_read_lock(); - - ca = task_ca(tsk); - - for (; ca; ca = parent_ca(ca)) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - *cpuusage += cputime; - } - - rcu_read_unlock(); -} - -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", - .create = cpuacct_create, - .destroy = cpuacct_destroy, - .populate = cpuacct_populate, - .subsys_id = cpuacct_subsys_id, -}; -#endif /* CONFIG_CGROUP_CPUACCT */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c new file mode 100644 index 00000000000..9cf350c94ec --- /dev/null +++ b/kernel/sched/cpuacct.c @@ -0,0 +1,283 @@ +#include <linux/cgroup.h> +#include <linux/slab.h> +#include <linux/percpu.h> +#include <linux/spinlock.h> +#include <linux/cpumask.h> +#include <linux/seq_file.h> +#include <linux/rcupdate.h> +#include <linux/kernel_stat.h> +#include <linux/err.h> + +#include "sched.h" + +/* + * CPU accounting code for task groups. + * + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh + * (balbir@in.ibm.com). + */ + +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { + CPUACCT_STAT_USER, /* ... user mode */ + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ + + CPUACCT_STAT_NSTATS, +}; + +/* track cpu usage of a group of tasks and its child groups */ +struct cpuacct { + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every cpu */ + u64 __percpu *cpuusage; + struct kernel_cpustat __percpu *cpustat; +}; + +static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct cpuacct, css) : NULL; +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ + return css_ca(task_css(tsk, cpuacct_cgrp_id)); +} + +static inline struct cpuacct *parent_ca(struct cpuacct *ca) +{ + return css_ca(ca->css.parent); +} + +static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); +static struct cpuacct root_cpuacct = { + .cpustat = &kernel_cpustat, + .cpuusage = &root_cpuacct_cpuusage, +}; + +/* create a new cpu accounting group */ +static struct cgroup_subsys_state * +cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct cpuacct *ca; + + if (!parent_css) + return &root_cpuacct.css; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + goto out; + + ca->cpuusage = alloc_percpu(u64); + if (!ca->cpuusage) + goto out_free_ca; + + ca->cpustat = alloc_percpu(struct kernel_cpustat); + if (!ca->cpustat) + goto out_free_cpuusage; + + return &ca->css; + +out_free_cpuusage: + free_percpu(ca->cpuusage); +out_free_ca: + kfree(ca); +out: + return ERR_PTR(-ENOMEM); +} + +/* destroy an existing cpu accounting group */ +static void cpuacct_css_free(struct cgroup_subsys_state *css) +{ + struct cpuacct *ca = css_ca(css); + + free_percpu(ca->cpustat); + free_percpu(ca->cpuusage); + kfree(ca); +} + +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + +/* return total cpu usage (in nanoseconds) of a group */ +static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct cpuacct *ca = css_ca(css); + u64 totalcpuusage = 0; + int i; + + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); + + return totalcpuusage; +} + +static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 reset) +{ + struct cpuacct *ca = css_ca(css); + int err = 0; + int i; + + if (reset) { + err = -EINVAL; + goto out; + } + + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); + +out: + return err; +} + +static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) +{ + struct cpuacct *ca = css_ca(seq_css(m)); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + +static const char * const cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", +}; + +static int cpuacct_stats_show(struct seq_file *sf, void *v) +{ + struct cpuacct *ca = css_ca(seq_css(sf)); + int cpu; + s64 val = 0; + + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_USER]; + val += kcpustat->cpustat[CPUTIME_NICE]; + } + val = cputime64_to_clock_t(val); + seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); + + val = 0; + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_SYSTEM]; + val += kcpustat->cpustat[CPUTIME_IRQ]; + val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; + } + + val = cputime64_to_clock_t(val); + seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + + return 0; +} + +static struct cftype files[] = { + { + .name = "usage", + .read_u64 = cpuusage_read, + .write_u64 = cpuusage_write, + }, + { + .name = "usage_percpu", + .seq_show = cpuacct_percpu_seq_show, + }, + { + .name = "stat", + .seq_show = cpuacct_stats_show, + }, + { } /* terminate */ +}; + +/* + * charge this task's execution time to its accounting group. + * + * called with rq->lock held. + */ +void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ + struct cpuacct *ca; + int cpu; + + cpu = task_cpu(tsk); + + rcu_read_lock(); + + ca = task_ca(tsk); + + while (true) { + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + *cpuusage += cputime; + + ca = parent_ca(ca); + if (!ca) + break; + } + + rcu_read_unlock(); +} + +/* + * Add user/system time to cpuacct. + * + * Note: it's the caller that updates the account of the root cgroup. + */ +void cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ + struct kernel_cpustat *kcpustat; + struct cpuacct *ca; + + rcu_read_lock(); + ca = task_ca(p); + while (ca != &root_cpuacct) { + kcpustat = this_cpu_ptr(ca->cpustat); + kcpustat->cpustat[index] += val; + ca = parent_ca(ca); + } + rcu_read_unlock(); +} + +struct cgroup_subsys cpuacct_cgrp_subsys = { + .css_alloc = cpuacct_css_alloc, + .css_free = cpuacct_css_free, + .base_cftypes = files, + .early_init = 1, +}; diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h new file mode 100644 index 00000000000..ed605624a5e --- /dev/null +++ b/kernel/sched/cpuacct.h @@ -0,0 +1,17 @@ +#ifdef CONFIG_CGROUP_CPUACCT + +extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); +extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); + +#else + +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ +} + +static inline void +cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ +} + +#endif diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c new file mode 100644 index 00000000000..bd95963dae8 --- /dev/null +++ b/kernel/sched/cpudeadline.c @@ -0,0 +1,229 @@ +/* + * kernel/sched/cpudl.c + * + * Global CPU deadline management + * + * Author: Juri Lelli <j.lelli@sssup.it> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/gfp.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include "cpudeadline.h" + +static inline int parent(int i) +{ + return (i - 1) >> 1; +} + +static inline int left_child(int i) +{ + return (i << 1) + 1; +} + +static inline int right_child(int i) +{ + return (i << 1) + 2; +} + +static inline int dl_time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static void cpudl_exchange(struct cpudl *cp, int a, int b) +{ + int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; + + swap(cp->elements[a].cpu, cp->elements[b].cpu); + swap(cp->elements[a].dl , cp->elements[b].dl ); + + swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); +} + +static void cpudl_heapify(struct cpudl *cp, int idx) +{ + int l, r, largest; + + /* adapted from lib/prio_heap.c */ + while(1) { + l = left_child(idx); + r = right_child(idx); + largest = idx; + + if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, + cp->elements[l].dl)) + largest = l; + if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, + cp->elements[r].dl)) + largest = r; + if (largest == idx) + break; + + /* Push idx down the heap one level and bump one up */ + cpudl_exchange(cp, largest, idx); + idx = largest; + } +} + +static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) +{ + WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); + + if (dl_time_before(new_dl, cp->elements[idx].dl)) { + cp->elements[idx].dl = new_dl; + cpudl_heapify(cp, idx); + } else { + cp->elements[idx].dl = new_dl; + while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, + cp->elements[idx].dl)) { + cpudl_exchange(cp, idx, parent(idx)); + idx = parent(idx); + } + } +} + +static inline int cpudl_maximum(struct cpudl *cp) +{ + return cp->elements[0].cpu; +} + +/* + * cpudl_find - find the best (later-dl) CPU in the system + * @cp: the cpudl max-heap context + * @p: the task + * @later_mask: a mask to fill in with the selected CPUs (or NULL) + * + * Returns: int - best CPU (heap maximum if suitable) + */ +int cpudl_find(struct cpudl *cp, struct task_struct *p, + struct cpumask *later_mask) +{ + int best_cpu = -1; + const struct sched_dl_entity *dl_se = &p->dl; + + if (later_mask && cpumask_and(later_mask, cp->free_cpus, + &p->cpus_allowed) && cpumask_and(later_mask, + later_mask, cpu_active_mask)) { + best_cpu = cpumask_any(later_mask); + goto out; + } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && + dl_time_before(dl_se->deadline, cp->elements[0].dl)) { + best_cpu = cpudl_maximum(cp); + if (later_mask) + cpumask_set_cpu(best_cpu, later_mask); + } + +out: + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); + + return best_cpu; +} + +/* + * cpudl_set - update the cpudl max-heap + * @cp: the cpudl max-heap context + * @cpu: the target cpu + * @dl: the new earliest deadline for this cpu + * + * Notes: assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) +{ + int old_idx, new_cpu; + unsigned long flags; + + WARN_ON(!cpu_present(cpu)); + + raw_spin_lock_irqsave(&cp->lock, flags); + old_idx = cp->elements[cpu].idx; + if (!is_valid) { + /* remove item */ + if (old_idx == IDX_INVALID) { + /* + * Nothing to remove if old_idx was invalid. + * This could happen if a rq_offline_dl is + * called for a CPU without -dl tasks running. + */ + goto out; + } + new_cpu = cp->elements[cp->size - 1].cpu; + cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; + cp->elements[old_idx].cpu = new_cpu; + cp->size--; + cp->elements[new_cpu].idx = old_idx; + cp->elements[cpu].idx = IDX_INVALID; + while (old_idx > 0 && dl_time_before( + cp->elements[parent(old_idx)].dl, + cp->elements[old_idx].dl)) { + cpudl_exchange(cp, old_idx, parent(old_idx)); + old_idx = parent(old_idx); + } + cpumask_set_cpu(cpu, cp->free_cpus); + cpudl_heapify(cp, old_idx); + + goto out; + } + + if (old_idx == IDX_INVALID) { + cp->size++; + cp->elements[cp->size - 1].dl = 0; + cp->elements[cp->size - 1].cpu = cpu; + cp->elements[cpu].idx = cp->size - 1; + cpudl_change_key(cp, cp->size - 1, dl); + cpumask_clear_cpu(cpu, cp->free_cpus); + } else { + cpudl_change_key(cp, old_idx, dl); + } + +out: + raw_spin_unlock_irqrestore(&cp->lock, flags); +} + +/* + * cpudl_init - initialize the cpudl structure + * @cp: the cpudl max-heap context + */ +int cpudl_init(struct cpudl *cp) +{ + int i; + + memset(cp, 0, sizeof(*cp)); + raw_spin_lock_init(&cp->lock); + cp->size = 0; + + cp->elements = kcalloc(nr_cpu_ids, + sizeof(struct cpudl_item), + GFP_KERNEL); + if (!cp->elements) + return -ENOMEM; + + if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { + kfree(cp->elements); + return -ENOMEM; + } + + for_each_possible_cpu(i) + cp->elements[i].idx = IDX_INVALID; + + cpumask_setall(cp->free_cpus); + + return 0; +} + +/* + * cpudl_cleanup - clean up the cpudl structure + * @cp: the cpudl max-heap context + */ +void cpudl_cleanup(struct cpudl *cp) +{ + free_cpumask_var(cp->free_cpus); + kfree(cp->elements); +} diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h new file mode 100644 index 00000000000..538c9796ad4 --- /dev/null +++ b/kernel/sched/cpudeadline.h @@ -0,0 +1,33 @@ +#ifndef _LINUX_CPUDL_H +#define _LINUX_CPUDL_H + +#include <linux/sched.h> + +#define IDX_INVALID -1 + +struct cpudl_item { + u64 dl; + int cpu; + int idx; +}; + +struct cpudl { + raw_spinlock_t lock; + int size; + cpumask_var_t free_cpus; + struct cpudl_item *elements; +}; + + +#ifdef CONFIG_SMP +int cpudl_find(struct cpudl *cp, struct task_struct *p, + struct cpumask *later_mask); +void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); +int cpudl_init(struct cpudl *cp); +void cpudl_cleanup(struct cpudl *cp); +#else +#define cpudl_set(cp, cpu, dl) do { } while (0) +#define cpudl_init() do { } while (0) +#endif /* CONFIG_SMP */ + +#endif /* _LINUX_CPUDL_H */ diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index d72586fdf66..981fcd7dc39 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -28,6 +28,9 @@ */ #include <linux/gfp.h> +#include <linux/sched.h> +#include <linux/sched/rt.h> +#include <linux/slab.h> #include "cpupri.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ @@ -60,16 +63,15 @@ static int convert_prio(int prio) * any discrepancies created by racing against the uncertainty of the current * priority configuration. * - * Returns: (int)bool - CPUs were found + * Return: (int)bool - CPUs were found */ int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask) { - int idx = 0; - int task_pri = convert_prio(p->prio); + int idx = 0; + int task_pri = convert_prio(p->prio); - if (task_pri >= MAX_RT_PRIO) - return 0; + BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); for (idx = 0; idx < task_pri; idx++) { struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; @@ -137,9 +139,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, */ void cpupri_set(struct cpupri *cp, int cpu, int newpri) { - int *currpri = &cp->cpu_to_pri[cpu]; - int oldpri = *currpri; - int do_mb = 0; + int *currpri = &cp->cpu_to_pri[cpu]; + int oldpri = *currpri; + int do_mb = 0; newpri = convert_prio(newpri); @@ -163,7 +165,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * do a write memory barrier, and then update the count, to * make sure the vector is visible when count is set. */ - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(&(vec)->count); do_mb = 1; } @@ -183,14 +185,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * the new priority vec. */ if (do_mb) - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); /* * When removing from the vector, we decrement the counter first * do a memory barrier and then clear the mask. */ atomic_dec(&(vec)->count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); cpumask_clear_cpu(cpu, vec->mask); } @@ -201,7 +203,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * cpupri_init - initialize the cpupri structure * @cp: The cpupri context * - * Returns: -ENOMEM if memory fails. + * Return: -ENOMEM on memory allocation failure. */ int cpupri_init(struct cpupri *cp) { @@ -217,8 +219,13 @@ int cpupri_init(struct cpupri *cp) goto cleanup; } + cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL); + if (!cp->cpu_to_pri) + goto cleanup; + for_each_possible_cpu(i) cp->cpu_to_pri[i] = CPUPRI_INVALID; + return 0; cleanup: @@ -235,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp) { int i; + kfree(cp->cpu_to_pri); for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) free_cpumask_var(cp->pri_to_cpu[i].mask); } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index f6d75617349..6b033347fdf 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -17,7 +17,7 @@ struct cpupri_vec { struct cpupri { struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; - int cpu_to_pri[NR_CPUS]; + int *cpu_to_pri; }; #ifdef CONFIG_SMP diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c new file mode 100644 index 00000000000..72fdf06ef86 --- /dev/null +++ b/kernel/sched/cputime.c @@ -0,0 +1,838 @@ +#include <linux/export.h> +#include <linux/sched.h> +#include <linux/tsacct_kern.h> +#include <linux/kernel_stat.h> +#include <linux/static_key.h> +#include <linux/context_tracking.h> +#include "sched.h" + + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in vtime_account, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/vtime_account on this CPU. We would either get old + * or new value with a side effect of accounting a slice of irq time to wrong + * task when irq is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each irq in account_system_time. + */ +DEFINE_PER_CPU(u64, cpu_hardirq_time); +DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 0; +} + +#ifndef CONFIG_64BIT +DEFINE_PER_CPU(seqcount_t, irq_time_seq); +#endif /* CONFIG_64BIT */ + +/* + * Called before incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. + */ +void irqtime_account_irq(struct task_struct *curr) +{ + unsigned long flags; + s64 delta; + int cpu; + + if (!sched_clock_irqtime) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); + __this_cpu_add(irq_start_time, delta); + + irq_time_write_begin(); + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to ksoftirqd thread + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. + */ + if (hardirq_count()) + __this_cpu_add(cpu_hardirq_time, delta); + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + __this_cpu_add(cpu_softirq_time, delta); + + irq_time_write_end(); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(irqtime_account_irq); + +static int irqtime_account_hi_update(void) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_hardirq_time); + if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) + ret = 1; + local_irq_restore(flags); + return ret; +} + +static int irqtime_account_si_update(void) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_softirq_time); + if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) + ret = 1; + local_irq_restore(flags); + return ret; +} + +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#define sched_clock_irqtime (0) + +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ + +static inline void task_group_account_field(struct task_struct *p, int index, + u64 tmp) +{ + /* + * Since all updates are sure to touch the root cgroup, we + * get ourselves ahead and touch it first. If the root cgroup + * is the only cgroup, then nothing else should be necessary. + * + */ + __this_cpu_add(kernel_cpustat.cpustat[index], tmp); + + cpuacct_account_field(p, index, tmp); +} + +/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_user_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + int index; + + /* Add user time to process. */ + p->utime += cputime; + p->utimescaled += cputime_scaled; + account_group_user_time(p, cputime); + + index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; + + /* Add user time to cpustat. */ + task_group_account_field(p, index, (__force u64) cputime); + + /* Account for user time used */ + acct_account_cputime(p); +} + +/* + * Account guest cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +static void account_guest_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + /* Add guest time to process. */ + p->utime += cputime; + p->utimescaled += cputime_scaled; + account_group_user_time(p, cputime); + p->gtime += cputime; + + /* Add guest time to cpustat. */ + if (task_nice(p) > 0) { + cpustat[CPUTIME_NICE] += (__force u64) cputime; + cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; + } else { + cpustat[CPUTIME_USER] += (__force u64) cputime; + cpustat[CPUTIME_GUEST] += (__force u64) cputime; + } +} + +/* + * Account system cpu time to a process and desired cpustat field + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * @target_cputime64: pointer to cpustat field that has to be updated + */ +static inline +void __account_system_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled, int index) +{ + /* Add system time to process. */ + p->stime += cputime; + p->stimescaled += cputime_scaled; + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ + task_group_account_field(p, index, (__force u64) cputime); + + /* Account for system time used */ + acct_account_cputime(p); +} + +/* + * Account system cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_system_time(struct task_struct *p, int hardirq_offset, + cputime_t cputime, cputime_t cputime_scaled) +{ + int index; + + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { + account_guest_time(p, cputime, cputime_scaled); + return; + } + + if (hardirq_count() - hardirq_offset) + index = CPUTIME_IRQ; + else if (in_serving_softirq()) + index = CPUTIME_SOFTIRQ; + else + index = CPUTIME_SYSTEM; + + __account_system_time(p, cputime, cputime_scaled, index); +} + +/* + * Account for involuntary wait time. + * @cputime: the cpu time spent in involuntary wait + */ +void account_steal_time(cputime_t cputime) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + cpustat[CPUTIME_STEAL] += (__force u64) cputime; +} + +/* + * Account for idle time. + * @cputime: the cpu time spent in idle wait + */ +void account_idle_time(cputime_t cputime) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + struct rq *rq = this_rq(); + + if (atomic_read(&rq->nr_iowait) > 0) + cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; + else + cpustat[CPUTIME_IDLE] += (__force u64) cputime; +} + +static __always_inline bool steal_account_process_tick(void) +{ +#ifdef CONFIG_PARAVIRT + if (static_key_false(¶virt_steal_enabled)) { + u64 steal; + cputime_t steal_ct; + + steal = paravirt_steal_clock(smp_processor_id()); + steal -= this_rq()->prev_steal_time; + + /* + * cputime_t may be less precise than nsecs (eg: if it's + * based on jiffies). Lets cast the result to cputime + * granularity and account the rest on the next rounds. + */ + steal_ct = nsecs_to_cputime(steal); + this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); + + account_steal_time(steal_ct); + return steal_ct; + } +#endif + return false; +} + +/* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. + */ +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +{ + struct signal_struct *sig = tsk->signal; + cputime_t utime, stime; + struct task_struct *t; + + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + rcu_read_lock(); + /* make sure we can trust tsk->thread_group list */ + if (!likely(pid_alive(tsk))) + goto out; + + t = tsk; + do { + task_cputime(t, &utime, &stime); + times->utime += utime; + times->stime += stime; + times->sum_exec_runtime += task_sched_runtime(t); + } while_each_thread(tsk, t); +out: + rcu_read_unlock(); +} + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * Account a tick to a process and cpustat + * @p: the process that the cpu time gets accounted to + * @user_tick: is the tick from userspace + * @rq: the pointer to rq + * + * Tick demultiplexing follows the order + * - pending hardirq update + * - pending softirq update + * - user_time + * - idle_time + * - system time + * - check for guest_time + * - else account as system_time + * + * Check for hardirq is done both for system and user time as there is + * no timer going off while we are on hardirq and hence we may never get an + * opportunity to update it solely in system time. + * p->stime and friends are only updated on system time and not on irq + * softirq as those do not count in task exec_runtime any more. + */ +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq, int ticks) +{ + cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); + u64 cputime = (__force u64) cputime_one_jiffy; + u64 *cpustat = kcpustat_this_cpu->cpustat; + + if (steal_account_process_tick()) + return; + + cputime *= ticks; + scaled *= ticks; + + if (irqtime_account_hi_update()) { + cpustat[CPUTIME_IRQ] += cputime; + } else if (irqtime_account_si_update()) { + cpustat[CPUTIME_SOFTIRQ] += cputime; + } else if (this_cpu_ksoftirqd() == p) { + /* + * ksoftirqd time do not get accounted in cpu_softirq_time. + * So, we have to handle it separately here. + * Also, p->stime needs to be updated for ksoftirqd. + */ + __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ); + } else if (user_tick) { + account_user_time(p, cputime, scaled); + } else if (p == rq->idle) { + account_idle_time(cputime); + } else if (p->flags & PF_VCPU) { /* System time or guest time */ + account_guest_time(p, cputime, scaled); + } else { + __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM); + } +} + +static void irqtime_account_idle_ticks(int ticks) +{ + struct rq *rq = this_rq(); + + irqtime_account_process_tick(current, 0, rq, ticks); +} +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +static inline void irqtime_account_idle_ticks(int ticks) {} +static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq, int nr_ticks) {} +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +/* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + +#ifndef __ARCH_HAS_VTIME_TASK_SWITCH +void vtime_common_task_switch(struct task_struct *prev) +{ + if (is_idle_task(prev)) + vtime_account_idle(prev); + else + vtime_account_system(prev); + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + vtime_account_user(prev); +#endif + arch_vtime_task_switch(prev); +} +#endif + +/* + * Archs that account the whole time spent in the idle task + * (outside irq) as idle time can rely on this and just implement + * vtime_account_system() and vtime_account_idle(). Archs that + * have other meaning of the idle time (s390 only includes the + * time spent by the CPU when it's in low power mode) must override + * vtime_account(). + */ +#ifndef __ARCH_HAS_VTIME_ACCOUNT +void vtime_common_account_irq_enter(struct task_struct *tsk) +{ + if (!in_interrupt()) { + /* + * If we interrupted user, context_tracking_in_user() + * is 1 because the context tracking don't hook + * on irq entry/exit. This way we know if + * we need to flush user time on kernel entry. + */ + if (context_tracking_in_user()) { + vtime_account_user(tsk); + return; + } + + if (is_idle_task(tsk)) { + vtime_account_idle(tsk); + return; + } + } + vtime_account_system(tsk); +} +EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); +#endif /* __ARCH_HAS_VTIME_ACCOUNT */ +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + *ut = p->utime; + *st = p->stime; +} + +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; +} +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ +/* + * Account a single tick of cpu time. + * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + struct rq *rq = this_rq(); + + if (vtime_accounting_enabled()) + return; + + if (sched_clock_irqtime) { + irqtime_account_process_tick(p, user_tick, rq, 1); + return; + } + + if (steal_account_process_tick()) + return; + + if (user_tick) + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) + account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, + one_jiffy_scaled); + else + account_idle_time(cputime_one_jiffy); +} + +/* + * Account multiple ticks of steal time. + * @p: the process from which the cpu time has been stolen + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. + * @ticks: number of stolen ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + + if (sched_clock_irqtime) { + irqtime_account_idle_ticks(ticks); + return; + } + + account_idle_time(jiffies_to_cputime(ticks)); +} + +/* + * Perform (stime * rtime) / total, but avoid multiplication overflow by + * loosing precision when the numbers are big. + */ +static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) +{ + u64 scaled; + + for (;;) { + /* Make sure "rtime" is the bigger of stime/rtime */ + if (stime > rtime) + swap(rtime, stime); + + /* Make sure 'total' fits in 32 bits */ + if (total >> 32) + goto drop_precision; + + /* Does rtime (and thus stime) fit in 32 bits? */ + if (!(rtime >> 32)) + break; + + /* Can we just balance rtime/stime rather than dropping bits? */ + if (stime >> 31) + goto drop_precision; + + /* We can grow stime and shrink rtime and try to make them both fit */ + stime <<= 1; + rtime >>= 1; + continue; + +drop_precision: + /* We drop from rtime, it has more bits than stime */ + rtime >>= 1; + total >>= 1; + } + + /* + * Make sure gcc understands that this is a 32x32->64 multiply, + * followed by a 64/32->64 divide. + */ + scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); + return (__force cputime_t) scaled; +} + +/* + * Adjust tick based cputime random precision against scheduler + * runtime accounting. + */ +static void cputime_adjust(struct task_cputime *curr, + struct cputime *prev, + cputime_t *ut, cputime_t *st) +{ + cputime_t rtime, stime, utime; + + /* + * Tick based cputime accounting depend on random scheduling + * timeslices of a task to be interrupted or not by the timer. + * Depending on these circumstances, the number of these interrupts + * may be over or under-optimistic, matching the real user and system + * cputime with a variable precision. + * + * Fix this by scaling these tick based values against the total + * runtime accounted by the CFS scheduler. + */ + rtime = nsecs_to_cputime(curr->sum_exec_runtime); + + /* + * Update userspace visible utime/stime values only if actual execution + * time is bigger than already exported. Note that can happen, that we + * provided bigger values due to scaling inaccuracy on big numbers. + */ + if (prev->stime + prev->utime >= rtime) + goto out; + + stime = curr->stime; + utime = curr->utime; + + if (utime == 0) { + stime = rtime; + } else if (stime == 0) { + utime = rtime; + } else { + cputime_t total = stime + utime; + + stime = scale_stime((__force u64)stime, + (__force u64)rtime, (__force u64)total); + utime = rtime - stime; + } + + /* + * If the tick based count grows faster than the scheduler one, + * the result of the scaling may go backward. + * Let's enforce monotonicity. + */ + prev->stime = max(prev->stime, stime); + prev->utime = max(prev->utime, utime); + +out: + *ut = prev->utime; + *st = prev->stime; +} + +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime = { + .sum_exec_runtime = p->se.sum_exec_runtime, + }; + + task_cputime(p, &cputime.utime, &cputime.stime); + cputime_adjust(&cputime, &p->prev_cputime, ut, st); +} + +/* + * Must be called with siglock held. + */ +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); +} +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +static unsigned long long vtime_delta(struct task_struct *tsk) +{ + unsigned long long clock; + + clock = local_clock(); + if (clock < tsk->vtime_snap) + return 0; + + return clock - tsk->vtime_snap; +} + +static cputime_t get_vtime_delta(struct task_struct *tsk) +{ + unsigned long long delta = vtime_delta(tsk); + + WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING); + tsk->vtime_snap += delta; + + /* CHECKME: always safe to convert nsecs to cputime? */ + return nsecs_to_cputime(delta); +} + +static void __vtime_account_system(struct task_struct *tsk) +{ + cputime_t delta_cpu = get_vtime_delta(tsk); + + account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); +} + +void vtime_account_system(struct task_struct *tsk) +{ + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_gen_account_irq_exit(struct task_struct *tsk) +{ + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + if (context_tracking_in_user()) + tsk->vtime_snap_whence = VTIME_USER; + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_account_user(struct task_struct *tsk) +{ + cputime_t delta_cpu; + + write_seqlock(&tsk->vtime_seqlock); + delta_cpu = get_vtime_delta(tsk); + tsk->vtime_snap_whence = VTIME_SYS; + account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_user_enter(struct task_struct *tsk) +{ + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + tsk->vtime_snap_whence = VTIME_USER; + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_guest_enter(struct task_struct *tsk) +{ + /* + * The flags must be updated under the lock with + * the vtime_snap flush and update. + * That enforces a right ordering and update sequence + * synchronization against the reader (task_gtime()) + * that can thus safely catch up with a tickless delta. + */ + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + current->flags |= PF_VCPU; + write_sequnlock(&tsk->vtime_seqlock); +} +EXPORT_SYMBOL_GPL(vtime_guest_enter); + +void vtime_guest_exit(struct task_struct *tsk) +{ + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + current->flags &= ~PF_VCPU; + write_sequnlock(&tsk->vtime_seqlock); +} +EXPORT_SYMBOL_GPL(vtime_guest_exit); + +void vtime_account_idle(struct task_struct *tsk) +{ + cputime_t delta_cpu = get_vtime_delta(tsk); + + account_idle_time(delta_cpu); +} + +void arch_vtime_task_switch(struct task_struct *prev) +{ + write_seqlock(&prev->vtime_seqlock); + prev->vtime_snap_whence = VTIME_SLEEPING; + write_sequnlock(&prev->vtime_seqlock); + + write_seqlock(¤t->vtime_seqlock); + current->vtime_snap_whence = VTIME_SYS; + current->vtime_snap = sched_clock_cpu(smp_processor_id()); + write_sequnlock(¤t->vtime_seqlock); +} + +void vtime_init_idle(struct task_struct *t, int cpu) +{ + unsigned long flags; + + write_seqlock_irqsave(&t->vtime_seqlock, flags); + t->vtime_snap_whence = VTIME_SYS; + t->vtime_snap = sched_clock_cpu(cpu); + write_sequnlock_irqrestore(&t->vtime_seqlock, flags); +} + +cputime_t task_gtime(struct task_struct *t) +{ + unsigned int seq; + cputime_t gtime; + + do { + seq = read_seqbegin(&t->vtime_seqlock); + + gtime = t->gtime; + if (t->flags & PF_VCPU) + gtime += vtime_delta(t); + + } while (read_seqretry(&t->vtime_seqlock, seq)); + + return gtime; +} + +/* + * Fetch cputime raw values from fields of task_struct and + * add up the pending nohz execution time since the last + * cputime snapshot. + */ +static void +fetch_task_cputime(struct task_struct *t, + cputime_t *u_dst, cputime_t *s_dst, + cputime_t *u_src, cputime_t *s_src, + cputime_t *udelta, cputime_t *sdelta) +{ + unsigned int seq; + unsigned long long delta; + + do { + *udelta = 0; + *sdelta = 0; + + seq = read_seqbegin(&t->vtime_seqlock); + + if (u_dst) + *u_dst = *u_src; + if (s_dst) + *s_dst = *s_src; + + /* Task is sleeping, nothing to add */ + if (t->vtime_snap_whence == VTIME_SLEEPING || + is_idle_task(t)) + continue; + + delta = vtime_delta(t); + + /* + * Task runs either in user or kernel space, add pending nohz time to + * the right place. + */ + if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) { + *udelta = delta; + } else { + if (t->vtime_snap_whence == VTIME_SYS) + *sdelta = delta; + } + } while (read_seqretry(&t->vtime_seqlock, seq)); +} + + +void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) +{ + cputime_t udelta, sdelta; + + fetch_task_cputime(t, utime, stime, &t->utime, + &t->stime, &udelta, &sdelta); + if (utime) + *utime += udelta; + if (stime) + *stime += sdelta; +} + +void task_cputime_scaled(struct task_struct *t, + cputime_t *utimescaled, cputime_t *stimescaled) +{ + cputime_t udelta, sdelta; + + fetch_task_cputime(t, utimescaled, stimescaled, + &t->utimescaled, &t->stimescaled, &udelta, &sdelta); + if (utimescaled) + *utimescaled += cputime_to_scaled(udelta); + if (stimescaled) + *stimescaled += cputime_to_scaled(sdelta); +} +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c new file mode 100644 index 00000000000..fc4f98b1258 --- /dev/null +++ b/kernel/sched/deadline.c @@ -0,0 +1,1676 @@ +/* + * Deadline Scheduling Class (SCHED_DEADLINE) + * + * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS). + * + * Tasks that periodically executes their instances for less than their + * runtime won't miss any of their deadlines. + * Tasks that are not periodic or sporadic or that tries to execute more + * than their reserved bandwidth will be slowed down (and may potentially + * miss some of their deadlines), and won't affect any other task. + * + * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>, + * Juri Lelli <juri.lelli@gmail.com>, + * Michael Trimarchi <michael@amarulasolutions.com>, + * Fabio Checconi <fchecconi@gmail.com> + */ +#include "sched.h" + +#include <linux/slab.h> + +struct dl_bandwidth def_dl_bandwidth; + +static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) +{ + return container_of(dl_se, struct task_struct, dl); +} + +static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) +{ + return container_of(dl_rq, struct rq, dl); +} + +static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) +{ + struct task_struct *p = dl_task_of(dl_se); + struct rq *rq = task_rq(p); + + return &rq->dl; +} + +static inline int on_dl_rq(struct sched_dl_entity *dl_se) +{ + return !RB_EMPTY_NODE(&dl_se->rb_node); +} + +static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) +{ + struct sched_dl_entity *dl_se = &p->dl; + + return dl_rq->rb_leftmost == &dl_se->rb_node; +} + +void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) +{ + raw_spin_lock_init(&dl_b->dl_runtime_lock); + dl_b->dl_period = period; + dl_b->dl_runtime = runtime; +} + +void init_dl_bw(struct dl_bw *dl_b) +{ + raw_spin_lock_init(&dl_b->lock); + raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock); + if (global_rt_runtime() == RUNTIME_INF) + dl_b->bw = -1; + else + dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime()); + raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock); + dl_b->total_bw = 0; +} + +void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) +{ + dl_rq->rb_root = RB_ROOT; + +#ifdef CONFIG_SMP + /* zero means no -deadline tasks */ + dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; + + dl_rq->dl_nr_migratory = 0; + dl_rq->overloaded = 0; + dl_rq->pushable_dl_tasks_root = RB_ROOT; +#else + init_dl_bw(&dl_rq->dl_bw); +#endif +} + +#ifdef CONFIG_SMP + +static inline int dl_overloaded(struct rq *rq) +{ + return atomic_read(&rq->rd->dlo_count); +} + +static inline void dl_set_overload(struct rq *rq) +{ + if (!rq->online) + return; + + cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask); + /* + * Must be visible before the overload count is + * set (as in sched_rt.c). + * + * Matched by the barrier in pull_dl_task(). + */ + smp_wmb(); + atomic_inc(&rq->rd->dlo_count); +} + +static inline void dl_clear_overload(struct rq *rq) +{ + if (!rq->online) + return; + + atomic_dec(&rq->rd->dlo_count); + cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask); +} + +static void update_dl_migration(struct dl_rq *dl_rq) +{ + if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) { + if (!dl_rq->overloaded) { + dl_set_overload(rq_of_dl_rq(dl_rq)); + dl_rq->overloaded = 1; + } + } else if (dl_rq->overloaded) { + dl_clear_overload(rq_of_dl_rq(dl_rq)); + dl_rq->overloaded = 0; + } +} + +static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + struct task_struct *p = dl_task_of(dl_se); + + if (p->nr_cpus_allowed > 1) + dl_rq->dl_nr_migratory++; + + update_dl_migration(dl_rq); +} + +static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + struct task_struct *p = dl_task_of(dl_se); + + if (p->nr_cpus_allowed > 1) + dl_rq->dl_nr_migratory--; + + update_dl_migration(dl_rq); +} + +/* + * The list of pushable -deadline task is not a plist, like in + * sched_rt.c, it is an rb-tree with tasks ordered by deadline. + */ +static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ + struct dl_rq *dl_rq = &rq->dl; + struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node; + struct rb_node *parent = NULL; + struct task_struct *entry; + int leftmost = 1; + + BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct task_struct, + pushable_dl_tasks); + if (dl_entity_preempt(&p->dl, &entry->dl)) + link = &parent->rb_left; + else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; + + rb_link_node(&p->pushable_dl_tasks, parent, link); + rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); +} + +static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ + struct dl_rq *dl_rq = &rq->dl; + + if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) + return; + + if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) { + struct rb_node *next_node; + + next_node = rb_next(&p->pushable_dl_tasks); + dl_rq->pushable_dl_tasks_leftmost = next_node; + } + + rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); + RB_CLEAR_NODE(&p->pushable_dl_tasks); +} + +static inline int has_pushable_dl_tasks(struct rq *rq) +{ + return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root); +} + +static int push_dl_task(struct rq *rq); + +static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) +{ + return dl_task(prev); +} + +static inline void set_post_schedule(struct rq *rq) +{ + rq->post_schedule = has_pushable_dl_tasks(rq); +} + +#else + +static inline +void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline +void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline +void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +} + +static inline +void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +} + +static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) +{ + return false; +} + +static inline int pull_dl_task(struct rq *rq) +{ + return 0; +} + +static inline void set_post_schedule(struct rq *rq) +{ +} +#endif /* CONFIG_SMP */ + +static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); +static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); +static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, + int flags); + +/* + * We are being explicitly informed that a new instance is starting, + * and this means that: + * - the absolute deadline of the entity has to be placed at + * current time + relative deadline; + * - the runtime of the entity has to be set to the maximum value. + * + * The capability of specifying such event is useful whenever a -deadline + * entity wants to (try to!) synchronize its behaviour with the scheduler's + * one, and to (try to!) reconcile itself with its own scheduling + * parameters. + */ +static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + + WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); + + /* + * We use the regular wall clock time to set deadlines in the + * future; in fact, we must consider execution overheads (time + * spent on hardirq context, etc.). + */ + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; + dl_se->dl_new = 0; +} + +/* + * Pure Earliest Deadline First (EDF) scheduling does not deal with the + * possibility of a entity lasting more than what it declared, and thus + * exhausting its runtime. + * + * Here we are interested in making runtime overrun possible, but we do + * not want a entity which is misbehaving to affect the scheduling of all + * other entities. + * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS) + * is used, in order to confine each entity within its own bandwidth. + * + * This function deals exactly with that, and ensures that when the runtime + * of a entity is replenished, its deadline is also postponed. That ensures + * the overrunning entity can't interfere with other entity in the system and + * can't make them miss their deadlines. Reasons why this kind of overruns + * could happen are, typically, a entity voluntarily trying to overcome its + * runtime, or it just underestimated it during sched_setscheduler_ex(). + */ +static void replenish_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + + BUG_ON(pi_se->dl_runtime <= 0); + + /* + * This could be the case for a !-dl task that is boosted. + * Just go with full inherited parameters. + */ + if (dl_se->dl_deadline == 0) { + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; + } + + /* + * We keep moving the deadline away until we get some + * available runtime for the entity. This ensures correct + * handling of situations where the runtime overrun is + * arbitrary large. + */ + while (dl_se->runtime <= 0) { + dl_se->deadline += pi_se->dl_period; + dl_se->runtime += pi_se->dl_runtime; + } + + /* + * At this point, the deadline really should be "in + * the future" with respect to rq->clock. If it's + * not, we are, for some reason, lagging too much! + * Anyway, after having warn userspace abut that, + * we still try to keep the things running by + * resetting the deadline and the budget of the + * entity. + */ + if (dl_time_before(dl_se->deadline, rq_clock(rq))) { + printk_deferred_once("sched: DL replenish lagged to much\n"); + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; + } +} + +/* + * Here we check if --at time t-- an entity (which is probably being + * [re]activated or, in general, enqueued) can use its remaining runtime + * and its current deadline _without_ exceeding the bandwidth it is + * assigned (function returns true if it can't). We are in fact applying + * one of the CBS rules: when a task wakes up, if the residual runtime + * over residual deadline fits within the allocated bandwidth, then we + * can keep the current (absolute) deadline and residual budget without + * disrupting the schedulability of the system. Otherwise, we should + * refill the runtime and set the deadline a period in the future, + * because keeping the current (absolute) deadline of the task would + * result in breaking guarantees promised to other tasks (refer to + * Documentation/scheduler/sched-deadline.txt for more informations). + * + * This function returns true if: + * + * runtime / (deadline - t) > dl_runtime / dl_period , + * + * IOW we can't recycle current parameters. + * + * Notice that the bandwidth check is done against the period. For + * task with deadline equal to period this is the same of using + * dl_deadline instead of dl_period in the equation above. + */ +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se, u64 t) +{ + u64 left, right; + + /* + * left and right are the two sides of the equation above, + * after a bit of shuffling to use multiplications instead + * of divisions. + * + * Note that none of the time values involved in the two + * multiplications are absolute: dl_deadline and dl_runtime + * are the relative deadline and the maximum runtime of each + * instance, runtime is the runtime left for the last instance + * and (deadline - t), since t is rq->clock, is the time left + * to the (absolute) deadline. Even if overflowing the u64 type + * is very unlikely to occur in both cases, here we scale down + * as we want to avoid that risk at all. Scaling down by 10 + * means that we reduce granularity to 1us. We are fine with it, + * since this is only a true/false check and, anyway, thinking + * of anything below microseconds resolution is actually fiction + * (but still we want to give the user that illusion >;). + */ + left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); + right = ((dl_se->deadline - t) >> DL_SCALE) * + (pi_se->dl_runtime >> DL_SCALE); + + return dl_time_before(right, left); +} + +/* + * When a -deadline entity is queued back on the runqueue, its runtime and + * deadline might need updating. + * + * The policy here is that we update the deadline of the entity only if: + * - the current deadline is in the past, + * - using the remaining runtime with the current deadline would make + * the entity exceed its bandwidth. + */ +static void update_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + + /* + * The arrival of a new instance needs special treatment, i.e., + * the actual scheduling parameters have to be "renewed". + */ + if (dl_se->dl_new) { + setup_new_dl_entity(dl_se, pi_se); + return; + } + + if (dl_time_before(dl_se->deadline, rq_clock(rq)) || + dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; + } +} + +/* + * If the entity depleted all its runtime, and if we want it to sleep + * while waiting for some new execution time to become available, we + * set the bandwidth enforcement timer to the replenishment instant + * and try to activate it. + * + * Notice that it is important for the caller to know if the timer + * actually started or not (i.e., the replenishment instant is in + * the future or in the past). + */ +static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + ktime_t now, act; + ktime_t soft, hard; + unsigned long range; + s64 delta; + + if (boosted) + return 0; + /* + * We want the timer to fire at the deadline, but considering + * that it is actually coming from rq->clock and not from + * hrtimer's time base reading. + */ + act = ns_to_ktime(dl_se->deadline); + now = hrtimer_cb_get_time(&dl_se->dl_timer); + delta = ktime_to_ns(now) - rq_clock(rq); + act = ktime_add_ns(act, delta); + + /* + * If the expiry time already passed, e.g., because the value + * chosen as the deadline is too small, don't even try to + * start the timer in the past! + */ + if (ktime_us_delta(act, now) < 0) + return 0; + + hrtimer_set_expires(&dl_se->dl_timer, act); + + soft = hrtimer_get_softexpires(&dl_se->dl_timer); + hard = hrtimer_get_expires(&dl_se->dl_timer); + range = ktime_to_ns(ktime_sub(hard, soft)); + __hrtimer_start_range_ns(&dl_se->dl_timer, soft, + range, HRTIMER_MODE_ABS, 0); + + return hrtimer_active(&dl_se->dl_timer); +} + +/* + * This is the bandwidth enforcement timer callback. If here, we know + * a task is not on its dl_rq, since the fact that the timer was running + * means the task is throttled and needs a runtime replenishment. + * + * However, what we actually do depends on the fact the task is active, + * (it is on its rq) or has been removed from there by a call to + * dequeue_task_dl(). In the former case we must issue the runtime + * replenishment and add the task back to the dl_rq; in the latter, we just + * do nothing but clearing dl_throttled, so that runtime and deadline + * updating (and the queueing back to dl_rq) will be done by the + * next call to enqueue_task_dl(). + */ +static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) +{ + struct sched_dl_entity *dl_se = container_of(timer, + struct sched_dl_entity, + dl_timer); + struct task_struct *p = dl_task_of(dl_se); + struct rq *rq; +again: + rq = task_rq(p); + raw_spin_lock(&rq->lock); + + if (rq != task_rq(p)) { + /* Task was moved, retrying. */ + raw_spin_unlock(&rq->lock); + goto again; + } + + /* + * We need to take care of a possible races here. In fact, the + * task might have changed its scheduling policy to something + * different from SCHED_DEADLINE or changed its reservation + * parameters (through sched_setattr()). + */ + if (!dl_task(p) || dl_se->dl_new) + goto unlock; + + sched_clock_tick(); + update_rq_clock(rq); + dl_se->dl_throttled = 0; + dl_se->dl_yielded = 0; + if (p->on_rq) { + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); + if (task_has_dl_policy(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_task(rq->curr); +#ifdef CONFIG_SMP + /* + * Queueing this task back might have overloaded rq, + * check if we need to kick someone away. + */ + if (has_pushable_dl_tasks(rq)) + push_dl_task(rq); +#endif + } +unlock: + raw_spin_unlock(&rq->lock); + + return HRTIMER_NORESTART; +} + +void init_dl_task_timer(struct sched_dl_entity *dl_se) +{ + struct hrtimer *timer = &dl_se->dl_timer; + + if (hrtimer_active(timer)) { + hrtimer_try_to_cancel(timer); + return; + } + + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + timer->function = dl_task_timer; +} + +static +int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) +{ + int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); + int rorun = dl_se->runtime <= 0; + + if (!rorun && !dmiss) + return 0; + + /* + * If we are beyond our current deadline and we are still + * executing, then we have already used some of the runtime of + * the next instance. Thus, if we do not account that, we are + * stealing bandwidth from the system at each deadline miss! + */ + if (dmiss) { + dl_se->runtime = rorun ? dl_se->runtime : 0; + dl_se->runtime -= rq_clock(rq) - dl_se->deadline; + } + + return 1; +} + +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); + +/* + * Update the current task's runtime statistics (provided it is still + * a -deadline task and has not been removed from the dl_rq). + */ +static void update_curr_dl(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct sched_dl_entity *dl_se = &curr->dl; + u64 delta_exec; + + if (!dl_task(curr) || !on_dl_rq(dl_se)) + return; + + /* + * Consumed budget is computed considering the time as + * observed by schedulable tasks (excluding time spent + * in hardirq context, etc.). Deadlines are instead + * computed using hard walltime. This seems to be the more + * natural solution, but the full ramifications of this + * approach need further study. + */ + delta_exec = rq_clock_task(rq) - curr->se.exec_start; + if (unlikely((s64)delta_exec <= 0)) + return; + + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); + + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = rq_clock_task(rq); + cpuacct_charge(curr, delta_exec); + + sched_rt_avg_update(rq, delta_exec); + + dl_se->runtime -= delta_exec; + if (dl_runtime_exceeded(rq, dl_se)) { + __dequeue_task_dl(rq, curr, 0); + if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) + dl_se->dl_throttled = 1; + else + enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); + + if (!is_leftmost(curr, &rq->dl)) + resched_task(curr); + } + + /* + * Because -- for now -- we share the rt bandwidth, we need to + * account our runtime there too, otherwise actual rt tasks + * would be able to exceed the shared quota. + * + * Account to the root rt group for now. + * + * The solution we're working towards is having the RT groups scheduled + * using deadline servers -- however there's a few nasties to figure + * out before that can happen. + */ + if (rt_bandwidth_enabled()) { + struct rt_rq *rt_rq = &rq->rt; + + raw_spin_lock(&rt_rq->rt_runtime_lock); + /* + * We'll let actual RT tasks worry about the overflow here, we + * have our own CBS to keep us inline; only account when RT + * bandwidth is relevant. + */ + if (sched_rt_bandwidth_account(rt_rq)) + rt_rq->rt_time += delta_exec; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } +} + +#ifdef CONFIG_SMP + +static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu); + +static inline u64 next_deadline(struct rq *rq) +{ + struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu); + + if (next && dl_prio(next->prio)) + return next->dl.deadline; + else + return 0; +} + +static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) +{ + struct rq *rq = rq_of_dl_rq(dl_rq); + + if (dl_rq->earliest_dl.curr == 0 || + dl_time_before(deadline, dl_rq->earliest_dl.curr)) { + /* + * If the dl_rq had no -deadline tasks, or if the new task + * has shorter deadline than the current one on dl_rq, we + * know that the previous earliest becomes our next earliest, + * as the new task becomes the earliest itself. + */ + dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; + dl_rq->earliest_dl.curr = deadline; + cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); + } else if (dl_rq->earliest_dl.next == 0 || + dl_time_before(deadline, dl_rq->earliest_dl.next)) { + /* + * On the other hand, if the new -deadline task has a + * a later deadline than the earliest one on dl_rq, but + * it is earlier than the next (if any), we must + * recompute the next-earliest. + */ + dl_rq->earliest_dl.next = next_deadline(rq); + } +} + +static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) +{ + struct rq *rq = rq_of_dl_rq(dl_rq); + + /* + * Since we may have removed our earliest (and/or next earliest) + * task we must recompute them. + */ + if (!dl_rq->dl_nr_running) { + dl_rq->earliest_dl.curr = 0; + dl_rq->earliest_dl.next = 0; + cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + } else { + struct rb_node *leftmost = dl_rq->rb_leftmost; + struct sched_dl_entity *entry; + + entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); + dl_rq->earliest_dl.curr = entry->deadline; + dl_rq->earliest_dl.next = next_deadline(rq); + cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); + } +} + +#else + +static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} +static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} + +#endif /* CONFIG_SMP */ + +static inline +void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + int prio = dl_task_of(dl_se)->prio; + u64 deadline = dl_se->deadline; + + WARN_ON(!dl_prio(prio)); + dl_rq->dl_nr_running++; + add_nr_running(rq_of_dl_rq(dl_rq), 1); + + inc_dl_deadline(dl_rq, deadline); + inc_dl_migration(dl_se, dl_rq); +} + +static inline +void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + int prio = dl_task_of(dl_se)->prio; + + WARN_ON(!dl_prio(prio)); + WARN_ON(!dl_rq->dl_nr_running); + dl_rq->dl_nr_running--; + sub_nr_running(rq_of_dl_rq(dl_rq), 1); + + dec_dl_deadline(dl_rq, dl_se->deadline); + dec_dl_migration(dl_se, dl_rq); +} + +static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rb_node **link = &dl_rq->rb_root.rb_node; + struct rb_node *parent = NULL; + struct sched_dl_entity *entry; + int leftmost = 1; + + BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct sched_dl_entity, rb_node); + if (dl_time_before(dl_se->deadline, entry->deadline)) + link = &parent->rb_left; + else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + dl_rq->rb_leftmost = &dl_se->rb_node; + + rb_link_node(&dl_se->rb_node, parent, link); + rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root); + + inc_dl_tasks(dl_se, dl_rq); +} + +static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + if (RB_EMPTY_NODE(&dl_se->rb_node)) + return; + + if (dl_rq->rb_leftmost == &dl_se->rb_node) { + struct rb_node *next_node; + + next_node = rb_next(&dl_se->rb_node); + dl_rq->rb_leftmost = next_node; + } + + rb_erase(&dl_se->rb_node, &dl_rq->rb_root); + RB_CLEAR_NODE(&dl_se->rb_node); + + dec_dl_tasks(dl_se, dl_rq); +} + +static void +enqueue_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se, int flags) +{ + BUG_ON(on_dl_rq(dl_se)); + + /* + * If this is a wakeup or a new instance, the scheduling + * parameters of the task might need updating. Otherwise, + * we want a replenishment of its runtime. + */ + if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) + replenish_dl_entity(dl_se, pi_se); + else + update_dl_entity(dl_se, pi_se); + + __enqueue_dl_entity(dl_se); +} + +static void dequeue_dl_entity(struct sched_dl_entity *dl_se) +{ + __dequeue_dl_entity(dl_se); +} + +static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + struct task_struct *pi_task = rt_mutex_get_top_task(p); + struct sched_dl_entity *pi_se = &p->dl; + + /* + * Use the scheduling parameters of the top pi-waiter + * task if we have one and its (relative) deadline is + * smaller than our one... OTW we keep our runtime and + * deadline. + */ + if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) + pi_se = &pi_task->dl; + + /* + * If p is throttled, we do nothing. In fact, if it exhausted + * its budget it needs a replenishment and, since it now is on + * its rq, the bandwidth timer callback (which clearly has not + * run yet) will take care of this. + */ + if (p->dl.dl_throttled) + return; + + enqueue_dl_entity(&p->dl, pi_se, flags); + + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + enqueue_pushable_dl_task(rq, p); +} + +static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + dequeue_dl_entity(&p->dl); + dequeue_pushable_dl_task(rq, p); +} + +static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + update_curr_dl(rq); + __dequeue_task_dl(rq, p, flags); +} + +/* + * Yield task semantic for -deadline tasks is: + * + * get off from the CPU until our next instance, with + * a new runtime. This is of little use now, since we + * don't have a bandwidth reclaiming mechanism. Anyway, + * bandwidth reclaiming is planned for the future, and + * yield_task_dl will indicate that some spare budget + * is available for other task instances to use it. + */ +static void yield_task_dl(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + /* + * We make the task go to sleep until its current deadline by + * forcing its runtime to zero. This way, update_curr_dl() stops + * it and the bandwidth timer will wake it up and will give it + * new scheduling parameters (thanks to dl_yielded=1). + */ + if (p->dl.runtime > 0) { + rq->curr->dl.dl_yielded = 1; + p->dl.runtime = 0; + } + update_curr_dl(rq); +} + +#ifdef CONFIG_SMP + +static int find_later_rq(struct task_struct *task); + +static int +select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) +{ + struct task_struct *curr; + struct rq *rq; + + if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + goto out; + + rq = cpu_rq(cpu); + + rcu_read_lock(); + curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + + /* + * If we are dealing with a -deadline task, we must + * decide where to wake it up. + * If it has a later deadline and the current task + * on this rq can't move (provided the waking task + * can!) we prefer to send it somewhere else. On the + * other hand, if it has a shorter deadline, we + * try to make it stay here, it might be important. + */ + if (unlikely(dl_task(curr)) && + (curr->nr_cpus_allowed < 2 || + !dl_entity_preempt(&p->dl, &curr->dl)) && + (p->nr_cpus_allowed > 1)) { + int target = find_later_rq(p); + + if (target != -1) + cpu = target; + } + rcu_read_unlock(); + +out: + return cpu; +} + +static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) +{ + /* + * Current can't be migrated, useless to reschedule, + * let's hope p can move out. + */ + if (rq->curr->nr_cpus_allowed == 1 || + cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) + return; + + /* + * p is migratable, so let's not schedule it and + * see if it is pushed or pulled somewhere else. + */ + if (p->nr_cpus_allowed != 1 && + cpudl_find(&rq->rd->cpudl, p, NULL) != -1) + return; + + resched_task(rq->curr); +} + +static int pull_dl_task(struct rq *this_rq); + +#endif /* CONFIG_SMP */ + +/* + * Only called when both the current and waking task are -deadline + * tasks. + */ +static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, + int flags) +{ + if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { + resched_task(rq->curr); + return; + } + +#ifdef CONFIG_SMP + /* + * In the unlikely case current and p have the same deadline + * let us try to decide what's the best thing to do... + */ + if ((p->dl.deadline == rq->curr->dl.deadline) && + !test_tsk_need_resched(rq->curr)) + check_preempt_equal_dl(rq, p); +#endif /* CONFIG_SMP */ +} + +#ifdef CONFIG_SCHED_HRTICK +static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +{ + s64 delta = p->dl.dl_runtime - p->dl.runtime; + + if (delta > 10000) + hrtick_start(rq, p->dl.runtime); +} +#endif + +static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, + struct dl_rq *dl_rq) +{ + struct rb_node *left = dl_rq->rb_leftmost; + + if (!left) + return NULL; + + return rb_entry(left, struct sched_dl_entity, rb_node); +} + +struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) +{ + struct sched_dl_entity *dl_se; + struct task_struct *p; + struct dl_rq *dl_rq; + + dl_rq = &rq->dl; + + if (need_pull_dl_task(rq, prev)) { + pull_dl_task(rq); + /* + * pull_rt_task() can drop (and re-acquire) rq->lock; this + * means a stop task can slip in, in which case we need to + * re-start task selection. + */ + if (rq->stop && rq->stop->on_rq) + return RETRY_TASK; + } + + /* + * When prev is DL, we may throttle it in put_prev_task(). + * So, we update time before we check for dl_nr_running. + */ + if (prev->sched_class == &dl_sched_class) + update_curr_dl(rq); + + if (unlikely(!dl_rq->dl_nr_running)) + return NULL; + + put_prev_task(rq, prev); + + dl_se = pick_next_dl_entity(rq, dl_rq); + BUG_ON(!dl_se); + + p = dl_task_of(dl_se); + p->se.exec_start = rq_clock_task(rq); + + /* Running task will never be pushed. */ + dequeue_pushable_dl_task(rq, p); + +#ifdef CONFIG_SCHED_HRTICK + if (hrtick_enabled(rq)) + start_hrtick_dl(rq, p); +#endif + + set_post_schedule(rq); + + return p; +} + +static void put_prev_task_dl(struct rq *rq, struct task_struct *p) +{ + update_curr_dl(rq); + + if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) + enqueue_pushable_dl_task(rq, p); +} + +static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) +{ + update_curr_dl(rq); + +#ifdef CONFIG_SCHED_HRTICK + if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) + start_hrtick_dl(rq, p); +#endif +} + +static void task_fork_dl(struct task_struct *p) +{ + /* + * SCHED_DEADLINE tasks cannot fork and this is achieved through + * sched_fork() + */ +} + +static void task_dead_dl(struct task_struct *p) +{ + struct hrtimer *timer = &p->dl.dl_timer; + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + /* + * Since we are TASK_DEAD we won't slip out of the domain! + */ + raw_spin_lock_irq(&dl_b->lock); + dl_b->total_bw -= p->dl.dl_bw; + raw_spin_unlock_irq(&dl_b->lock); + + hrtimer_cancel(timer); +} + +static void set_curr_task_dl(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + p->se.exec_start = rq_clock_task(rq); + + /* You can't push away the running task */ + dequeue_pushable_dl_task(rq, p); +} + +#ifdef CONFIG_SMP + +/* Only try algorithms three times */ +#define DL_MAX_TRIES 3 + +static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) +{ + if (!task_running(rq, p) && + (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && + (p->nr_cpus_allowed > 1)) + return 1; + + return 0; +} + +/* Returns the second earliest -deadline task, NULL otherwise */ +static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu) +{ + struct rb_node *next_node = rq->dl.rb_leftmost; + struct sched_dl_entity *dl_se; + struct task_struct *p = NULL; + +next_node: + next_node = rb_next(next_node); + if (next_node) { + dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node); + p = dl_task_of(dl_se); + + if (pick_dl_task(rq, p, cpu)) + return p; + + goto next_node; + } + + return NULL; +} + +static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); + +static int find_later_rq(struct task_struct *task) +{ + struct sched_domain *sd; + struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); + int this_cpu = smp_processor_id(); + int best_cpu, cpu = task_cpu(task); + + /* Make sure the mask is initialized first */ + if (unlikely(!later_mask)) + return -1; + + if (task->nr_cpus_allowed == 1) + return -1; + + best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, + task, later_mask); + if (best_cpu == -1) + return -1; + + /* + * If we are here, some target has been found, + * the most suitable of which is cached in best_cpu. + * This is, among the runqueues where the current tasks + * have later deadlines than the task's one, the rq + * with the latest possible one. + * + * Now we check how well this matches with task's + * affinity and system topology. + * + * The last cpu where the task run is our first + * guess, since it is most likely cache-hot there. + */ + if (cpumask_test_cpu(cpu, later_mask)) + return cpu; + /* + * Check if this_cpu is to be skipped (i.e., it is + * not in the mask) or not. + */ + if (!cpumask_test_cpu(this_cpu, later_mask)) + this_cpu = -1; + + rcu_read_lock(); + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + + /* + * If possible, preempting this_cpu is + * cheaper than migrating. + */ + if (this_cpu != -1 && + cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { + rcu_read_unlock(); + return this_cpu; + } + + /* + * Last chance: if best_cpu is valid and is + * in the mask, that becomes our choice. + */ + if (best_cpu < nr_cpu_ids && + cpumask_test_cpu(best_cpu, sched_domain_span(sd))) { + rcu_read_unlock(); + return best_cpu; + } + } + } + rcu_read_unlock(); + + /* + * At this point, all our guesses failed, we just return + * 'something', and let the caller sort the things out. + */ + if (this_cpu != -1) + return this_cpu; + + cpu = cpumask_any(later_mask); + if (cpu < nr_cpu_ids) + return cpu; + + return -1; +} + +/* Locks the rq it finds */ +static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) +{ + struct rq *later_rq = NULL; + int tries; + int cpu; + + for (tries = 0; tries < DL_MAX_TRIES; tries++) { + cpu = find_later_rq(task); + + if ((cpu == -1) || (cpu == rq->cpu)) + break; + + later_rq = cpu_rq(cpu); + + /* Retry if something changed. */ + if (double_lock_balance(rq, later_rq)) { + if (unlikely(task_rq(task) != rq || + !cpumask_test_cpu(later_rq->cpu, + &task->cpus_allowed) || + task_running(rq, task) || !task->on_rq)) { + double_unlock_balance(rq, later_rq); + later_rq = NULL; + break; + } + } + + /* + * If the rq we found has no -deadline task, or + * its earliest one has a later deadline than our + * task, the rq is a good one. + */ + if (!later_rq->dl.dl_nr_running || + dl_time_before(task->dl.deadline, + later_rq->dl.earliest_dl.curr)) + break; + + /* Otherwise we try again. */ + double_unlock_balance(rq, later_rq); + later_rq = NULL; + } + + return later_rq; +} + +static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_dl_tasks(rq)) + return NULL; + + p = rb_entry(rq->dl.pushable_dl_tasks_leftmost, + struct task_struct, pushable_dl_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(p->nr_cpus_allowed <= 1); + + BUG_ON(!p->on_rq); + BUG_ON(!dl_task(p)); + + return p; +} + +/* + * See if the non running -deadline tasks on this rq + * can be sent to some other CPU where they can preempt + * and start executing. + */ +static int push_dl_task(struct rq *rq) +{ + struct task_struct *next_task; + struct rq *later_rq; + + if (!rq->dl.overloaded) + return 0; + + next_task = pick_next_pushable_dl_task(rq); + if (!next_task) + return 0; + +retry: + if (unlikely(next_task == rq->curr)) { + WARN_ON(1); + return 0; + } + + /* + * If next_task preempts rq->curr, and rq->curr + * can move away, it makes sense to just reschedule + * without going further in pushing next_task. + */ + if (dl_task(rq->curr) && + dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && + rq->curr->nr_cpus_allowed > 1) { + resched_task(rq->curr); + return 0; + } + + /* We might release rq lock */ + get_task_struct(next_task); + + /* Will lock the rq it'll find */ + later_rq = find_lock_later_rq(next_task, rq); + if (!later_rq) { + struct task_struct *task; + + /* + * We must check all this again, since + * find_lock_later_rq releases rq->lock and it is + * then possible that next_task has migrated. + */ + task = pick_next_pushable_dl_task(rq); + if (task_cpu(next_task) == rq->cpu && task == next_task) { + /* + * The task is still there. We don't try + * again, some other cpu will pull it when ready. + */ + dequeue_pushable_dl_task(rq, next_task); + goto out; + } + + if (!task) + /* No more tasks */ + goto out; + + put_task_struct(next_task); + next_task = task; + goto retry; + } + + deactivate_task(rq, next_task, 0); + set_task_cpu(next_task, later_rq->cpu); + activate_task(later_rq, next_task, 0); + + resched_task(later_rq->curr); + + double_unlock_balance(rq, later_rq); + +out: + put_task_struct(next_task); + + return 1; +} + +static void push_dl_tasks(struct rq *rq) +{ + /* Terminates as it moves a -deadline task */ + while (push_dl_task(rq)) + ; +} + +static int pull_dl_task(struct rq *this_rq) +{ + int this_cpu = this_rq->cpu, ret = 0, cpu; + struct task_struct *p; + struct rq *src_rq; + u64 dmin = LONG_MAX; + + if (likely(!dl_overloaded(this_rq))) + return 0; + + /* + * Match the barrier from dl_set_overloaded; this guarantees that if we + * see overloaded we must also see the dlo_mask bit. + */ + smp_rmb(); + + for_each_cpu(cpu, this_rq->rd->dlo_mask) { + if (this_cpu == cpu) + continue; + + src_rq = cpu_rq(cpu); + + /* + * It looks racy, abd it is! However, as in sched_rt.c, + * we are fine with this. + */ + if (this_rq->dl.dl_nr_running && + dl_time_before(this_rq->dl.earliest_dl.curr, + src_rq->dl.earliest_dl.next)) + continue; + + /* Might drop this_rq->lock */ + double_lock_balance(this_rq, src_rq); + + /* + * If there are no more pullable tasks on the + * rq, we're done with it. + */ + if (src_rq->dl.dl_nr_running <= 1) + goto skip; + + p = pick_next_earliest_dl_task(src_rq, this_cpu); + + /* + * We found a task to be pulled if: + * - it preempts our current (if there's one), + * - it will preempt the last one we pulled (if any). + */ + if (p && dl_time_before(p->dl.deadline, dmin) && + (!this_rq->dl.dl_nr_running || + dl_time_before(p->dl.deadline, + this_rq->dl.earliest_dl.curr))) { + WARN_ON(p == src_rq->curr); + WARN_ON(!p->on_rq); + + /* + * Then we pull iff p has actually an earlier + * deadline than the current task of its runqueue. + */ + if (dl_time_before(p->dl.deadline, + src_rq->curr->dl.deadline)) + goto skip; + + ret = 1; + + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + dmin = p->dl.deadline; + + /* Is there any other task even earlier? */ + } +skip: + double_unlock_balance(this_rq, src_rq); + } + + return ret; +} + +static void post_schedule_dl(struct rq *rq) +{ + push_dl_tasks(rq); +} + +/* + * Since the task is not running and a reschedule is not going to happen + * anytime soon on its runqueue, we try pushing it away now. + */ +static void task_woken_dl(struct rq *rq, struct task_struct *p) +{ + if (!task_running(rq, p) && + !test_tsk_need_resched(rq->curr) && + has_pushable_dl_tasks(rq) && + p->nr_cpus_allowed > 1 && + dl_task(rq->curr) && + (rq->curr->nr_cpus_allowed < 2 || + dl_entity_preempt(&rq->curr->dl, &p->dl))) { + push_dl_tasks(rq); + } +} + +static void set_cpus_allowed_dl(struct task_struct *p, + const struct cpumask *new_mask) +{ + struct rq *rq; + int weight; + + BUG_ON(!dl_task(p)); + + /* + * Update only if the task is actually running (i.e., + * it is on the rq AND it is not throttled). + */ + if (!on_dl_rq(&p->dl)) + return; + + weight = cpumask_weight(new_mask); + + /* + * Only update if the process changes its state from whether it + * can migrate or not. + */ + if ((p->nr_cpus_allowed > 1) == (weight > 1)) + return; + + rq = task_rq(p); + + /* + * The process used to be able to migrate OR it can now migrate + */ + if (weight <= 1) { + if (!task_current(rq, p)) + dequeue_pushable_dl_task(rq, p); + BUG_ON(!rq->dl.dl_nr_migratory); + rq->dl.dl_nr_migratory--; + } else { + if (!task_current(rq, p)) + enqueue_pushable_dl_task(rq, p); + rq->dl.dl_nr_migratory++; + } + + update_dl_migration(&rq->dl); +} + +/* Assumes rq->lock is held */ +static void rq_online_dl(struct rq *rq) +{ + if (rq->dl.overloaded) + dl_set_overload(rq); + + if (rq->dl.dl_nr_running > 0) + cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); +} + +/* Assumes rq->lock is held */ +static void rq_offline_dl(struct rq *rq) +{ + if (rq->dl.overloaded) + dl_clear_overload(rq); + + cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); +} + +void init_sched_dl_class(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i), + GFP_KERNEL, cpu_to_node(i)); +} + +#endif /* CONFIG_SMP */ + +static void switched_from_dl(struct rq *rq, struct task_struct *p) +{ + if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) + hrtimer_try_to_cancel(&p->dl.dl_timer); + +#ifdef CONFIG_SMP + /* + * Since this might be the only -deadline task on the rq, + * this is the right place to try to pull some other one + * from an overloaded cpu, if any. + */ + if (!rq->dl.dl_nr_running) + pull_dl_task(rq); +#endif +} + +/* + * When switching to -deadline, we may overload the rq, then + * we try to push someone off, if possible. + */ +static void switched_to_dl(struct rq *rq, struct task_struct *p) +{ + int check_resched = 1; + + /* + * If p is throttled, don't consider the possibility + * of preempting rq->curr, the check will be done right + * after its runtime will get replenished. + */ + if (unlikely(p->dl.dl_throttled)) + return; + + if (p->on_rq && rq->curr != p) { +#ifdef CONFIG_SMP + if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) + /* Only reschedule if pushing failed */ + check_resched = 0; +#endif /* CONFIG_SMP */ + if (check_resched && task_has_dl_policy(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + } +} + +/* + * If the scheduling parameters of a -deadline task changed, + * a push or pull operation might be needed. + */ +static void prio_changed_dl(struct rq *rq, struct task_struct *p, + int oldprio) +{ + if (p->on_rq || rq->curr == p) { +#ifdef CONFIG_SMP + /* + * This might be too much, but unfortunately + * we don't have the old deadline value, and + * we can't argue if the task is increasing + * or lowering its prio, so... + */ + if (!rq->dl.overloaded) + pull_dl_task(rq); + + /* + * If we now have a earlier deadline task than p, + * then reschedule, provided p is still on this + * runqueue. + */ + if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && + rq->curr == p) + resched_task(p); +#else + /* + * Again, we don't know if p has a earlier + * or later deadline, so let's blindly set a + * (maybe not needed) rescheduling point. + */ + resched_task(p); +#endif /* CONFIG_SMP */ + } else + switched_to_dl(rq, p); +} + +const struct sched_class dl_sched_class = { + .next = &rt_sched_class, + .enqueue_task = enqueue_task_dl, + .dequeue_task = dequeue_task_dl, + .yield_task = yield_task_dl, + + .check_preempt_curr = check_preempt_curr_dl, + + .pick_next_task = pick_next_task_dl, + .put_prev_task = put_prev_task_dl, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_dl, + .set_cpus_allowed = set_cpus_allowed_dl, + .rq_online = rq_online_dl, + .rq_offline = rq_offline_dl, + .post_schedule = post_schedule_dl, + .task_woken = task_woken_dl, +#endif + + .set_curr_task = set_curr_task_dl, + .task_tick = task_tick_dl, + .task_fork = task_fork_dl, + .task_dead = task_dead_dl, + + .prio_changed = prio_changed_dl, + .switched_from = switched_from_dl, + .switched_to = switched_to_dl, +}; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a075e10004..627b3c34b82 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -15,6 +15,7 @@ #include <linux/seq_file.h> #include <linux/kallsyms.h> #include <linux/utsname.h> +#include <linux/mempolicy.h> #include "sched.h" @@ -61,14 +62,20 @@ static unsigned long nsec_low(unsigned long long nsec) static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { struct sched_entity *se = tg->se[cpu]; - if (!se) - return; #define P(F) \ SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) #define PN(F) \ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) + if (!se) { + struct sched_avg *avg = &cpu_rq(cpu)->avg; + P(avg->runnable_avg_sum); + P(avg->runnable_avg_period); + return; + } + + PN(se->exec_start); PN(se->vruntime); PN(se->sum_exec_runtime); @@ -85,6 +92,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->statistics.wait_count); #endif P(se->load.weight); +#ifdef CONFIG_SMP + P(se->avg.runnable_avg_sum); + P(se->avg.runnable_avg_period); + P(se->avg.load_avg_contrib); + P(se->avg.decay_count); +#endif #undef PN #undef P } @@ -98,15 +111,7 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - /* - * May be NULL if the underlying cgroup isn't fully-created yet - */ - if (!tg->css.cgroup) { - group_path[0] = '\0'; - return group_path; - } - cgroup_path(tg->css.cgroup, group_path, PATH_MAX); - return group_path; + return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); } #endif @@ -119,7 +124,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, " "); SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", - p->comm, p->pid, + p->comm, task_pid_nr(p), SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), p->prio); @@ -132,6 +137,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif +#ifdef CONFIG_NUMA_BALANCING + SEQ_printf(m, " %d", task_node(p)); +#endif #ifdef CONFIG_CGROUP_SCHED SEQ_printf(m, " %s", task_group_path(task_group(p))); #endif @@ -154,7 +162,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, p) { - if (!p->on_rq || task_cpu(p) != rq_cpu) + if (task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); @@ -202,20 +210,34 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(spread0)); SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); - SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); -#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", - SPLIT_NS(cfs_rq->load_avg)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", - SPLIT_NS(cfs_rq->load_period)); - SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", - cfs_rq->load_contribution); - SEQ_printf(m, " .%-30s: %d\n", "load_tg", - atomic_read(&cfs_rq->tg->load_weight)); + SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg", + cfs_rq->runnable_load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", + cfs_rq->blocked_load_avg); +#ifdef CONFIG_FAIR_GROUP_SCHED + SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", + cfs_rq->tg_load_contrib); + SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", + cfs_rq->tg_runnable_contrib); + SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", + atomic_long_read(&cfs_rq->tg->load_avg)); + SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", + atomic_read(&cfs_rq->tg->runnable_avg)); +#endif +#endif +#ifdef CONFIG_CFS_BANDWIDTH + SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active", + cfs_rq->tg->cfs_bandwidth.timer_active); + SEQ_printf(m, " .%-30s: %d\n", "throttled", + cfs_rq->throttled); + SEQ_printf(m, " .%-30s: %d\n", "throttle_count", + cfs_rq->throttle_count); #endif +#ifdef CONFIG_FAIR_GROUP_SCHED print_cfs_group_stats(m, cpu, cfs_rq->tg); #endif } @@ -253,15 +275,21 @@ static void print_cpu(struct seq_file *m, int cpu) { unsigned int freq = cpu_khz ? : 1; - SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", + SEQ_printf(m, "cpu#%d, %u.%03u MHz\n", cpu, freq / 1000, (freq % 1000)); } #else - SEQ_printf(m, "\ncpu#%d\n", cpu); + SEQ_printf(m, "cpu#%d\n", cpu); #endif -#define P(x) \ - SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) +#define P(x) \ +do { \ + if (sizeof(rq->x) == 4) \ + SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \ + else \ + SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\ +} while (0) + #define PN(x) \ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) @@ -272,7 +300,7 @@ static void print_cpu(struct seq_file *m, int cpu) P(nr_load_updates); P(nr_uninterruptible); PN(next_balance); - P(curr->pid); + SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); P(cpu_load[0]); P(cpu_load[1]); @@ -288,11 +316,11 @@ static void print_cpu(struct seq_file *m, int cpu) P(yld_count); - P(sched_switch); P(sched_count); P(sched_goidle); #ifdef CONFIG_SMP P64(avg_idle); + P64(max_idle_balance_cost); #endif P(ttwu_count); @@ -309,6 +337,7 @@ static void print_cpu(struct seq_file *m, int cpu) print_rq(m, rq, cpu); rcu_read_unlock(); spin_unlock_irqrestore(&sched_debug_lock, flags); + SEQ_printf(m, "\n"); } static const char *sched_tunable_scaling_names[] = { @@ -317,11 +346,10 @@ static const char *sched_tunable_scaling_names[] = { "linear" }; -static int sched_debug_show(struct seq_file *m, void *v) +static void sched_debug_header(struct seq_file *m) { u64 ktime, sched_clk, cpu_clk; unsigned long flags; - int cpu; local_irq_save(flags); ktime = ktime_to_ns(ktime_get()); @@ -329,7 +357,7 @@ static int sched_debug_show(struct seq_file *m, void *v) cpu_clk = local_clock(); local_irq_restore(flags); - SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); @@ -343,7 +371,7 @@ static int sched_debug_show(struct seq_file *m, void *v) PN(cpu_clk); P(jiffies); #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - P(sched_clock_stable); + P(sched_clock_stable()); #endif #undef PN #undef P @@ -363,33 +391,101 @@ static int sched_debug_show(struct seq_file *m, void *v) #undef PN #undef P - SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", + SEQ_printf(m, " .%-40s: %d (%s)\n", + "sysctl_sched_tunable_scaling", sysctl_sched_tunable_scaling, sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); + SEQ_printf(m, "\n"); +} - for_each_online_cpu(cpu) - print_cpu(m, cpu); +static int sched_debug_show(struct seq_file *m, void *v) +{ + int cpu = (unsigned long)(v - 2); - SEQ_printf(m, "\n"); + if (cpu != -1) + print_cpu(m, cpu); + else + sched_debug_header(m); return 0; } void sysrq_sched_debug_show(void) { - sched_debug_show(NULL, NULL); + int cpu; + + sched_debug_header(NULL); + for_each_online_cpu(cpu) + print_cpu(NULL, cpu); + +} + +/* + * This itererator needs some explanation. + * It returns 1 for the header position. + * This means 2 is cpu 0. + * In a hotplugged system some cpus, including cpu 0, may be missing so we have + * to use cpumask_* to iterate over the cpus. + */ +static void *sched_debug_start(struct seq_file *file, loff_t *offset) +{ + unsigned long n = *offset; + + if (n == 0) + return (void *) 1; + + n--; + + if (n > 0) + n = cpumask_next(n - 1, cpu_online_mask); + else + n = cpumask_first(cpu_online_mask); + + *offset = n + 1; + + if (n < nr_cpu_ids) + return (void *)(unsigned long)(n + 2); + return NULL; +} + +static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset) +{ + (*offset)++; + return sched_debug_start(file, offset); +} + +static void sched_debug_stop(struct seq_file *file, void *data) +{ +} + +static const struct seq_operations sched_debug_sops = { + .start = sched_debug_start, + .next = sched_debug_next, + .stop = sched_debug_stop, + .show = sched_debug_show, +}; + +static int sched_debug_release(struct inode *inode, struct file *file) +{ + seq_release(inode, file); + + return 0; } static int sched_debug_open(struct inode *inode, struct file *filp) { - return single_open(filp, sched_debug_show, NULL); + int ret = 0; + + ret = seq_open(filp, &sched_debug_sops); + + return ret; } static const struct file_operations sched_debug_fops = { .open = sched_debug_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = sched_debug_release, }; static int __init init_sched_debug_procfs(void) @@ -404,22 +500,73 @@ static int __init init_sched_debug_procfs(void) __initcall(init_sched_debug_procfs); +#define __P(F) \ + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) +#define P(F) \ + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) +#define __PN(F) \ + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN(F) \ + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) + + +static void sched_show_numa(struct task_struct *p, struct seq_file *m) +{ +#ifdef CONFIG_NUMA_BALANCING + struct mempolicy *pol; + int node, i; + + if (p->mm) + P(mm->numa_scan_seq); + + task_lock(p); + pol = p->mempolicy; + if (pol && !(pol->flags & MPOL_F_MORON)) + pol = NULL; + mpol_get(pol); + task_unlock(p); + + SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0)); + + for_each_online_node(node) { + for (i = 0; i < 2; i++) { + unsigned long nr_faults = -1; + int cpu_current, home_node; + + if (p->numa_faults_memory) + nr_faults = p->numa_faults_memory[2*node + i]; + + cpu_current = !i ? (task_node(p) == node) : + (pol && node_isset(node, pol->v.nodes)); + + home_node = (p->numa_preferred_nid == node); + + SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n", + i, node, cpu_current, home_node, nr_faults); + } + } + + mpol_put(pol); +#endif +} + void proc_sched_show_task(struct task_struct *p, struct seq_file *m) { unsigned long nr_switches; - SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p), get_nr_threads(p)); SEQ_printf(m, - "---------------------------------------------------------\n"); + "---------------------------------------------------------" + "----------\n"); #define __P(F) \ - SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) #define P(F) \ - SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) #define __PN(F) \ - SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) #define PN(F) \ - SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) PN(se.exec_start); PN(se.vruntime); @@ -461,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) avg_atom = p->se.sum_exec_runtime; if (nr_switches) - do_div(avg_atom, nr_switches); + avg_atom = div64_ul(avg_atom, nr_switches); else avg_atom = -1LL; @@ -478,12 +625,18 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) } #endif __P(nr_switches); - SEQ_printf(m, "%-35s:%21Ld\n", + SEQ_printf(m, "%-45s:%21Ld\n", "nr_voluntary_switches", (long long)p->nvcsw); - SEQ_printf(m, "%-35s:%21Ld\n", + SEQ_printf(m, "%-45s:%21Ld\n", "nr_involuntary_switches", (long long)p->nivcsw); P(se.load.weight); +#ifdef CONFIG_SMP + P(se.avg.runnable_avg_sum); + P(se.avg.runnable_avg_period); + P(se.avg.load_avg_contrib); + P(se.avg.decay_count); +#endif P(policy); P(prio); #undef PN @@ -497,9 +650,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) t0 = cpu_clock(this_cpu); t1 = cpu_clock(this_cpu); - SEQ_printf(m, "%-35s:%21Ld\n", + SEQ_printf(m, "%-45s:%21Ld\n", "clock-delta", (long long)(t1-t0)); } + + sched_show_numa(p, m); } void proc_sched_set_task(struct task_struct *p) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c6414fc669..fea7d3335e1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -26,6 +26,9 @@ #include <linux/slab.h> #include <linux/profile.h> #include <linux/interrupt.h> +#include <linux/mempolicy.h> +#include <linux/migrate.h> +#include <linux/task_work.h> #include <trace/events/sched.h> @@ -110,6 +113,24 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ + lw->weight += inc; + lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ + lw->weight -= dec; + lw->inv_weight = 0; +} + +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + /* * Increase the granularity value when there are more CPUs, * because with more CPUs the 'effective latency' as visible @@ -157,59 +178,61 @@ void sched_init_granularity(void) update_sysctl(); } -#if BITS_PER_LONG == 32 -# define WMULT_CONST (~0UL) -#else -# define WMULT_CONST (1UL << 32) -#endif - +#define WMULT_CONST (~0U) #define WMULT_SHIFT 32 -/* - * Shift right and round: - */ -#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) +static void __update_inv_weight(struct load_weight *lw) +{ + unsigned long w; + + if (likely(lw->inv_weight)) + return; + + w = scale_load_down(lw->weight); + + if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) + lw->inv_weight = 1; + else if (unlikely(!w)) + lw->inv_weight = WMULT_CONST; + else + lw->inv_weight = WMULT_CONST / w; +} /* - * delta *= weight / lw + * delta_exec * weight / lw.weight + * OR + * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT + * + * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case + * we're guaranteed shift stays positive because inv_weight is guaranteed to + * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22. + * + * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus + * weight/lw.weight <= 1, and therefore our shift will also be positive. */ -static unsigned long -calc_delta_mine(unsigned long delta_exec, unsigned long weight, - struct load_weight *lw) +static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) { - u64 tmp; + u64 fact = scale_load_down(weight); + int shift = WMULT_SHIFT; - /* - * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched - * entities since MIN_SHARES = 2. Treat weight as 1 if less than - * 2^SCHED_LOAD_RESOLUTION. - */ - if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) - tmp = (u64)delta_exec * scale_load_down(weight); - else - tmp = (u64)delta_exec; + __update_inv_weight(lw); - if (!lw->inv_weight) { - unsigned long w = scale_load_down(lw->weight); - - if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) - lw->inv_weight = 1; - else if (unlikely(!w)) - lw->inv_weight = WMULT_CONST; - else - lw->inv_weight = WMULT_CONST / w; + if (unlikely(fact >> 32)) { + while (fact >> 32) { + fact >>= 1; + shift--; + } } - /* - * Check whether we'd overflow the 64-bit multiplication: - */ - if (unlikely(tmp > WMULT_CONST)) - tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, - WMULT_SHIFT/2); - else - tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); + /* hint to use a 32x32->64 mul */ + fact = (u64)(u32)fact * lw->inv_weight; - return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); + while (fact >> 32) { + fact >>= 1; + shift--; + } + + return mul_u64_u32_shr(delta_exec, fact, shift); } @@ -259,6 +282,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, + int force_update); + static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { @@ -278,6 +304,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) } cfs_rq->on_list = 1; + /* We should have no load, but we need to update last_decay. */ + update_cfs_rq_blocked_load(cfs_rq, 0); } } @@ -294,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ -static inline int +static inline struct cfs_rq * is_same_group(struct sched_entity *se, struct sched_entity *pse) { if (se->cfs_rq == pse->cfs_rq) - return 1; + return se->cfs_rq; - return 0; + return NULL; } static inline struct sched_entity *parent_entity(struct sched_entity *se) @@ -308,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) return se->parent; } -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ - int depth = 0; - - for_each_sched_entity(se) - depth++; - - return depth; -} - static void find_matching_se(struct sched_entity **se, struct sched_entity **pse) { @@ -332,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) */ /* First walk up until both entities are at same depth */ - se_depth = depth_se(*se); - pse_depth = depth_se(*pse); + se_depth = (*se)->depth; + pse_depth = (*pse)->depth; while (se_depth > pse_depth) { se_depth--; @@ -398,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) -static inline int -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - return 1; -} - static inline struct sched_entity *parent_entity(struct sched_entity *se) { return NULL; @@ -416,20 +427,20 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #endif /* CONFIG_FAIR_GROUP_SCHED */ -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec); +static __always_inline +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: */ -static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) +static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) { - s64 delta = (s64)(vruntime - min_vruntime); + s64 delta = (s64)(vruntime - max_vruntime); if (delta > 0) - min_vruntime = vruntime; + max_vruntime = vruntime; - return min_vruntime; + return max_vruntime; } static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) @@ -465,6 +476,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) vruntime = min_vruntime(vruntime, se->vruntime); } + /* ensure we never gain time by being placed backwards. */ cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); #ifndef CONFIG_64BIT smp_wmb(); @@ -585,11 +597,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write, /* * delta /= w */ -static inline unsigned long -calc_delta_fair(unsigned long delta, struct sched_entity *se) +static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) { if (unlikely(se->load.weight != NICE_0_LOAD)) - delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); + delta = __calc_delta(delta, NICE_0_LOAD, &se->load); return delta; } @@ -597,7 +608,7 @@ calc_delta_fair(unsigned long delta, struct sched_entity *se) /* * The idea is to set a period in which each task runs once. * - * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch + * When there are too many tasks (sched_nr_latency) we have to stretch * this period because otherwise the slices get too small. * * p = (nr <= nl) ? l : l*nr/nl @@ -638,13 +649,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&lw, se->load.weight); load = &lw; } - slice = calc_delta_mine(slice, se->load.weight, load); + slice = __calc_delta(slice, se->load.weight, load); } return slice; } /* - * We calculate the vruntime slice of a to be inserted task + * We calculate the vruntime slice of a to-be-inserted task. * * vs = s/w */ @@ -653,55 +664,55 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); -static void update_cfs_shares(struct cfs_rq *cfs_rq); - -/* - * Update the current task's runtime statistics. Skip current tasks that - * are not in our scheduling class. - */ -static inline void -__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, - unsigned long delta_exec) -{ - unsigned long delta_exec_weighted; - - schedstat_set(curr->statistics.exec_max, - max((u64)delta_exec, curr->statistics.exec_max)); +#ifdef CONFIG_SMP +static unsigned long task_h_load(struct task_struct *p); - curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq, exec_clock, delta_exec); - delta_exec_weighted = calc_delta_fair(delta_exec, curr); +static inline void __update_task_entity_contrib(struct sched_entity *se); - curr->vruntime += delta_exec_weighted; - update_min_vruntime(cfs_rq); +/* Give new task start runnable values to heavy its load in infant time */ +void init_task_runnable_average(struct task_struct *p) +{ + u32 slice; -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED - cfs_rq->load_unacc_exec_time += delta_exec; -#endif + p->se.avg.decay_count = 0; + slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; + p->se.avg.runnable_avg_sum = slice; + p->se.avg.runnable_avg_period = slice; + __update_task_entity_contrib(&p->se); +} +#else +void init_task_runnable_average(struct task_struct *p) +{ } +#endif +/* + * Update the current task's runtime statistics. + */ static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_of(cfs_rq)->clock_task; - unsigned long delta_exec; + u64 now = rq_clock_task(rq_of(cfs_rq)); + u64 delta_exec; if (unlikely(!curr)) return; - /* - * Get the amount of time the current task was running - * since the last time we changed load (this cannot - * overflow on 32 bits): - */ - delta_exec = (unsigned long)(now - curr->exec_start); - if (!delta_exec) + delta_exec = now - curr->exec_start; + if (unlikely((s64)delta_exec <= 0)) return; - __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; + schedstat_set(curr->statistics.exec_max, + max(delta_exec, curr->statistics.exec_max)); + + curr->sum_exec_runtime += delta_exec; + schedstat_add(cfs_rq, exec_clock, delta_exec); + + curr->vruntime += calc_delta_fair(delta_exec, curr); + update_min_vruntime(cfs_rq); + if (entity_is_task(curr)) { struct task_struct *curtask = task_of(curr); @@ -716,7 +727,7 @@ static void update_curr(struct cfs_rq *cfs_rq) static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); + schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); } /* @@ -736,14 +747,14 @@ static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, - rq_of(cfs_rq)->clock - se->statistics.wait_start)); + rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + - rq_of(cfs_rq)->clock - se->statistics.wait_start); + rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { trace_sched_stat_wait(task_of(se), - rq_of(cfs_rq)->clock - se->statistics.wait_start); + rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); } #endif schedstat_set(se->statistics.wait_start, 0); @@ -769,119 +780,1292 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * We are starting a new run period: */ - se->exec_start = rq_of(cfs_rq)->clock_task; + se->exec_start = rq_clock_task(rq_of(cfs_rq)); } /************************************************** * Scheduling class queueing methods: */ -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +#ifdef CONFIG_NUMA_BALANCING +/* + * Approximate time to scan a full NUMA task in ms. The task scan period is + * calculated based on the tasks virtual memory size and + * numa_balancing_scan_size. + */ +unsigned int sysctl_numa_balancing_scan_period_min = 1000; +unsigned int sysctl_numa_balancing_scan_period_max = 60000; + +/* Portion of address space to scan in MB */ +unsigned int sysctl_numa_balancing_scan_size = 256; + +/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ +unsigned int sysctl_numa_balancing_scan_delay = 1000; + +static unsigned int task_nr_scan_windows(struct task_struct *p) { - cfs_rq->task_weight += weight; + unsigned long rss = 0; + unsigned long nr_scan_pages; + + /* + * Calculations based on RSS as non-present and empty pages are skipped + * by the PTE scanner and NUMA hinting faults should be trapped based + * on resident pages + */ + nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); + rss = get_mm_rss(p->mm); + if (!rss) + rss = nr_scan_pages; + + rss = round_up(rss, nr_scan_pages); + return rss / nr_scan_pages; } -#else -static inline void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) + +/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ +#define MAX_SCAN_WINDOW 2560 + +static unsigned int task_scan_min(struct task_struct *p) { + unsigned int scan, floor; + unsigned int windows = 1; + + if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) + windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; + floor = 1000 / windows; + + scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); + return max_t(unsigned int, floor, scan); } -#endif -static void -account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +static unsigned int task_scan_max(struct task_struct *p) { - update_load_add(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - update_load_add(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) { - add_cfs_task_weight(cfs_rq, se->load.weight); - list_add(&se->group_node, &cfs_rq->tasks); + unsigned int smin = task_scan_min(p); + unsigned int smax; + + /* Watch for min being lower than max due to floor calculations */ + smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); + return max(smin, smax); +} + +static void account_numa_enqueue(struct rq *rq, struct task_struct *p) +{ + rq->nr_numa_running += (p->numa_preferred_nid != -1); + rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); +} + +static void account_numa_dequeue(struct rq *rq, struct task_struct *p) +{ + rq->nr_numa_running -= (p->numa_preferred_nid != -1); + rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); +} + +struct numa_group { + atomic_t refcount; + + spinlock_t lock; /* nr_tasks, tasks */ + int nr_tasks; + pid_t gid; + struct list_head task_list; + + struct rcu_head rcu; + nodemask_t active_nodes; + unsigned long total_faults; + /* + * Faults_cpu is used to decide whether memory should move + * towards the CPU. As a consequence, these stats are weighted + * more by CPU use than by memory faults. + */ + unsigned long *faults_cpu; + unsigned long faults[0]; +}; + +/* Shared or private faults. */ +#define NR_NUMA_HINT_FAULT_TYPES 2 + +/* Memory and CPU locality */ +#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2) + +/* Averaged statistics, and temporary buffers. */ +#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2) + +pid_t task_numa_group_id(struct task_struct *p) +{ + return p->numa_group ? p->numa_group->gid : 0; +} + +static inline int task_faults_idx(int nid, int priv) +{ + return NR_NUMA_HINT_FAULT_TYPES * nid + priv; +} + +static inline unsigned long task_faults(struct task_struct *p, int nid) +{ + if (!p->numa_faults_memory) + return 0; + + return p->numa_faults_memory[task_faults_idx(nid, 0)] + + p->numa_faults_memory[task_faults_idx(nid, 1)]; +} + +static inline unsigned long group_faults(struct task_struct *p, int nid) +{ + if (!p->numa_group) + return 0; + + return p->numa_group->faults[task_faults_idx(nid, 0)] + + p->numa_group->faults[task_faults_idx(nid, 1)]; +} + +static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) +{ + return group->faults_cpu[task_faults_idx(nid, 0)] + + group->faults_cpu[task_faults_idx(nid, 1)]; +} + +/* + * These return the fraction of accesses done by a particular task, or + * task group, on a particular numa node. The group weight is given a + * larger multiplier, in order to group tasks together that are almost + * evenly spread out between numa nodes. + */ +static inline unsigned long task_weight(struct task_struct *p, int nid) +{ + unsigned long total_faults; + + if (!p->numa_faults_memory) + return 0; + + total_faults = p->total_numa_faults; + + if (!total_faults) + return 0; + + return 1000 * task_faults(p, nid) / total_faults; +} + +static inline unsigned long group_weight(struct task_struct *p, int nid) +{ + if (!p->numa_group || !p->numa_group->total_faults) + return 0; + + return 1000 * group_faults(p, nid) / p->numa_group->total_faults; +} + +bool should_numa_migrate_memory(struct task_struct *p, struct page * page, + int src_nid, int dst_cpu) +{ + struct numa_group *ng = p->numa_group; + int dst_nid = cpu_to_node(dst_cpu); + int last_cpupid, this_cpupid; + + this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); + + /* + * Multi-stage node selection is used in conjunction with a periodic + * migration fault to build a temporal task<->page relation. By using + * a two-stage filter we remove short/unlikely relations. + * + * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate + * a task's usage of a particular page (n_p) per total usage of this + * page (n_t) (in a given time-span) to a probability. + * + * Our periodic faults will sample this probability and getting the + * same result twice in a row, given these samples are fully + * independent, is then given by P(n)^2, provided our sample period + * is sufficiently short compared to the usage pattern. + * + * This quadric squishes small probabilities, making it less likely we + * act on an unlikely task<->page relation. + */ + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + if (!cpupid_pid_unset(last_cpupid) && + cpupid_to_nid(last_cpupid) != dst_nid) + return false; + + /* Always allow migrate on private faults */ + if (cpupid_match_pid(p, last_cpupid)) + return true; + + /* A shared fault, but p->numa_group has not been set up yet. */ + if (!ng) + return true; + + /* + * Do not migrate if the destination is not a node that + * is actively used by this numa group. + */ + if (!node_isset(dst_nid, ng->active_nodes)) + return false; + + /* + * Source is a node that is not actively used by this + * numa group, while the destination is. Migrate. + */ + if (!node_isset(src_nid, ng->active_nodes)) + return true; + + /* + * Both source and destination are nodes in active + * use by this numa group. Maximize memory bandwidth + * by migrating from more heavily used groups, to less + * heavily used ones, spreading the load around. + * Use a 1/4 hysteresis to avoid spurious page movement. + */ + return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); +} + +static unsigned long weighted_cpuload(const int cpu); +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static unsigned long capacity_of(int cpu); +static long effective_load(struct task_group *tg, int cpu, long wl, long wg); + +/* Cached statistics for all CPUs within a node */ +struct numa_stats { + unsigned long nr_running; + unsigned long load; + + /* Total compute capacity of CPUs on a node */ + unsigned long compute_capacity; + + /* Approximate capacity in terms of runnable tasks on a node */ + unsigned long task_capacity; + int has_free_capacity; +}; + +/* + * XXX borrowed from update_sg_lb_stats + */ +static void update_numa_stats(struct numa_stats *ns, int nid) +{ + int cpu, cpus = 0; + + memset(ns, 0, sizeof(*ns)); + for_each_cpu(cpu, cpumask_of_node(nid)) { + struct rq *rq = cpu_rq(cpu); + + ns->nr_running += rq->nr_running; + ns->load += weighted_cpuload(cpu); + ns->compute_capacity += capacity_of(cpu); + + cpus++; } - cfs_rq->nr_running++; + + /* + * If we raced with hotplug and there are no CPUs left in our mask + * the @ns structure is NULL'ed and task_numa_compare() will + * not find this node attractive. + * + * We'll either bail at !has_free_capacity, or we'll detect a huge + * imbalance and bail there. + */ + if (!cpus) + return; + + ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity; + ns->task_capacity = + DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); + ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } -static void -account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +struct task_numa_env { + struct task_struct *p; + + int src_cpu, src_nid; + int dst_cpu, dst_nid; + + struct numa_stats src_stats, dst_stats; + + int imbalance_pct; + + struct task_struct *best_task; + long best_imp; + int best_cpu; +}; + +static void task_numa_assign(struct task_numa_env *env, + struct task_struct *p, long imp) { - update_load_sub(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) { - add_cfs_task_weight(cfs_rq, -se->load.weight); - list_del_init(&se->group_node); + if (env->best_task) + put_task_struct(env->best_task); + if (p) + get_task_struct(p); + + env->best_task = p; + env->best_imp = imp; + env->best_cpu = env->dst_cpu; +} + +static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, + long src_load, long dst_load, + struct task_numa_env *env) +{ + long imb, old_imb; + + /* We care about the slope of the imbalance, not the direction. */ + if (dst_load < src_load) + swap(dst_load, src_load); + + /* Is the difference below the threshold? */ + imb = dst_load * 100 - src_load * env->imbalance_pct; + if (imb <= 0) + return false; + + /* + * The imbalance is above the allowed threshold. + * Compare it with the old imbalance. + */ + if (orig_dst_load < orig_src_load) + swap(orig_dst_load, orig_src_load); + + old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + + /* Would this change make things worse? */ + return (imb > old_imb); +} + +/* + * This checks if the overall compute and NUMA accesses of the system would + * be improved if the source tasks was migrated to the target dst_cpu taking + * into account that it might be best if task running on the dst_cpu should + * be exchanged with the source task + */ +static void task_numa_compare(struct task_numa_env *env, + long taskimp, long groupimp) +{ + struct rq *src_rq = cpu_rq(env->src_cpu); + struct rq *dst_rq = cpu_rq(env->dst_cpu); + struct task_struct *cur; + long orig_src_load, src_load; + long orig_dst_load, dst_load; + long load; + long imp = (groupimp > 0) ? groupimp : taskimp; + + rcu_read_lock(); + cur = ACCESS_ONCE(dst_rq->curr); + if (cur->pid == 0) /* idle */ + cur = NULL; + + /* + * "imp" is the fault differential for the source task between the + * source and destination node. Calculate the total differential for + * the source task and potential destination task. The more negative + * the value is, the more rmeote accesses that would be expected to + * be incurred if the tasks were swapped. + */ + if (cur) { + /* Skip this swap candidate if cannot move to the source cpu */ + if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur))) + goto unlock; + + /* + * If dst and source tasks are in the same NUMA group, or not + * in any group then look only at task weights. + */ + if (cur->numa_group == env->p->numa_group) { + imp = taskimp + task_weight(cur, env->src_nid) - + task_weight(cur, env->dst_nid); + /* + * Add some hysteresis to prevent swapping the + * tasks within a group over tiny differences. + */ + if (cur->numa_group) + imp -= imp/16; + } else { + /* + * Compare the group weights. If a task is all by + * itself (not part of a group), use the task weight + * instead. + */ + if (env->p->numa_group) + imp = groupimp; + else + imp = taskimp; + + if (cur->numa_group) + imp += group_weight(cur, env->src_nid) - + group_weight(cur, env->dst_nid); + else + imp += task_weight(cur, env->src_nid) - + task_weight(cur, env->dst_nid); + } } - cfs_rq->nr_running--; + + if (imp < env->best_imp) + goto unlock; + + if (!cur) { + /* Is there capacity at our destination? */ + if (env->src_stats.has_free_capacity && + !env->dst_stats.has_free_capacity) + goto unlock; + + goto balance; + } + + /* Balance doesn't matter much if we're running a task per cpu */ + if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) + goto assign; + + /* + * In the overloaded case, try and keep the load balanced. + */ +balance: + orig_dst_load = env->dst_stats.load; + orig_src_load = env->src_stats.load; + + /* XXX missing capacity terms */ + load = task_h_load(env->p); + dst_load = orig_dst_load + load; + src_load = orig_src_load - load; + + if (cur) { + load = task_h_load(cur); + dst_load -= load; + src_load += load; + } + + if (load_too_imbalanced(orig_src_load, orig_dst_load, + src_load, dst_load, env)) + goto unlock; + +assign: + task_numa_assign(env, cur, imp); +unlock: + rcu_read_unlock(); } -#ifdef CONFIG_FAIR_GROUP_SCHED -/* we need this in update_cfs_load and load-balance functions below */ -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -# ifdef CONFIG_SMP -static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, - int global_update) +static void task_numa_find_cpu(struct task_numa_env *env, + long taskimp, long groupimp) { - struct task_group *tg = cfs_rq->tg; - long load_avg; + int cpu; + + for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { + /* Skip this CPU if the source task cannot migrate */ + if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p))) + continue; + + env->dst_cpu = cpu; + task_numa_compare(env, taskimp, groupimp); + } +} + +static int task_numa_migrate(struct task_struct *p) +{ + struct task_numa_env env = { + .p = p, + + .src_cpu = task_cpu(p), + .src_nid = task_node(p), + + .imbalance_pct = 112, + + .best_task = NULL, + .best_imp = 0, + .best_cpu = -1 + }; + struct sched_domain *sd; + unsigned long taskweight, groupweight; + int nid, ret; + long taskimp, groupimp; + + /* + * Pick the lowest SD_NUMA domain, as that would have the smallest + * imbalance and would be the first to start moving tasks about. + * + * And we want to avoid any moving of tasks about, as that would create + * random movement of tasks -- counter the numa conditions we're trying + * to satisfy here. + */ + rcu_read_lock(); + sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); + if (sd) + env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; + rcu_read_unlock(); + + /* + * Cpusets can break the scheduler domain tree into smaller + * balance domains, some of which do not cross NUMA boundaries. + * Tasks that are "trapped" in such domains cannot be migrated + * elsewhere, so there is no point in (re)trying. + */ + if (unlikely(!sd)) { + p->numa_preferred_nid = task_node(p); + return -EINVAL; + } + + taskweight = task_weight(p, env.src_nid); + groupweight = group_weight(p, env.src_nid); + update_numa_stats(&env.src_stats, env.src_nid); + env.dst_nid = p->numa_preferred_nid; + taskimp = task_weight(p, env.dst_nid) - taskweight; + groupimp = group_weight(p, env.dst_nid) - groupweight; + update_numa_stats(&env.dst_stats, env.dst_nid); + + /* If the preferred nid has free capacity, try to use it. */ + if (env.dst_stats.has_free_capacity) + task_numa_find_cpu(&env, taskimp, groupimp); + + /* No space available on the preferred nid. Look elsewhere. */ + if (env.best_cpu == -1) { + for_each_online_node(nid) { + if (nid == env.src_nid || nid == p->numa_preferred_nid) + continue; + + /* Only consider nodes where both task and groups benefit */ + taskimp = task_weight(p, nid) - taskweight; + groupimp = group_weight(p, nid) - groupweight; + if (taskimp < 0 && groupimp < 0) + continue; + + env.dst_nid = nid; + update_numa_stats(&env.dst_stats, env.dst_nid); + task_numa_find_cpu(&env, taskimp, groupimp); + } + } + + /* No better CPU than the current one was found. */ + if (env.best_cpu == -1) + return -EAGAIN; + + /* + * If the task is part of a workload that spans multiple NUMA nodes, + * and is migrating into one of the workload's active nodes, remember + * this node as the task's preferred numa node, so the workload can + * settle down. + * A task that migrated to a second choice node will be better off + * trying for a better one later. Do not set the preferred node here. + */ + if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) + sched_setnuma(p, env.dst_nid); + + /* + * Reset the scan period if the task is being rescheduled on an + * alternative node to recheck if the tasks is now properly placed. + */ + p->numa_scan_period = task_scan_min(p); + + if (env.best_task == NULL) { + ret = migrate_task_to(p, env.best_cpu); + if (ret != 0) + trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); + return ret; + } + + ret = migrate_swap(p, env.best_task); + if (ret != 0) + trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); + put_task_struct(env.best_task); + return ret; +} + +/* Attempt to migrate a task to a CPU on the preferred node. */ +static void numa_migrate_preferred(struct task_struct *p) +{ + unsigned long interval = HZ; + + /* This task has no NUMA fault statistics yet */ + if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) + return; + + /* Periodically retry migrating the task to the preferred node */ + interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); + p->numa_migrate_retry = jiffies + interval; + + /* Success if task is already running on preferred CPU */ + if (task_node(p) == p->numa_preferred_nid) + return; + + /* Otherwise, try migrate to a CPU on the preferred node */ + task_numa_migrate(p); +} + +/* + * Find the nodes on which the workload is actively running. We do this by + * tracking the nodes from which NUMA hinting faults are triggered. This can + * be different from the set of nodes where the workload's memory is currently + * located. + * + * The bitmask is used to make smarter decisions on when to do NUMA page + * migrations, To prevent flip-flopping, and excessive page migrations, nodes + * are added when they cause over 6/16 of the maximum number of faults, but + * only removed when they drop below 3/16. + */ +static void update_numa_active_node_mask(struct numa_group *numa_group) +{ + unsigned long faults, max_faults = 0; + int nid; + + for_each_online_node(nid) { + faults = group_faults_cpu(numa_group, nid); + if (faults > max_faults) + max_faults = faults; + } + + for_each_online_node(nid) { + faults = group_faults_cpu(numa_group, nid); + if (!node_isset(nid, numa_group->active_nodes)) { + if (faults > max_faults * 6 / 16) + node_set(nid, numa_group->active_nodes); + } else if (faults < max_faults * 3 / 16) + node_clear(nid, numa_group->active_nodes); + } +} + +/* + * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS + * increments. The more local the fault statistics are, the higher the scan + * period will be for the next scan window. If local/remote ratio is below + * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the + * scan period will decrease + */ +#define NUMA_PERIOD_SLOTS 10 +#define NUMA_PERIOD_THRESHOLD 3 + +/* + * Increase the scan period (slow down scanning) if the majority of + * our memory is already on our local node, or if the majority of + * the page accesses are shared with other processes. + * Otherwise, decrease the scan period. + */ +static void update_task_scan_period(struct task_struct *p, + unsigned long shared, unsigned long private) +{ + unsigned int period_slot; + int ratio; + int diff; + + unsigned long remote = p->numa_faults_locality[0]; + unsigned long local = p->numa_faults_locality[1]; + + /* + * If there were no record hinting faults then either the task is + * completely idle or all activity is areas that are not of interest + * to automatic numa balancing. Scan slower + */ + if (local + shared == 0) { + p->numa_scan_period = min(p->numa_scan_period_max, + p->numa_scan_period << 1); + + p->mm->numa_next_scan = jiffies + + msecs_to_jiffies(p->numa_scan_period); + + return; + } + + /* + * Prepare to scale scan period relative to the current period. + * == NUMA_PERIOD_THRESHOLD scan period stays the same + * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) + * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) + */ + period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); + ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); + if (ratio >= NUMA_PERIOD_THRESHOLD) { + int slot = ratio - NUMA_PERIOD_THRESHOLD; + if (!slot) + slot = 1; + diff = slot * period_slot; + } else { + diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; + + /* + * Scale scan rate increases based on sharing. There is an + * inverse relationship between the degree of sharing and + * the adjustment made to the scanning period. Broadly + * speaking the intent is that there is little point + * scanning faster if shared accesses dominate as it may + * simply bounce migrations uselessly + */ + ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); + diff = (diff * ratio) / NUMA_PERIOD_SLOTS; + } + + p->numa_scan_period = clamp(p->numa_scan_period + diff, + task_scan_min(p), task_scan_max(p)); + memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); +} + +/* + * Get the fraction of time the task has been running since the last + * NUMA placement cycle. The scheduler keeps similar statistics, but + * decays those on a 32ms period, which is orders of magnitude off + * from the dozens-of-seconds NUMA balancing period. Use the scheduler + * stats only if the task is so new there are no NUMA statistics yet. + */ +static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) +{ + u64 runtime, delta, now; + /* Use the start of this time slice to avoid calculations. */ + now = p->se.exec_start; + runtime = p->se.sum_exec_runtime; + + if (p->last_task_numa_placement) { + delta = runtime - p->last_sum_exec_runtime; + *period = now - p->last_task_numa_placement; + } else { + delta = p->se.avg.runnable_avg_sum; + *period = p->se.avg.runnable_avg_period; + } + + p->last_sum_exec_runtime = runtime; + p->last_task_numa_placement = now; + + return delta; +} + +static void task_numa_placement(struct task_struct *p) +{ + int seq, nid, max_nid = -1, max_group_nid = -1; + unsigned long max_faults = 0, max_group_faults = 0; + unsigned long fault_types[2] = { 0, 0 }; + unsigned long total_faults; + u64 runtime, period; + spinlock_t *group_lock = NULL; + + seq = ACCESS_ONCE(p->mm->numa_scan_seq); + if (p->numa_scan_seq == seq) + return; + p->numa_scan_seq = seq; + p->numa_scan_period_max = task_scan_max(p); + + total_faults = p->numa_faults_locality[0] + + p->numa_faults_locality[1]; + runtime = numa_get_avg_runtime(p, &period); + + /* If the task is part of a group prevent parallel updates to group stats */ + if (p->numa_group) { + group_lock = &p->numa_group->lock; + spin_lock_irq(group_lock); + } + + /* Find the node with the highest number of faults */ + for_each_online_node(nid) { + unsigned long faults = 0, group_faults = 0; + int priv, i; + + for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { + long diff, f_diff, f_weight; + + i = task_faults_idx(nid, priv); + + /* Decay existing window, copy faults since last scan */ + diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; + fault_types[priv] += p->numa_faults_buffer_memory[i]; + p->numa_faults_buffer_memory[i] = 0; + + /* + * Normalize the faults_from, so all tasks in a group + * count according to CPU use, instead of by the raw + * number of faults. Tasks with little runtime have + * little over-all impact on throughput, and thus their + * faults are less important. + */ + f_weight = div64_u64(runtime << 16, period + 1); + f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / + (total_faults + 1); + f_diff = f_weight - p->numa_faults_cpu[i] / 2; + p->numa_faults_buffer_cpu[i] = 0; + + p->numa_faults_memory[i] += diff; + p->numa_faults_cpu[i] += f_diff; + faults += p->numa_faults_memory[i]; + p->total_numa_faults += diff; + if (p->numa_group) { + /* safe because we can only change our own group */ + p->numa_group->faults[i] += diff; + p->numa_group->faults_cpu[i] += f_diff; + p->numa_group->total_faults += diff; + group_faults += p->numa_group->faults[i]; + } + } + + if (faults > max_faults) { + max_faults = faults; + max_nid = nid; + } + + if (group_faults > max_group_faults) { + max_group_faults = group_faults; + max_group_nid = nid; + } + } + + update_task_scan_period(p, fault_types[0], fault_types[1]); + + if (p->numa_group) { + update_numa_active_node_mask(p->numa_group); + /* + * If the preferred task and group nids are different, + * iterate over the nodes again to find the best place. + */ + if (max_nid != max_group_nid) { + unsigned long weight, max_weight = 0; + + for_each_online_node(nid) { + weight = task_weight(p, nid) + group_weight(p, nid); + if (weight > max_weight) { + max_weight = weight; + max_nid = nid; + } + } + } + + spin_unlock_irq(group_lock); + } + + /* Preferred node as the node with the most faults */ + if (max_faults && max_nid != p->numa_preferred_nid) { + /* Update the preferred nid and migrate task if possible */ + sched_setnuma(p, max_nid); + numa_migrate_preferred(p); + } +} + +static inline int get_numa_group(struct numa_group *grp) +{ + return atomic_inc_not_zero(&grp->refcount); +} + +static inline void put_numa_group(struct numa_group *grp) +{ + if (atomic_dec_and_test(&grp->refcount)) + kfree_rcu(grp, rcu); +} + +static void task_numa_group(struct task_struct *p, int cpupid, int flags, + int *priv) +{ + struct numa_group *grp, *my_grp; + struct task_struct *tsk; + bool join = false; + int cpu = cpupid_to_cpu(cpupid); + int i; + + if (unlikely(!p->numa_group)) { + unsigned int size = sizeof(struct numa_group) + + 4*nr_node_ids*sizeof(unsigned long); + + grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!grp) + return; + + atomic_set(&grp->refcount, 1); + spin_lock_init(&grp->lock); + INIT_LIST_HEAD(&grp->task_list); + grp->gid = p->pid; + /* Second half of the array tracks nids where faults happen */ + grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * + nr_node_ids; + + node_set(task_node(current), grp->active_nodes); + + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) + grp->faults[i] = p->numa_faults_memory[i]; + + grp->total_faults = p->total_numa_faults; + + list_add(&p->numa_entry, &grp->task_list); + grp->nr_tasks++; + rcu_assign_pointer(p->numa_group, grp); + } - load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); - load_avg -= cfs_rq->load_contribution; + rcu_read_lock(); + tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); + + if (!cpupid_match_pid(tsk, cpupid)) + goto no_join; + + grp = rcu_dereference(tsk->numa_group); + if (!grp) + goto no_join; + + my_grp = p->numa_group; + if (grp == my_grp) + goto no_join; + + /* + * Only join the other group if its bigger; if we're the bigger group, + * the other task will join us. + */ + if (my_grp->nr_tasks > grp->nr_tasks) + goto no_join; + + /* + * Tie-break on the grp address. + */ + if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) + goto no_join; + + /* Always join threads in the same process. */ + if (tsk->mm == current->mm) + join = true; + + /* Simple filter to avoid false positives due to PID collisions */ + if (flags & TNF_SHARED) + join = true; + + /* Update priv based on whether false sharing was detected */ + *priv = !join; + + if (join && !get_numa_group(grp)) + goto no_join; - if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { - atomic_add(load_avg, &tg->load_weight); - cfs_rq->load_contribution += load_avg; + rcu_read_unlock(); + + if (!join) + return; + + BUG_ON(irqs_disabled()); + double_lock_irq(&my_grp->lock, &grp->lock); + + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { + my_grp->faults[i] -= p->numa_faults_memory[i]; + grp->faults[i] += p->numa_faults_memory[i]; } + my_grp->total_faults -= p->total_numa_faults; + grp->total_faults += p->total_numa_faults; + + list_move(&p->numa_entry, &grp->task_list); + my_grp->nr_tasks--; + grp->nr_tasks++; + + spin_unlock(&my_grp->lock); + spin_unlock_irq(&grp->lock); + + rcu_assign_pointer(p->numa_group, grp); + + put_numa_group(my_grp); + return; + +no_join: + rcu_read_unlock(); + return; } -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +void task_numa_free(struct task_struct *p) { - u64 period = sysctl_sched_shares_window; - u64 now, delta; - unsigned long load = cfs_rq->load.weight; + struct numa_group *grp = p->numa_group; + void *numa_faults = p->numa_faults_memory; + unsigned long flags; + int i; + + if (grp) { + spin_lock_irqsave(&grp->lock, flags); + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) + grp->faults[i] -= p->numa_faults_memory[i]; + grp->total_faults -= p->total_numa_faults; + + list_del(&p->numa_entry); + grp->nr_tasks--; + spin_unlock_irqrestore(&grp->lock, flags); + rcu_assign_pointer(p->numa_group, NULL); + put_numa_group(grp); + } + + p->numa_faults_memory = NULL; + p->numa_faults_buffer_memory = NULL; + p->numa_faults_cpu= NULL; + p->numa_faults_buffer_cpu = NULL; + kfree(numa_faults); +} + +/* + * Got a PROT_NONE fault for a page on @node. + */ +void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) +{ + struct task_struct *p = current; + bool migrated = flags & TNF_MIGRATED; + int cpu_node = task_node(current); + int local = !!(flags & TNF_FAULT_LOCAL); + int priv; + + if (!numabalancing_enabled) + return; + + /* for example, ksmd faulting in a user's mm */ + if (!p->mm) + return; - if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) + /* Do not worry about placement if exiting */ + if (p->state == TASK_DEAD) return; - now = rq_of(cfs_rq)->clock_task; - delta = now - cfs_rq->load_stamp; + /* Allocate buffer to track faults on a per-node basis */ + if (unlikely(!p->numa_faults_memory)) { + int size = sizeof(*p->numa_faults_memory) * + NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; + + p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); + if (!p->numa_faults_memory) + return; - /* truncate load history at 4 idle periods */ - if (cfs_rq->load_stamp > cfs_rq->load_last && - now - cfs_rq->load_last > 4 * period) { - cfs_rq->load_period = 0; - cfs_rq->load_avg = 0; - delta = period - 1; + BUG_ON(p->numa_faults_buffer_memory); + /* + * The averaged statistics, shared & private, memory & cpu, + * occupy the first half of the array. The second half of the + * array is for current counters, which are averaged into the + * first set by task_numa_placement. + */ + p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); + p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); + p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); + p->total_numa_faults = 0; + memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } - cfs_rq->load_stamp = now; - cfs_rq->load_unacc_exec_time = 0; - cfs_rq->load_period += delta; - if (load) { - cfs_rq->load_last = now; - cfs_rq->load_avg += delta * load; + /* + * First accesses are treated as private, otherwise consider accesses + * to be private if the accessing pid has not changed + */ + if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { + priv = 1; + } else { + priv = cpupid_match_pid(p, last_cpupid); + if (!priv && !(flags & TNF_NO_GROUP)) + task_numa_group(p, last_cpupid, flags, &priv); } - /* consider updating load contribution on each fold or truncate */ - if (global_update || cfs_rq->load_period > period - || !cfs_rq->load_period) - update_cfs_rq_load_contribution(cfs_rq, global_update); + /* + * If a workload spans multiple NUMA nodes, a shared fault that + * occurs wholly within the set of nodes that the workload is + * actively using should be counted as local. This allows the + * scan rate to slow down when a workload has settled down. + */ + if (!priv && !local && p->numa_group && + node_isset(cpu_node, p->numa_group->active_nodes) && + node_isset(mem_node, p->numa_group->active_nodes)) + local = 1; + + task_numa_placement(p); + + /* + * Retry task to preferred node migration periodically, in case it + * case it previously failed, or the scheduler moved us. + */ + if (time_after(jiffies, p->numa_migrate_retry)) + numa_migrate_preferred(p); + + if (migrated) + p->numa_pages_migrated += pages; + + p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; + p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; + p->numa_faults_locality[local] += pages; +} + +static void reset_ptenuma_scan(struct task_struct *p) +{ + ACCESS_ONCE(p->mm->numa_scan_seq)++; + p->mm->numa_scan_offset = 0; +} + +/* + * The expensive part of numa migration is done from task_work context. + * Triggered from task_tick_numa(). + */ +void task_numa_work(struct callback_head *work) +{ + unsigned long migrate, next_scan, now = jiffies; + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + struct vm_area_struct *vma; + unsigned long start, end; + unsigned long nr_pte_updates = 0; + long pages; + + WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); + + work->next = work; /* protect against double add */ + /* + * Who cares about NUMA placement when they're dying. + * + * NOTE: make sure not to dereference p->mm before this check, + * exit_task_work() happens _after_ exit_mm() so we could be called + * without p->mm even though we still had it when we enqueued this + * work. + */ + if (p->flags & PF_EXITING) + return; + + if (!mm->numa_next_scan) { + mm->numa_next_scan = now + + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + } + + /* + * Enforce maximal scan/migration frequency.. + */ + migrate = mm->numa_next_scan; + if (time_before(now, migrate)) + return; + + if (p->numa_scan_period == 0) { + p->numa_scan_period_max = task_scan_max(p); + p->numa_scan_period = task_scan_min(p); + } + + next_scan = now + msecs_to_jiffies(p->numa_scan_period); + if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) + return; + + /* + * Delay this task enough that another task of this mm will likely win + * the next time around. + */ + p->node_stamp += 2 * TICK_NSEC; + + start = mm->numa_scan_offset; + pages = sysctl_numa_balancing_scan_size; + pages <<= 20 - PAGE_SHIFT; /* MB in pages */ + if (!pages) + return; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, start); + if (!vma) { + reset_ptenuma_scan(p); + start = 0; + vma = mm->mmap; + } + for (; vma; vma = vma->vm_next) { + if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) + continue; + + /* + * Shared library pages mapped by multiple processes are not + * migrated as it is expected they are cache replicated. Avoid + * hinting faults in read-only file-backed mappings or the vdso + * as migrating the pages will be of marginal benefit. + */ + if (!vma->vm_mm || + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + continue; - while (cfs_rq->load_period > period) { /* - * Inline assembly required to prevent the compiler - * optimising this loop into a divmod call. - * See __iter_div_u64_rem() for another example of this. + * Skip inaccessible VMAs to avoid any confusion between + * PROT_NONE and NUMA hinting ptes */ - asm("" : "+rm" (cfs_rq->load_period)); - cfs_rq->load_period /= 2; - cfs_rq->load_avg /= 2; + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + continue; + + do { + start = max(start, vma->vm_start); + end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); + end = min(end, vma->vm_end); + nr_pte_updates += change_prot_numa(vma, start, end); + + /* + * Scan sysctl_numa_balancing_scan_size but ensure that + * at least one PTE is updated so that unused virtual + * address space is quickly skipped. + */ + if (nr_pte_updates) + pages -= (end - start) >> PAGE_SHIFT; + + start = end; + if (pages <= 0) + goto out; + + cond_resched(); + } while (end != vma->vm_end); + } + +out: + /* + * It is possible to reach the end of the VMA list but the last few + * VMAs are not guaranteed to the vma_migratable. If they are not, we + * would find the !migratable VMA on the next scan but not reset the + * scanner to the start so check it now. + */ + if (vma) + mm->numa_scan_offset = start; + else + reset_ptenuma_scan(p); + up_read(&mm->mmap_sem); +} + +/* + * Drive the periodic memory faults.. + */ +void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ + struct callback_head *work = &curr->numa_work; + u64 period, now; + + /* + * We don't care about NUMA placement if we don't have memory. + */ + if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) + return; + + /* + * Using runtime rather than walltime has the dual advantage that + * we (mostly) drive the selection from busy threads and that the + * task needs to have done some actual work before we bother with + * NUMA placement. + */ + now = curr->se.sum_exec_runtime; + period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; + + if (now - curr->node_stamp > period) { + if (!curr->node_stamp) + curr->numa_scan_period = task_scan_min(curr); + curr->node_stamp += period; + + if (!time_before(jiffies, curr->mm->numa_next_scan)) { + init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ + task_work_add(curr, work, true); + } } +} +#else +static void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ +} - if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) - list_del_leaf_cfs_rq(cfs_rq); +static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) +{ +} + +static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) +{ } +#endif /* CONFIG_NUMA_BALANCING */ +static void +account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_add(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + update_load_add(&rq_of(cfs_rq)->load, se->load.weight); +#ifdef CONFIG_SMP + if (entity_is_task(se)) { + struct rq *rq = rq_of(cfs_rq); + + account_numa_enqueue(rq, task_of(se)); + list_add(&se->group_node, &rq->cfs_tasks); + } +#endif + cfs_rq->nr_running++; +} + +static void +account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_sub(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); + if (entity_is_task(se)) { + account_numa_dequeue(rq_of(cfs_rq), task_of(se)); + list_del_init(&se->group_node); + } + cfs_rq->nr_running--; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +# ifdef CONFIG_SMP static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) { long tg_weight; @@ -891,8 +2075,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) * to gain a more accurate current total weight. See * update_cfs_rq_load_contribution(). */ - tg_weight = atomic_read(&tg->load_weight); - tg_weight -= cfs_rq->load_contribution; + tg_weight = atomic_long_read(&tg->load_avg); + tg_weight -= cfs_rq->tg_load_contrib; tg_weight += cfs_rq->load.weight; return tg_weight; @@ -916,27 +2100,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) return shares; } - -static void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); - } -} # else /* CONFIG_SMP */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ -} - static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { return tg->shares; } - -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ -} # endif /* CONFIG_SMP */ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) @@ -954,6 +2122,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, account_entity_enqueue(cfs_rq, se); } +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); + static void update_cfs_shares(struct cfs_rq *cfs_rq) { struct task_group *tg; @@ -973,19 +2143,513 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) { } +#endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +#ifdef CONFIG_SMP +/* + * We choose a half-life close to 1 scheduling period. + * Note: The tables below are dependent on this value. + */ +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ + +/* Precomputed fixed inverse multiplies for multiplication by y^n */ +static const u32 runnable_avg_yN_inv[] = { + 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, + 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, + 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, + 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, + 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, + 0x85aac367, 0x82cd8698, +}; + +/* + * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent + * over-estimates when re-combining. + */ +static const u32 runnable_avg_yN_sum[] = { + 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, + 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, + 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, +}; + +/* + * Approximate: + * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) + */ +static __always_inline u64 decay_load(u64 val, u64 n) +{ + unsigned int local_n; + + if (!n) + return val; + else if (unlikely(n > LOAD_AVG_PERIOD * 63)) + return 0; + + /* after bounds checking we can collapse to 32-bit */ + local_n = n; + + /* + * As y^PERIOD = 1/2, we can combine + * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) + * With a look-up table which covers k^n (n<PERIOD) + * + * To achieve constant time decay_load. + */ + if (unlikely(local_n >= LOAD_AVG_PERIOD)) { + val >>= local_n / LOAD_AVG_PERIOD; + local_n %= LOAD_AVG_PERIOD; + } + + val *= runnable_avg_yN_inv[local_n]; + /* We don't use SRR here since we always want to round down. */ + return val >> 32; +} + +/* + * For updates fully spanning n periods, the contribution to runnable + * average will be: \Sum 1024*y^n + * + * We can compute this reasonably efficiently by combining: + * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} + */ +static u32 __compute_runnable_contrib(u64 n) +{ + u32 contrib = 0; + + if (likely(n <= LOAD_AVG_PERIOD)) + return runnable_avg_yN_sum[n]; + else if (unlikely(n >= LOAD_AVG_MAX_N)) + return LOAD_AVG_MAX; + + /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */ + do { + contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ + contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; + + n -= LOAD_AVG_PERIOD; + } while (n > LOAD_AVG_PERIOD); + + contrib = decay_load(contrib, n); + return contrib + runnable_avg_yN_sum[n]; +} + +/* + * We can represent the historical contribution to runnable average as the + * coefficients of a geometric series. To do this we sub-divide our runnable + * history into segments of approximately 1ms (1024us); label the segment that + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. + * + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... + * p0 p1 p2 + * (now) (~1ms ago) (~2ms ago) + * + * Let u_i denote the fraction of p_i that the entity was runnable. + * + * We then designate the fractions u_i as our co-efficients, yielding the + * following representation of historical load: + * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... + * + * We choose y based on the with of a reasonably scheduling period, fixing: + * y^32 = 0.5 + * + * This means that the contribution to load ~32ms ago (u_32) will be weighted + * approximately half as much as the contribution to load within the last ms + * (u_0). + * + * When a period "rolls over" and we have new u_0`, multiplying the previous + * sum again by y is sufficient to update: + * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) + * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] + */ +static __always_inline int __update_entity_runnable_avg(u64 now, + struct sched_avg *sa, + int runnable) { + u64 delta, periods; + u32 runnable_contrib; + int delta_w, decayed = 0; + + delta = now - sa->last_runnable_update; + /* + * This should only happen when time goes backwards, which it + * unfortunately does during sched clock init when we swap over to TSC. + */ + if ((s64)delta < 0) { + sa->last_runnable_update = now; + return 0; + } + + /* + * Use 1024ns as the unit of measurement since it's a reasonable + * approximation of 1us and fast to compute. + */ + delta >>= 10; + if (!delta) + return 0; + sa->last_runnable_update = now; + + /* delta_w is the amount already accumulated against our next period */ + delta_w = sa->runnable_avg_period % 1024; + if (delta + delta_w >= 1024) { + /* period roll-over */ + decayed = 1; + + /* + * Now that we know we're crossing a period boundary, figure + * out how much from delta we need to complete the current + * period and accrue it. + */ + delta_w = 1024 - delta_w; + if (runnable) + sa->runnable_avg_sum += delta_w; + sa->runnable_avg_period += delta_w; + + delta -= delta_w; + + /* Figure out how many additional periods this update spans */ + periods = delta / 1024; + delta %= 1024; + + sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, + periods + 1); + sa->runnable_avg_period = decay_load(sa->runnable_avg_period, + periods + 1); + + /* Efficiently calculate \sum (1..n_period) 1024*y^i */ + runnable_contrib = __compute_runnable_contrib(periods); + if (runnable) + sa->runnable_avg_sum += runnable_contrib; + sa->runnable_avg_period += runnable_contrib; + } + + /* Remainder of delta accrued against u_0` */ + if (runnable) + sa->runnable_avg_sum += delta; + sa->runnable_avg_period += delta; + + return decayed; } -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +/* Synchronize an entity's decay with its parenting cfs_rq.*/ +static inline u64 __synchronize_entity_decay(struct sched_entity *se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 decays = atomic64_read(&cfs_rq->decay_counter); + + decays -= se->avg.decay_count; + if (!decays) + return 0; + + se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); + se->avg.decay_count = 0; + + return decays; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, + int force_update) +{ + struct task_group *tg = cfs_rq->tg; + long tg_contrib; + + tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; + tg_contrib -= cfs_rq->tg_load_contrib; + + if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { + atomic_long_add(tg_contrib, &tg->load_avg); + cfs_rq->tg_load_contrib += tg_contrib; + } +} + +/* + * Aggregate cfs_rq runnable averages into an equivalent task_group + * representation for computing load contributions. + */ +static inline void __update_tg_runnable_avg(struct sched_avg *sa, + struct cfs_rq *cfs_rq) +{ + struct task_group *tg = cfs_rq->tg; + long contrib; + + /* The fraction of a cpu used by this cfs_rq */ + contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, + sa->runnable_avg_period + 1); + contrib -= cfs_rq->tg_runnable_contrib; + + if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { + atomic_add(contrib, &tg->runnable_avg); + cfs_rq->tg_runnable_contrib += contrib; + } } + +static inline void __update_group_entity_contrib(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + struct task_group *tg = cfs_rq->tg; + int runnable_avg; + + u64 contrib; + + contrib = cfs_rq->tg_load_contrib * tg->shares; + se->avg.load_avg_contrib = div_u64(contrib, + atomic_long_read(&tg->load_avg) + 1); + + /* + * For group entities we need to compute a correction term in the case + * that they are consuming <1 cpu so that we would contribute the same + * load as a task of equal weight. + * + * Explicitly co-ordinating this measurement would be expensive, but + * fortunately the sum of each cpus contribution forms a usable + * lower-bound on the true value. + * + * Consider the aggregate of 2 contributions. Either they are disjoint + * (and the sum represents true value) or they are disjoint and we are + * understating by the aggregate of their overlap. + * + * Extending this to N cpus, for a given overlap, the maximum amount we + * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of + * cpus that overlap for this interval and w_i is the interval width. + * + * On a small machine; the first term is well-bounded which bounds the + * total error since w_i is a subset of the period. Whereas on a + * larger machine, while this first term can be larger, if w_i is the + * of consequential size guaranteed to see n_i*w_i quickly converge to + * our upper bound of 1-cpu. + */ + runnable_avg = atomic_read(&tg->runnable_avg); + if (runnable_avg < NICE_0_LOAD) { + se->avg.load_avg_contrib *= runnable_avg; + se->avg.load_avg_contrib >>= NICE_0_SHIFT; + } +} + +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) +{ + __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); + __update_tg_runnable_avg(&rq->avg, &rq->cfs); +} +#else /* CONFIG_FAIR_GROUP_SCHED */ +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, + int force_update) {} +static inline void __update_tg_runnable_avg(struct sched_avg *sa, + struct cfs_rq *cfs_rq) {} +static inline void __update_group_entity_contrib(struct sched_entity *se) {} +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ +static inline void __update_task_entity_contrib(struct sched_entity *se) +{ + u32 contrib; + + /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ + contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); + contrib /= (se->avg.runnable_avg_period + 1); + se->avg.load_avg_contrib = scale_load(contrib); +} + +/* Compute the current contribution to load_avg by se, return any delta */ +static long __update_entity_load_avg_contrib(struct sched_entity *se) +{ + long old_contrib = se->avg.load_avg_contrib; + + if (entity_is_task(se)) { + __update_task_entity_contrib(se); + } else { + __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); + __update_group_entity_contrib(se); + } + + return se->avg.load_avg_contrib - old_contrib; +} + +static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, + long load_contrib) +{ + if (likely(load_contrib < cfs_rq->blocked_load_avg)) + cfs_rq->blocked_load_avg -= load_contrib; + else + cfs_rq->blocked_load_avg = 0; +} + +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); + +/* Update a sched_entity's runnable average */ +static inline void update_entity_load_avg(struct sched_entity *se, + int update_cfs_rq) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + long contrib_delta; + u64 now; + + /* + * For a group entity we need to use their owned cfs_rq_clock_task() in + * case they are the parent of a throttled hierarchy. + */ + if (entity_is_task(se)) + now = cfs_rq_clock_task(cfs_rq); + else + now = cfs_rq_clock_task(group_cfs_rq(se)); + + if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) + return; + + contrib_delta = __update_entity_load_avg_contrib(se); + + if (!update_cfs_rq) + return; + + if (se->on_rq) + cfs_rq->runnable_load_avg += contrib_delta; + else + subtract_blocked_load_contrib(cfs_rq, -contrib_delta); +} + +/* + * Decay the load contributed by all blocked children and account this so that + * their contribution may appropriately discounted when they wake up. + */ +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) +{ + u64 now = cfs_rq_clock_task(cfs_rq) >> 20; + u64 decays; + + decays = now - cfs_rq->last_decay; + if (!decays && !force_update) + return; + + if (atomic_long_read(&cfs_rq->removed_load)) { + unsigned long removed_load; + removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0); + subtract_blocked_load_contrib(cfs_rq, removed_load); + } + + if (decays) { + cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, + decays); + atomic64_add(decays, &cfs_rq->decay_counter); + cfs_rq->last_decay = now; + } + + __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); +} + +/* Add the load generated by se into cfs_rq's child load-average */ +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int wakeup) +{ + /* + * We track migrations using entity decay_count <= 0, on a wake-up + * migration we use a negative decay count to track the remote decays + * accumulated while sleeping. + * + * Newly forked tasks are enqueued with se->avg.decay_count == 0, they + * are seen by enqueue_entity_load_avg() as a migration with an already + * constructed load_avg_contrib. + */ + if (unlikely(se->avg.decay_count <= 0)) { + se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq)); + if (se->avg.decay_count) { + /* + * In a wake-up migration we have to approximate the + * time sleeping. This is because we can't synchronize + * clock_task between the two cpus, and it is not + * guaranteed to be read-safe. Instead, we can + * approximate this using our carried decays, which are + * explicitly atomically readable. + */ + se->avg.last_runnable_update -= (-se->avg.decay_count) + << 20; + update_entity_load_avg(se, 0); + /* Indicate that we're now synchronized and on-rq */ + se->avg.decay_count = 0; + } + wakeup = 0; + } else { + __synchronize_entity_decay(se); + } + + /* migrated tasks did not contribute to our blocked load */ + if (wakeup) { + subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); + update_entity_load_avg(se, 0); + } + + cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; + /* we force update consideration on load-balancer moves */ + update_cfs_rq_blocked_load(cfs_rq, !wakeup); +} + +/* + * Remove se's load from this cfs_rq child load-average, if the entity is + * transitioning to a blocked state we track its projected decay using + * blocked_load_avg. + */ +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int sleep) +{ + update_entity_load_avg(se, 1); + /* we force update consideration on load-balancer moves */ + update_cfs_rq_blocked_load(cfs_rq, !sleep); + + cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; + if (sleep) { + cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + } /* migrations, e.g. sleep=0 leave decay_count == 0 */ +} + +/* + * Update the rq's load with the elapsed running time before entering + * idle. if the last scheduled task is not a CFS task, idle_enter will + * be the only way to update the runnable statistic. + */ +void idle_enter_fair(struct rq *this_rq) +{ + update_rq_runnable_avg(this_rq, 1); +} + +/* + * Update the rq's load with the elapsed idle time before a task is + * scheduled. if the newly scheduled task is not a CFS task, idle_exit will + * be the only way to update the runnable statistic. + */ +void idle_exit_fair(struct rq *this_rq) +{ + update_rq_runnable_avg(this_rq, 0); +} + +static int idle_balance(struct rq *this_rq); + +#else /* CONFIG_SMP */ + +static inline void update_entity_load_avg(struct sched_entity *se, + int update_cfs_rq) {} +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int wakeup) {} +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int sleep) {} +static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, + int force_update) {} + +static inline int idle_balance(struct rq *rq) +{ + return 0; +} + +#endif /* CONFIG_SMP */ + static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS @@ -995,7 +2659,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) tsk = task_of(se); if (se->statistics.sleep_start) { - u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; if ((s64)delta < 0) delta = 0; @@ -1003,6 +2667,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(delta > se->statistics.sleep_max)) se->statistics.sleep_max = delta; + se->statistics.sleep_start = 0; se->statistics.sum_sleep_runtime += delta; if (tsk) { @@ -1011,7 +2676,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) } } if (se->statistics.block_start) { - u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; if ((s64)delta < 0) delta = 0; @@ -1019,6 +2684,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(delta > se->statistics.block_max)) se->statistics.block_max = delta; + se->statistics.block_start = 0; se->statistics.sum_sleep_runtime += delta; if (tsk) { @@ -1088,9 +2754,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) } /* ensure we never gain time by being placed backwards. */ - vruntime = max_vruntime(se->vruntime, vruntime); - - se->vruntime = vruntime; + se->vruntime = max_vruntime(se->vruntime, vruntime); } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); @@ -1100,7 +2764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Update the normalized vruntime before updating min_vruntime - * through callig update_curr(). + * through calling update_curr(). */ if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) se->vruntime += cfs_rq->min_vruntime; @@ -1109,7 +2773,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - update_cfs_load(cfs_rq, 0); + enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -1134,10 +2798,10 @@ static void __clear_buddies_last(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->last == se) - cfs_rq->last = NULL; - else + if (cfs_rq->last != se) break; + + cfs_rq->last = NULL; } } @@ -1145,10 +2809,10 @@ static void __clear_buddies_next(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->next == se) - cfs_rq->next = NULL; - else + if (cfs_rq->next != se) break; + + cfs_rq->next = NULL; } } @@ -1156,10 +2820,10 @@ static void __clear_buddies_skip(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->skip == se) - cfs_rq->skip = NULL; - else + if (cfs_rq->skip != se) break; + + cfs_rq->skip = NULL; } } @@ -1175,7 +2839,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) __clear_buddies_skip(se); } -static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); +static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -1184,6 +2848,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -1192,9 +2857,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) struct task_struct *tsk = task_of(se); if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_of(cfs_rq)->clock; + se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_of(cfs_rq)->clock; + se->statistics.block_start = rq_clock(rq_of(cfs_rq)); } #endif } @@ -1204,7 +2869,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; - update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); /* @@ -1302,17 +2966,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); * 3) pick the "last" process, for cache locality * 4) do not run the "skip" process, if something else is available */ -static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) +static struct sched_entity * +pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct sched_entity *se = __pick_first_entity(cfs_rq); - struct sched_entity *left = se; + struct sched_entity *left = __pick_first_entity(cfs_rq); + struct sched_entity *se; + + /* + * If curr is set we have to see if its left of the leftmost entity + * still in the tree, provided there was anything in the tree at all. + */ + if (!left || (curr && entity_before(curr, left))) + left = curr; + + se = left; /* ideally we run the leftmost entity */ /* * Avoid running the skip buddy, if running something else can * be done without getting too unfair. */ if (cfs_rq->skip == se) { - struct sched_entity *second = __pick_next_entity(se); + struct sched_entity *second; + + if (se == curr) { + second = __pick_first_entity(cfs_rq); + } else { + second = __pick_next_entity(se); + if (!second || (curr && entity_before(curr, second))) + second = curr; + } + if (second && wakeup_preempt_entity(second, left) < 1) se = second; } @@ -1334,7 +3017,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) return se; } -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { @@ -1353,6 +3036,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); + /* in !on_rq case, update occurred at dequeue */ + update_entity_load_avg(prev, 1); } cfs_rq->curr = NULL; } @@ -1366,9 +3051,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) update_curr(cfs_rq); /* - * Update share accounting for long-running entities. + * Ensure that runnable average is periodically updated. */ - update_entity_shares_tick(cfs_rq); + update_entity_load_avg(curr, 1); + update_cfs_rq_blocked_load(cfs_rq, 1); + update_cfs_shares(cfs_rq); #ifdef CONFIG_SCHED_HRTICK /* @@ -1399,20 +3086,21 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) #ifdef CONFIG_CFS_BANDWIDTH #ifdef HAVE_JUMP_LABEL -static struct jump_label_key __cfs_bandwidth_used; +static struct static_key __cfs_bandwidth_used; static inline bool cfs_bandwidth_used(void) { - return static_branch(&__cfs_bandwidth_used); + return static_key_false(&__cfs_bandwidth_used); +} + +void cfs_bandwidth_usage_inc(void) +{ + static_key_slow_inc(&__cfs_bandwidth_used); } -void account_cfs_bandwidth_used(int enabled, int was_enabled) +void cfs_bandwidth_usage_dec(void) { - /* only need to count groups transitioning between enabled/!enabled */ - if (enabled && !was_enabled) - jump_label_inc(&__cfs_bandwidth_used); - else if (!enabled && was_enabled) - jump_label_dec(&__cfs_bandwidth_used); + static_key_slow_dec(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) @@ -1420,7 +3108,8 @@ static bool cfs_bandwidth_used(void) return true; } -void account_cfs_bandwidth_used(int enabled, int was_enabled) {} +void cfs_bandwidth_usage_inc(void) {} +void cfs_bandwidth_usage_dec(void) {} #endif /* HAVE_JUMP_LABEL */ /* @@ -1461,6 +3150,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return &tg->cfs_bandwidth; } +/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ + if (unlikely(cfs_rq->throttle_count)) + return cfs_rq->throttled_clock_task; + + return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; +} + /* returns 0 on failure to allocate runtime */ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -1483,7 +3181,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) */ if (!cfs_b->timer_active) { __refill_cfs_bandwidth_runtime(cfs_b); - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, false); } if (cfs_b->runtime > 0) { @@ -1514,10 +3212,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct rq *rq = rq_of(cfs_rq); /* if the deadline is ahead of our clock, nothing to do */ - if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) + if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) return; if (cfs_rq->runtime_remaining < 0) @@ -1529,10 +3226,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) * has not truly expired. * * Fortunately we can check determine whether this the case by checking - * whether the global deadline has advanced. + * whether the global deadline has advanced. It is valid to compare + * cfs_b->runtime_expires without any locks since we only care about + * exact equality, so a partial write will still work. */ - if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { + if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; } else { @@ -1541,8 +3240,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) } } -static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) +static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; @@ -1559,8 +3257,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, resched_task(rq_of(cfs_rq)->curr); } -static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) +static __always_inline +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) return; @@ -1605,14 +3303,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; #ifdef CONFIG_SMP if (!cfs_rq->throttle_count) { - u64 delta = rq->clock_task - cfs_rq->load_stamp; - - /* leaving throttled state, advance shares averaging windows */ - cfs_rq->load_stamp += delta; - cfs_rq->load_last += delta; - - /* update entity weight now that we are on_rq again */ - update_cfs_shares(cfs_rq); + /* adjust cfs_rq_clock_task() */ + cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - + cfs_rq->throttled_clock_task; } #endif @@ -1624,9 +3317,9 @@ static int tg_throttle_down(struct task_group *tg, void *data) struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - /* group is entering throttled state, record last load */ + /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) - update_cfs_load(cfs_rq, 0); + cfs_rq->throttled_clock_task = rq_clock_task(rq); cfs_rq->throttle_count++; return 0; @@ -1641,7 +3334,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - /* account load preceding throttle */ + /* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); @@ -1662,12 +3355,14 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) } if (!se) - rq->nr_running -= task_delta; + sub_nr_running(rq, task_delta); cfs_rq->throttled = 1; - cfs_rq->throttled_timestamp = rq->clock; + cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + if (!cfs_b->timer_active) + __start_cfs_bandwidth(cfs_b, false); raw_spin_unlock(&cfs_b->lock); } @@ -1679,16 +3374,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) int enqueue = 1; long task_delta; - se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + se = cfs_rq->tg->se[cpu_of(rq)]; cfs_rq->throttled = 0; + + update_rq_clock(rq); + raw_spin_lock(&cfs_b->lock); - cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; + cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); - cfs_rq->throttled_timestamp = 0; - update_rq_clock(rq); /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); @@ -1710,7 +3406,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) } if (!se) - rq->nr_running += task_delta; + add_nr_running(rq, task_delta); /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) @@ -1764,28 +3460,35 @@ next: static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { u64 runtime, runtime_expires; - int idle = 1, throttled; + int throttled; - raw_spin_lock(&cfs_b->lock); /* no need to continue the timer with no bandwidth constraint */ if (cfs_b->quota == RUNTIME_INF) - goto out_unlock; + goto out_deactivate; throttled = !list_empty(&cfs_b->throttled_cfs_rq); - /* idle depends on !throttled (for the case of a large deficit) */ - idle = cfs_b->idle && !throttled; cfs_b->nr_periods += overrun; - /* if we're going inactive then everything else can be deferred */ - if (idle) - goto out_unlock; + /* + * idle depends on !throttled (for the case of a large deficit), and if + * we're going inactive then everything else can be deferred + */ + if (cfs_b->idle && !throttled) + goto out_deactivate; + + /* + * if we have relooped after returning idle once, we need to update our + * status as actually running, so that other cpus doing + * __start_cfs_bandwidth will stop trying to cancel us. + */ + cfs_b->timer_active = 1; __refill_cfs_bandwidth_runtime(cfs_b); if (!throttled) { /* mark as potentially idle for the upcoming period */ cfs_b->idle = 1; - goto out_unlock; + return 0; } /* account preceding periods in which throttling occurred */ @@ -1825,12 +3528,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) * timer to remain active while there are any throttled entities.) */ cfs_b->idle = 0; -out_unlock: - if (idle) - cfs_b->timer_active = 0; - raw_spin_unlock(&cfs_b->lock); - return idle; + return 0; + +out_deactivate: + cfs_b->timer_active = 0; + return 1; } /* a cfs_rq won't donate quota below this amount */ @@ -1840,7 +3543,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; /* how long we wait to gather additional slack before distributing */ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; -/* are we near the end of the current quota period? */ +/* + * Are we near the end of the current quota period? + * + * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the + * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of + * migrate_hrtimers, base is never cleared, so we are fine. + */ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) { struct hrtimer *refresh_timer = &cfs_b->period_timer; @@ -1916,10 +3625,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) u64 expires; /* confirm we're still not at a refresh boundary */ - if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) + raw_spin_lock(&cfs_b->lock); + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { + raw_spin_unlock(&cfs_b->lock); return; + } - raw_spin_lock(&cfs_b->lock); if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { runtime = cfs_b->runtime; cfs_b->runtime = 0; @@ -1963,28 +3674,25 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) } /* conditionally throttle active cfs_rq's from put_prev_entity() */ -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { if (!cfs_bandwidth_used()) - return; + return false; if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) - return; + return false; /* * it's possible for a throttled entity to be forced into a running * state (e.g. set_curr_task), in this case we're finished. */ if (cfs_rq_throttled(cfs_rq)) - return; + return true; throttle_cfs_rq(cfs_rq); + return true; } -static inline u64 default_cfs_period(void); -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); -static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); - static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = @@ -2002,6 +3710,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) int overrun; int idle = 0; + raw_spin_lock(&cfs_b->lock); for (;;) { now = hrtimer_cb_get_time(timer); overrun = hrtimer_forward(timer, now, cfs_b->period); @@ -2011,6 +3720,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) idle = do_sched_cfs_period_timer(cfs_b, overrun); } + raw_spin_unlock(&cfs_b->lock); return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; } @@ -2036,7 +3746,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) } /* requires cfs_b->lock, may release to reprogram timer */ -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) { /* * The timer may be active because we're trying to set a new bandwidth @@ -2044,14 +3754,14 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) * (timer_active==0 becomes visible before the hrtimer call-back * terminates). In either case we ensure that it's re-programmed */ - while (unlikely(hrtimer_active(&cfs_b->period_timer))) { + while (unlikely(hrtimer_active(&cfs_b->period_timer)) && + hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { + /* bounce the lock to allow do_sched_cfs_period_timer to run */ raw_spin_unlock(&cfs_b->lock); - /* ensure cfs_b->lock is available while we wait */ - hrtimer_cancel(&cfs_b->period_timer); - + cpu_relax(); raw_spin_lock(&cfs_b->lock); /* if someone else restarted the timer then we're done */ - if (cfs_b->timer_active) + if (!force && cfs_b->timer_active) return; } @@ -2065,13 +3775,11 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->slack_timer); } -void unthrottle_offline_cfs_rqs(struct rq *rq) +static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { struct cfs_rq *cfs_rq; for_each_leaf_cfs_rq(rq, cfs_rq) { - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - if (!cfs_rq->runtime_enabled) continue; @@ -2079,18 +3787,22 @@ void unthrottle_offline_cfs_rqs(struct rq *rq) * clock_task is not advancing so we just need to make sure * there's some valid quota amount */ - cfs_rq->runtime_remaining = cfs_b->quota; + cfs_rq->runtime_remaining = 1; if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } } #else /* CONFIG_CFS_BANDWIDTH */ -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) {} -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ + return rq_clock_task(rq_of(cfs_rq)); +} + +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} -static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { @@ -2119,7 +3831,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return NULL; } static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} -void unthrottle_offline_cfs_rqs(struct rq *rq) {} +static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} #endif /* CONFIG_CFS_BANDWIDTH */ @@ -2220,12 +3932,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); + update_entity_load_avg(se, 1); } - if (!se) - inc_nr_running(rq); + if (!se) { + update_rq_runnable_avg(rq, rq->nr_running); + add_nr_running(rq, 1); + } hrtick_update(rq); } @@ -2279,12 +3993,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); + update_entity_load_avg(se, 1); } - if (!se) - dec_nr_running(rq); + if (!se) { + sub_nr_running(rq, 1); + update_rq_runnable_avg(rq, 1); + } hrtick_update(rq); } @@ -2292,7 +4008,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* Used instead of source_load when we know the type == 0 */ static unsigned long weighted_cpuload(const int cpu) { - return cpu_rq(cpu)->load.weight; + return cpu_rq(cpu)->cfs.runnable_load_avg; } /* @@ -2328,22 +4044,40 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static unsigned long power_of(int cpu) +static unsigned long capacity_of(int cpu) { - return cpu_rq(cpu)->cpu_power; + return cpu_rq(cpu)->cpu_capacity; } static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = ACCESS_ONCE(rq->nr_running); + unsigned long load_avg = rq->cfs.runnable_load_avg; if (nr_running) - return rq->load.weight / nr_running; + return load_avg / nr_running; return 0; } +static void record_wakee(struct task_struct *p) +{ + /* + * Rough decay (wiping) for cost saving, don't worry + * about the boundary, really active task won't care + * about the loss. + */ + if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { + current->wakee_flips >>= 1; + current->wakee_flip_decay_ts = jiffies; + } + + if (current->last_wakee != p) { + current->last_wakee = p; + current->wakee_flips++; + } +} static void task_waking_fair(struct task_struct *p) { @@ -2364,6 +4098,7 @@ static void task_waking_fair(struct task_struct *p) #endif se->vruntime -= min_vruntime; + record_wakee(p); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -2474,14 +4209,35 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) } #else -static inline unsigned long effective_load(struct task_group *tg, int cpu, - unsigned long wl, unsigned long wg) +static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { return wl; } #endif +static int wake_wide(struct task_struct *p) +{ + int factor = this_cpu_read(sd_llc_size); + + /* + * Yeah, it's the switching-frequency, could means many wakee or + * rapidly switch, use factor here will just help to automatically + * adjust the loose-degree, so bigger node will lead to more pull. + */ + if (p->wakee_flips > factor) { + /* + * wakee is somewhat hot, it needs certain amount of cpu + * resource, so if waker is far more hot, prefer to leave + * it alone. + */ + if (current->wakee_flips > (factor * p->wakee_flips)) + return 1; + } + + return 0; +} + static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { s64 this_load, load; @@ -2491,6 +4247,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) unsigned long weight; int balanced; + /* + * If we wake multiple tasks be careful to not bounce + * ourselves around too much. + */ + if (wake_wide(p)) + return 0; + idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); @@ -2526,12 +4289,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) s64 this_eff_load, prev_eff_load; this_eff_load = 100; - this_eff_load *= power_of(prev_cpu); + this_eff_load *= capacity_of(prev_cpu); this_eff_load *= this_load + effective_load(tg, this_cpu, weight, weight); prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= power_of(this_cpu); + prev_eff_load *= capacity_of(this_cpu); prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); balanced = this_eff_load <= prev_eff_load; @@ -2571,12 +4334,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int load_idx) + int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; + int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; + if (sd_flag & SD_BALANCE_WAKE) + load_idx = sd->wake_idx; + do { unsigned long load, avg_load; int local_group; @@ -2603,8 +4370,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, avg_load += load; } - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; + /* Adjust by relative CPU capacity of the group */ + avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; if (local_group) { this_load = avg_load; @@ -2647,31 +4414,22 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ static int select_idle_sibling(struct task_struct *p, int target) { - int cpu = smp_processor_id(); - int prev_cpu = task_cpu(p); struct sched_domain *sd; struct sched_group *sg; - int i; + int i = task_cpu(p); - /* - * If the task is going to be woken-up on this cpu and if it is - * already idle, then it is the right target. - */ - if (target == cpu && idle_cpu(cpu)) - return cpu; + if (idle_cpu(target)) + return target; /* - * If the task is going to be woken-up on the cpu where it previously - * ran and if it is currently idle, then it the right target. + * If the prevous cpu is cache affine and idle, don't be stupid. */ - if (target == prev_cpu && idle_cpu(prev_cpu)) - return prev_cpu; + if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) + return i; /* * Otherwise, iterate the domains and find an elegible idle cpu. */ - rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, target)); for_each_lower_domain(sd) { sg = sd->groups; @@ -2681,7 +4439,7 @@ static int select_idle_sibling(struct task_struct *p, int target) goto next; for_each_cpu(i, sched_group_cpus(sg)) { - if (!idle_cpu(i)) + if (i == target || !idle_cpu(i)) goto next; } @@ -2693,34 +4451,31 @@ next: } while (sg != sd->groups); } done: - rcu_read_unlock(); - return target; } /* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. + * select_task_rq_fair: Select target runqueue for the waking task in domains + * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, + * SD_BALANCE_FORK, or SD_BALANCE_EXEC. * - * Balance, ie. select the least loaded group. + * Balances load by selecting the idlest cpu in the idlest group, or under + * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. * - * Returns the target CPU number, or the same CPU if no balancing is needed. + * Returns the target cpu number. * * preempt must be disabled. */ static int -select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); - int prev_cpu = task_cpu(p); int new_cpu = cpu; int want_affine = 0; - int want_sd = 1; int sync = wake_flags & WF_SYNC; - if (p->rt.nr_cpus_allowed == 1) + if (p->nr_cpus_allowed == 1) return prev_cpu; if (sd_flag & SD_BALANCE_WAKE) { @@ -2735,59 +4490,28 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) continue; /* - * If power savings logic is enabled for a domain, see if we - * are not overloaded, if so, don't balance wider. - */ - if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { - unsigned long power = 0; - unsigned long nr_running = 0; - unsigned long capacity; - int i; - - for_each_cpu(i, sched_domain_span(tmp)) { - power += power_of(i); - nr_running += cpu_rq(i)->cfs.nr_running; - } - - capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); - - if (tmp->flags & SD_POWERSAVINGS_BALANCE) - nr_running /= 2; - - if (nr_running < capacity) - want_sd = 0; - } - - /* * If both cpu and prev_cpu are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { affine_sd = tmp; - want_affine = 0; - } - - if (!want_sd && !want_affine) break; + } - if (!(tmp->flags & sd_flag)) - continue; - - if (want_sd) + if (tmp->flags & sd_flag) sd = tmp; } - if (affine_sd) { - if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) - prev_cpu = cpu; + if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + prev_cpu = cpu; + if (sd_flag & SD_BALANCE_WAKE) { new_cpu = select_idle_sibling(p, prev_cpu); goto unlock; } while (sd) { - int load_idx = sd->forkexec_idx; struct sched_group *group; int weight; @@ -2796,10 +4520,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) continue; } - if (sd_flag & SD_BALANCE_WAKE) - load_idx = sd->wake_idx; - - group = find_idlest_group(sd, p, cpu, load_idx); + group = find_idlest_group(sd, p, cpu, sd_flag); if (!group) { sd = sd->child; continue; @@ -2829,6 +4550,34 @@ unlock: return new_cpu; } + +/* + * Called immediately before a task is migrated to a new cpu; task_cpu(p) and + * cfs_rq_of(p) references at time of call are still valid and identify the + * previous cpu. However, the caller only guarantees p->pi_lock is held; no + * other assumptions, including the state of rq->lock, should be made. + */ +static void +migrate_task_rq_fair(struct task_struct *p, int next_cpu) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * Load tracking: accumulate removed load so that it can be processed + * when we next update owning cfs_rq under rq->lock. Tasks contribute + * to blocked load iff they have a positive decay-count. It can never + * be negative here since on-rq tasks have decay-count == 0. + */ + if (se->avg.decay_count) { + se->avg.decay_count = -__synchronize_entity_decay(se); + atomic_long_add(se->avg.load_avg_contrib, + &cfs_rq->removed_load); + } + + /* We have migrated, no longer consider this task hot */ + se->exec_start = 0; +} #endif /* CONFIG_SMP */ static unsigned long @@ -2920,7 +4669,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* - * This is possible from callers such as pull_task(), in which we + * This is possible from callers such as move_task(), in which we * unconditionally check_prempt_curr() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. @@ -2955,7 +4704,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ * Batch and idle tasks do not preempt non-idle tasks (their preemption * is driven by the tick): */ - if (unlikely(p->policy != SCHED_NORMAL)) + if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) return; find_matching_se(&se, &pse); @@ -2991,26 +4740,124 @@ preempt: set_last_buddy(se); } -static struct task_struct *pick_next_task_fair(struct rq *rq) +static struct task_struct * +pick_next_task_fair(struct rq *rq, struct task_struct *prev) { - struct task_struct *p; struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; + struct task_struct *p; + int new_tasks; +again: +#ifdef CONFIG_FAIR_GROUP_SCHED if (!cfs_rq->nr_running) - return NULL; + goto idle; + + if (prev->sched_class != &fair_sched_class) + goto simple; + + /* + * Because of the set_next_buddy() in dequeue_task_fair() it is rather + * likely that a next task is from the same cgroup as the current. + * + * Therefore attempt to avoid putting and setting the entire cgroup + * hierarchy, only change the part that actually changes. + */ + + do { + struct sched_entity *curr = cfs_rq->curr; + + /* + * Since we got here without doing put_prev_entity() we also + * have to consider cfs_rq->curr. If it is still a runnable + * entity, update_curr() will update its vruntime, otherwise + * forget we've ever seen it. + */ + if (curr && curr->on_rq) + update_curr(cfs_rq); + else + curr = NULL; + + /* + * This call to check_cfs_rq_runtime() will do the throttle and + * dequeue its entity in the parent(s). Therefore the 'simple' + * nr_running test will indeed be correct. + */ + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto simple; + + se = pick_next_entity(cfs_rq, curr); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + p = task_of(se); + + /* + * Since we haven't yet done put_prev_entity and if the selected task + * is a different task than we started out with, try and touch the + * least amount of cfs_rqs. + */ + if (prev != p) { + struct sched_entity *pse = &prev->se; + + while (!(cfs_rq = is_same_group(se, pse))) { + int se_depth = se->depth; + int pse_depth = pse->depth; + + if (se_depth <= pse_depth) { + put_prev_entity(cfs_rq_of(pse), pse); + pse = parent_entity(pse); + } + if (se_depth >= pse_depth) { + set_next_entity(cfs_rq_of(se), se); + se = parent_entity(se); + } + } + + put_prev_entity(cfs_rq, pse); + set_next_entity(cfs_rq, se); + } + + if (hrtick_enabled(rq)) + hrtick_start_fair(rq, p); + + return p; +simple: + cfs_rq = &rq->cfs; +#endif + + if (!cfs_rq->nr_running) + goto idle; + + put_prev_task(rq, prev); do { - se = pick_next_entity(cfs_rq); + se = pick_next_entity(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); p = task_of(se); + if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); return p; + +idle: + new_tasks = idle_balance(rq); + /* + * Because idle_balance() releases (and re-acquires) rq->lock, it is + * possible for any higher priority task to appear. In that case we + * must re-start the pick_next_entity() loop. + */ + if (new_tasks < 0) + return RETRY_TASK; + + if (new_tasks > 0) + goto again; + + return NULL; } /* @@ -3081,27 +4928,174 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp #ifdef CONFIG_SMP /************************************************** - * Fair scheduling class load-balancing methods: - */ + * Fair scheduling class load-balancing methods. + * + * BASICS + * + * The purpose of load-balancing is to achieve the same basic fairness the + * per-cpu scheduler provides, namely provide a proportional amount of compute + * time to each task. This is expressed in the following equation: + * + * W_i,n/P_i == W_j,n/P_j for all i,j (1) + * + * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight + * W_i,0 is defined as: + * + * W_i,0 = \Sum_j w_i,j (2) + * + * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight + * is derived from the nice value as per prio_to_weight[]. + * + * The weight average is an exponential decay average of the instantaneous + * weight: + * + * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) + * + * C_i is the compute capacity of cpu i, typically it is the + * fraction of 'recent' time available for SCHED_OTHER task execution. But it + * can also include other factors [XXX]. + * + * To achieve this balance we define a measure of imbalance which follows + * directly from (1): + * + * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4) + * + * We them move tasks around to minimize the imbalance. In the continuous + * function space it is obvious this converges, in the discrete case we get + * a few fun cases generally called infeasible weight scenarios. + * + * [XXX expand on: + * - infeasible weights; + * - local vs global optima in the discrete case. ] + * + * + * SCHED DOMAINS + * + * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) + * for all i,j solution, we create a tree of cpus that follows the hardware + * topology where each level pairs two lower groups (or better). This results + * in O(log n) layers. Furthermore we reduce the number of cpus going up the + * tree to only the first of the previous level and we decrease the frequency + * of load-balance at each level inv. proportional to the number of cpus in + * the groups. + * + * This yields: + * + * log_2 n 1 n + * \Sum { --- * --- * 2^i } = O(n) (5) + * i = 0 2^i 2^i + * `- size of each group + * | | `- number of cpus doing load-balance + * | `- freq + * `- sum over all levels + * + * Coupled with a limit on how many tasks we can migrate every balance pass, + * this makes (5) the runtime complexity of the balancer. + * + * An important property here is that each CPU is still (indirectly) connected + * to every other cpu in at most O(log n) steps: + * + * The adjacency matrix of the resulting graph is given by: + * + * log_2 n + * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) + * k = 0 + * + * And you'll find that: + * + * A^(log_2 n)_i,j != 0 for all i,j (7) + * + * Showing there's indeed a path between every cpu in at most O(log n) steps. + * The task movement gives a factor of O(m), giving a convergence complexity + * of: + * + * O(nm log n), n := nr_cpus, m := nr_tasks (8) + * + * + * WORK CONSERVING + * + * In order to avoid CPUs going idle while there's still work to do, new idle + * balancing is more aggressive and has the newly idle cpu iterate up the domain + * tree itself instead of relying on other CPUs to bring it work. + * + * This adds some complexity to both (5) and (8) but it reduces the total idle + * time. + * + * [XXX more?] + * + * + * CGROUPS + * + * Cgroups make a horror show out of (2), instead of a simple sum we get: + * + * s_k,i + * W_i,0 = \Sum_j \Prod_k w_k * ----- (9) + * S_k + * + * Where + * + * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) + * + * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. + * + * The big problem is S_k, its a global sum needed to compute a local (W_i) + * property. + * + * [XXX write more on how we solve this.. _after_ merging pjt's patches that + * rewrite all of this once again.] + */ + +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + +enum fbq_type { regular, remote, all }; + +#define LBF_ALL_PINNED 0x01 +#define LBF_NEED_BREAK 0x02 +#define LBF_DST_PINNED 0x04 +#define LBF_SOME_PINNED 0x08 + +struct lb_env { + struct sched_domain *sd; + + struct rq *src_rq; + int src_cpu; + + int dst_cpu; + struct rq *dst_rq; + + struct cpumask *dst_grpmask; + int new_dst_cpu; + enum cpu_idle_type idle; + long imbalance; + /* The set of CPUs under consideration for load-balancing */ + struct cpumask *cpus; + + unsigned int flags; + + unsigned int loop; + unsigned int loop_break; + unsigned int loop_max; + + enum fbq_type fbq_type; +}; /* - * pull_task - move a task from a remote runqueue to the local runqueue. + * move_task - move a task from one runqueue to another runqueue. * Both runqueues must be locked. */ -static void pull_task(struct rq *src_rq, struct task_struct *p, - struct rq *this_rq, int this_cpu) +static void move_task(struct task_struct *p, struct lb_env *env) { - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - check_preempt_curr(this_rq, p, 0); + deactivate_task(env->src_rq, p, 0); + set_task_cpu(p, env->dst_cpu); + activate_task(env->dst_rq, p, 0); + check_preempt_curr(env->dst_rq, p, 0); } /* * Is this task likely cache-hot: */ static int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +task_hot(struct task_struct *p, u64 now) { s64 delta; @@ -3129,61 +5123,182 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost; } -#define LBF_ALL_PINNED 0x01 -#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ -#define LBF_HAD_BREAK 0x04 -#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ -#define LBF_ABORT 0x10 +#ifdef CONFIG_NUMA_BALANCING +/* Returns true if the destination node has incurred more faults */ +static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) +{ + struct numa_group *numa_group = rcu_dereference(p->numa_group); + int src_nid, dst_nid; + + if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || + !(env->sd->flags & SD_NUMA)) { + return false; + } + + src_nid = cpu_to_node(env->src_cpu); + dst_nid = cpu_to_node(env->dst_cpu); + + if (src_nid == dst_nid) + return false; + + if (numa_group) { + /* Task is already in the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return false; + + /* Task is moving into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return true; + + return group_faults(p, dst_nid) > group_faults(p, src_nid); + } + + /* Encourage migration to the preferred node. */ + if (dst_nid == p->numa_preferred_nid) + return true; + + return task_faults(p, dst_nid) > task_faults(p, src_nid); +} + + +static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) +{ + struct numa_group *numa_group = rcu_dereference(p->numa_group); + int src_nid, dst_nid; + + if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) + return false; + + if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) + return false; + + src_nid = cpu_to_node(env->src_cpu); + dst_nid = cpu_to_node(env->dst_cpu); + + if (src_nid == dst_nid) + return false; + + if (numa_group) { + /* Task is moving within/into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return false; + + /* Task is moving out of the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return true; + + return group_faults(p, dst_nid) < group_faults(p, src_nid); + } + + /* Migrating away from the preferred node is always bad. */ + if (src_nid == p->numa_preferred_nid) + return true; + + return task_faults(p, dst_nid) < task_faults(p, src_nid); +} + +#else +static inline bool migrate_improves_locality(struct task_struct *p, + struct lb_env *env) +{ + return false; +} + +static inline bool migrate_degrades_locality(struct task_struct *p, + struct lb_env *env) +{ + return false; +} +#endif /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static -int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot = 0; /* * We do not migrate tasks that are: - * 1) running (obviously), or + * 1) throttled_lb_pair, or * 2) cannot be migrated to this CPU due to cpus_allowed, or - * 3) are cache-hot on their current CPU. + * 3) running (obviously), or + * 4) are cache-hot on their current CPU. */ - if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + return 0; + + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + int cpu; + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + + env->flags |= LBF_SOME_PINNED; + + /* + * Remember if this task can be migrated to any other cpu in + * our sched_group. We may want to revisit it if we couldn't + * meet load balance goals by pulling other tasks on src_cpu. + * + * Also avoid computing new_dst_cpu if we have already computed + * one in current iteration. + */ + if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) + return 0; + + /* Prevent to re-select dst_cpu via env's cpus */ + for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { + env->flags |= LBF_DST_PINNED; + env->new_dst_cpu = cpu; + break; + } + } + return 0; } - *lb_flags &= ~LBF_ALL_PINNED; - if (task_running(rq, p)) { + /* Record that we found atleast one task that could run on dst_cpu */ + env->flags &= ~LBF_ALL_PINNED; + + if (task_running(env->src_rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; } /* * Aggressive migration if: - * 1) task is cache cold, or - * 2) too many balance attempts have failed. + * 1) destination numa is preferred + * 2) task is cache cold, or + * 3) too many balance attempts have failed. */ + tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); + if (!tsk_cache_hot) + tsk_cache_hot = migrate_degrades_locality(p, env); - tsk_cache_hot = task_hot(p, rq->clock_task, sd); - if (!tsk_cache_hot || - sd->nr_balance_failed > sd->cache_nice_tries) { + if (migrate_improves_locality(p, env)) { #ifdef CONFIG_SCHEDSTATS if (tsk_cache_hot) { - schedstat_inc(sd, lb_hot_gained[idle]); + schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } #endif return 1; } - if (tsk_cache_hot) { - schedstat_inc(p, se.statistics.nr_failed_migrations_hot); - return 0; + if (!tsk_cache_hot || + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { + + if (tsk_cache_hot) { + schedstat_inc(env->sd, lb_hot_gained[env->idle]); + schedstat_inc(p, se.statistics.nr_forced_migrations); + } + + return 1; } - return 1; + + schedstat_inc(p, se.statistics.nr_failed_migrations_hot); + return 0; } /* @@ -3193,65 +5308,74 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * * Called with both runqueues locked. */ -static int -move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) +static int move_one_task(struct lb_env *env) { struct task_struct *p, *n; - struct cfs_rq *cfs_rq; - int pinned = 0; - for_each_leaf_cfs_rq(busiest, cfs_rq) { - list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { - if (throttled_lb_pair(task_group(p), - busiest->cpu, this_cpu)) - break; - - if (!can_migrate_task(p, busiest, this_cpu, - sd, idle, &pinned)) - continue; + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + if (!can_migrate_task(p, env)) + continue; - pull_task(busiest, p, this_rq, this_cpu); - /* - * Right now, this is only the second place pull_task() - * is called, so we can safely collect pull_task() - * stats here rather than inside pull_task(). - */ - schedstat_inc(sd, lb_gained[idle]); - return 1; - } + move_task(p, env); + /* + * Right now, this is only the second place move_task() + * is called, so we can safely collect move_task() + * stats here rather than inside move_task(). + */ + schedstat_inc(env->sd, lb_gained[env->idle]); + return 1; } - return 0; } -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *lb_flags, - struct cfs_rq *busiest_cfs_rq) +static const unsigned int sched_nr_migrate_break = 32; + +/* + * move_tasks tries to move up to imbalance weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int move_tasks(struct lb_env *env) { - int loops = 0, pulled = 0; - long rem_load_move = max_load_move; - struct task_struct *p, *n; + struct list_head *tasks = &env->src_rq->cfs_tasks; + struct task_struct *p; + unsigned long load; + int pulled = 0; - if (max_load_move == 0) - goto out; + if (env->imbalance <= 0) + return 0; + + while (!list_empty(tasks)) { + p = list_first_entry(tasks, struct task_struct, se.group_node); - list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { - if (loops++ > sysctl_sched_nr_migrate) { - *lb_flags |= LBF_NEED_BREAK; + env->loop++; + /* We've more or less seen every task there is, call it quits */ + if (env->loop > env->loop_max) + break; + + /* take a breather every nr_migrate tasks */ + if (env->loop > env->loop_break) { + env->loop_break += sched_nr_migrate_break; + env->flags |= LBF_NEED_BREAK; break; } - if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, busiest, this_cpu, sd, idle, - lb_flags)) - continue; + if (!can_migrate_task(p, env)) + goto next; + + load = task_h_load(p); + + if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) + goto next; - pull_task(busiest, p, this_rq, this_cpu); + if ((load / 2) > env->imbalance) + goto next; + + move_task(p, env); pulled++; - rem_load_move -= p->se.load.weight; + env->imbalance -= load; #ifdef CONFIG_PREEMPT /* @@ -3259,273 +5383,209 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (idle == CPU_NEWLY_IDLE) { - *lb_flags |= LBF_ABORT; + if (env->idle == CPU_NEWLY_IDLE) break; - } #endif /* * We only want to steal up to the prescribed amount of * weighted load. */ - if (rem_load_move <= 0) + if (env->imbalance <= 0) break; + + continue; +next: + list_move_tail(&p->se.group_node, tasks); } -out: + /* - * Right now, this is one of only two places pull_task() is called, - * so we can safely collect pull_task() stats here rather than - * inside pull_task(). + * Right now, this is one of only two places move_task() is called, + * so we can safely collect move_task() stats here rather than + * inside move_task(). */ - schedstat_add(sd, lb_gained[idle], pulled); + schedstat_add(env->sd, lb_gained[env->idle], pulled); - return max_load_move - rem_load_move; + return pulled; } #ifdef CONFIG_FAIR_GROUP_SCHED /* * update tg->load_weight by folding this cpu's load_avg */ -static int update_shares_cpu(struct task_group *tg, int cpu) +static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) { - struct cfs_rq *cfs_rq; - unsigned long flags; - struct rq *rq; - - if (!tg->se[cpu]) - return 0; - - rq = cpu_rq(cpu); - cfs_rq = tg->cfs_rq[cpu]; - - raw_spin_lock_irqsave(&rq->lock, flags); - - update_rq_clock(rq); - update_cfs_load(cfs_rq, 1); + struct sched_entity *se = tg->se[cpu]; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; - /* - * We need to update shares after updating tg->load_weight in - * order to adjust the weight of groups with long running tasks. - */ - update_cfs_shares(cfs_rq); + /* throttled entities do not contribute to load */ + if (throttled_hierarchy(cfs_rq)) + return; - raw_spin_unlock_irqrestore(&rq->lock, flags); + update_cfs_rq_blocked_load(cfs_rq, 1); - return 0; + if (se) { + update_entity_load_avg(se, 1); + /* + * We pivot on our runnable average having decayed to zero for + * list removal. This generally implies that all our children + * have also been removed (modulo rounding error or bandwidth + * control); however, such cases are rare and we can fix these + * at enqueue. + * + * TODO: fix up out-of-order children on enqueue. + */ + if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) + list_del_leaf_cfs_rq(cfs_rq); + } else { + struct rq *rq = rq_of(cfs_rq); + update_rq_runnable_avg(rq, rq->nr_running); + } } -static void update_shares(int cpu) +static void update_blocked_averages(int cpu) { - struct cfs_rq *cfs_rq; struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq; + unsigned long flags; - rcu_read_lock(); + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); /* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ for_each_leaf_cfs_rq(rq, cfs_rq) { - /* throttled entities do not contribute to load */ - if (throttled_hierarchy(cfs_rq)) - continue; - - update_shares_cpu(cfs_rq->tg, cpu); + /* + * Note: We may want to consider periodically releasing + * rq->lock about these updates so that creating many task + * groups does not result in continually extending hold time. + */ + __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); } - rcu_read_unlock(); + + raw_spin_unlock_irqrestore(&rq->lock, flags); } /* - * Compute the cpu's hierarchical load factor for each task group. + * Compute the hierarchical load factor for cfs_rq and all its ascendants. * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. */ -static int tg_load_down(struct task_group *tg, void *data) +static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) { + struct rq *rq = rq_of(cfs_rq); + struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; + unsigned long now = jiffies; unsigned long load; - long cpu = (long)data; - if (!tg->parent) { - load = cpu_rq(cpu)->load.weight; - } else { - load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->se[cpu]->load.weight; - load /= tg->parent->cfs_rq[cpu]->load.weight + 1; - } + if (cfs_rq->last_h_load_update == now) + return; - tg->cfs_rq[cpu]->h_load = load; + cfs_rq->h_load_next = NULL; + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + cfs_rq->h_load_next = se; + if (cfs_rq->last_h_load_update == now) + break; + } - return 0; -} + if (!se) { + cfs_rq->h_load = cfs_rq->runnable_load_avg; + cfs_rq->last_h_load_update = now; + } -static void update_h_load(long cpu) -{ - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); + while ((se = cfs_rq->h_load_next) != NULL) { + load = cfs_rq->h_load; + load = div64_ul(load * se->avg.load_avg_contrib, + cfs_rq->runnable_load_avg + 1); + cfs_rq = group_cfs_rq(se); + cfs_rq->h_load = load; + cfs_rq->last_h_load_update = now; + } } -static unsigned long -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +static unsigned long task_h_load(struct task_struct *p) { - long rem_load_move = max_load_move; - struct cfs_rq *busiest_cfs_rq; - - rcu_read_lock(); - update_h_load(cpu_of(busiest)); - - for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { - unsigned long busiest_h_load = busiest_cfs_rq->h_load; - unsigned long busiest_weight = busiest_cfs_rq->load.weight; - u64 rem_load, moved_load; - - if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) - break; - - /* - * empty group or part of a throttled hierarchy - */ - if (!busiest_cfs_rq->task_weight || - throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) - continue; - - rem_load = (u64)rem_load_move * busiest_weight; - rem_load = div_u64(rem_load, busiest_h_load + 1); - - moved_load = balance_tasks(this_rq, this_cpu, busiest, - rem_load, sd, idle, lb_flags, - busiest_cfs_rq); - - if (!moved_load) - continue; - - moved_load *= busiest_h_load; - moved_load = div_u64(moved_load, busiest_weight + 1); + struct cfs_rq *cfs_rq = task_cfs_rq(p); - rem_load_move -= moved_load; - if (rem_load_move < 0) - break; - } - rcu_read_unlock(); - - return max_load_move - rem_load_move; + update_cfs_rq_h_load(cfs_rq); + return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, + cfs_rq->runnable_load_avg + 1); } #else -static inline void update_shares(int cpu) +static inline void update_blocked_averages(int cpu) { } -static unsigned long -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +static unsigned long task_h_load(struct task_struct *p) { - return balance_tasks(this_rq, this_cpu, busiest, - max_load_move, sd, idle, lb_flags, - &busiest->cfs); + return p->se.avg.load_avg_contrib; } #endif -/* - * move_tasks tries to move up to max_load_move weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. - */ -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) -{ - unsigned long total_load_moved = 0, load_moved; - - do { - load_moved = load_balance_fair(this_rq, this_cpu, busiest, - max_load_move - total_load_moved, - sd, idle, lb_flags); - - total_load_moved += load_moved; - - if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) - break; - -#ifdef CONFIG_PREEMPT - /* - * NEWIDLE balancing is a source of latency, so preemptible - * kernels will stop after the first task is pulled to minimize - * the critical section. - */ - if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) { - *lb_flags |= LBF_ABORT; - break; - } -#endif - } while (load_moved && max_load_move > total_load_moved); - - return total_load_moved > 0; -} - /********** Helpers for find_busiest_group ************************/ /* - * sd_lb_stats - Structure to store the statistics of a sched_domain - * during load balancing. - */ -struct sd_lb_stats { - struct sched_group *busiest; /* Busiest group in this sd */ - struct sched_group *this; /* Local group in this sd */ - unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_pwr; /* Total power of all groups in sd */ - unsigned long avg_load; /* Average load across all groups in sd */ - - /** Statistics of this group */ - unsigned long this_load; - unsigned long this_load_per_task; - unsigned long this_nr_running; - unsigned long this_has_capacity; - unsigned int this_idle_cpus; - - /* Statistics of the busiest group */ - unsigned int busiest_idle_cpus; - unsigned long max_load; - unsigned long busiest_load_per_task; - unsigned long busiest_nr_running; - unsigned long busiest_group_capacity; - unsigned long busiest_has_capacity; - unsigned int busiest_group_weight; - - int group_imb; /* Is there imbalance in this sd */ -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - int power_savings_balance; /* Is powersave balance needed for this sd */ - struct sched_group *group_min; /* Least loaded group in sd */ - struct sched_group *group_leader; /* Group which relieves group_min */ - unsigned long min_load_per_task; /* load_per_task in group_min */ - unsigned long leader_nr_running; /* Nr running of group_leader */ - unsigned long min_nr_running; /* Nr running of group_min */ -#endif -}; - -/* * sg_lb_stats - stats of a sched_group required for load_balancing */ struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long sum_nr_running; /* Nr tasks running in the group */ unsigned long sum_weighted_load; /* Weighted load of group's tasks */ + unsigned long load_per_task; unsigned long group_capacity; - unsigned long idle_cpus; - unsigned long group_weight; + unsigned int sum_nr_running; /* Nr tasks running in the group */ + unsigned int group_capacity_factor; + unsigned int idle_cpus; + unsigned int group_weight; int group_imb; /* Is there an imbalance in the group ? */ - int group_has_capacity; /* Is there extra capacity in the group? */ + int group_has_free_capacity; +#ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; + unsigned int nr_preferred_running; +#endif }; +/* + * sd_lb_stats - Structure to store the statistics of a sched_domain + * during load balancing. + */ +struct sd_lb_stats { + struct sched_group *busiest; /* Busiest group in this sd */ + struct sched_group *local; /* Local group in this sd */ + unsigned long total_load; /* Total load of all groups in sd */ + unsigned long total_capacity; /* Total capacity of all groups in sd */ + unsigned long avg_load; /* Average load across all groups in sd */ + + struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ + struct sg_lb_stats local_stat; /* Statistics of the local group */ +}; + +static inline void init_sd_lb_stats(struct sd_lb_stats *sds) +{ + /* + * Skimp on the clearing to avoid duplicate work. We can avoid clearing + * local_stat because update_sg_lb_stats() does a full clear/assignment. + * We must however clear busiest_stat::avg_load because + * update_sd_pick_busiest() reads this before assignment. + */ + *sds = (struct sd_lb_stats){ + .busiest = NULL, + .local = NULL, + .total_load = 0UL, + .total_capacity = 0UL, + .busiest_stat = { + .avg_load = 0UL, + }, + }; +} + /** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. - * @idle: The Idle status of the CPU for whose sd load_icx is obtained. + * @idle: The idle status of the CPU for whose sd load_idx is obtained. + * + * Return: The load index. */ static inline int get_sd_load_idx(struct sched_domain *sd, enum cpu_idle_type idle) @@ -3548,159 +5608,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } - -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * init_sd_power_savings_stats - Initialize power savings statistics for - * the given sched_domain, during load balancing. - * - * @sd: Sched domain whose power-savings statistics are to be initialized. - * @sds: Variable containing the statistics for sd. - * @idle: Idle status of the CPU at which we're performing load-balancing. - */ -static inline void init_sd_power_savings_stats(struct sched_domain *sd, - struct sd_lb_stats *sds, enum cpu_idle_type idle) +static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) { - /* - * Busy processors will not participate in power savings - * balance. - */ - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) - sds->power_savings_balance = 0; - else { - sds->power_savings_balance = 1; - sds->min_nr_running = ULONG_MAX; - sds->leader_nr_running = 0; - } + return SCHED_CAPACITY_SCALE; } -/** - * update_sd_power_savings_stats - Update the power saving stats for a - * sched_domain while performing load balancing. - * - * @group: sched_group belonging to the sched_domain under consideration. - * @sds: Variable containing the statistics of the sched_domain - * @local_group: Does group contain the CPU for which we're performing - * load balancing ? - * @sgs: Variable containing the statistics of the group. - */ -static inline void update_sd_power_savings_stats(struct sched_group *group, - struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) { - - if (!sds->power_savings_balance) - return; - - /* - * If the local group is idle or completely loaded - * no need to do power savings balance at this domain - */ - if (local_group && (sds->this_nr_running >= sgs->group_capacity || - !sds->this_nr_running)) - sds->power_savings_balance = 0; - - /* - * If a group is already running at full capacity or idle, - * don't include that group in power savings calculations - */ - if (!sds->power_savings_balance || - sgs->sum_nr_running >= sgs->group_capacity || - !sgs->sum_nr_running) - return; - - /* - * Calculate the group which has the least non-idle load. - * This is the group from where we need to pick up the load - * for saving power - */ - if ((sgs->sum_nr_running < sds->min_nr_running) || - (sgs->sum_nr_running == sds->min_nr_running && - group_first_cpu(group) > group_first_cpu(sds->group_min))) { - sds->group_min = group; - sds->min_nr_running = sgs->sum_nr_running; - sds->min_load_per_task = sgs->sum_weighted_load / - sgs->sum_nr_running; - } - - /* - * Calculate the group which is almost near its - * capacity but still has some space to pick up some load - * from other group and save more power - */ - if (sgs->sum_nr_running + 1 > sgs->group_capacity) - return; - - if (sgs->sum_nr_running > sds->leader_nr_running || - (sgs->sum_nr_running == sds->leader_nr_running && - group_first_cpu(group) < group_first_cpu(sds->group_leader))) { - sds->group_leader = group; - sds->leader_nr_running = sgs->sum_nr_running; - } + return default_scale_capacity(sd, cpu); } -/** - * check_power_save_busiest_group - see if there is potential for some power-savings balance - * @sds: Variable containing the statistics of the sched_domain - * under consideration. - * @this_cpu: Cpu at which we're currently performing load-balancing. - * @imbalance: Variable to store the imbalance. - * - * Description: - * Check if we have potential to perform some power-savings balance. - * If yes, set the busiest group to be the least loaded group in the - * sched_domain, so that it's CPUs can be put to idle. - * - * Returns 1 if there is potential to perform power-savings balance. - * Else returns 0. - */ -static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, - int this_cpu, unsigned long *imbalance) -{ - if (!sds->power_savings_balance) - return 0; - - if (sds->this != sds->group_leader || - sds->group_leader == sds->group_min) - return 0; - - *imbalance = sds->min_load_per_task; - sds->busiest = sds->group_min; - - return 1; - -} -#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -static inline void init_sd_power_savings_stats(struct sched_domain *sd, - struct sd_lb_stats *sds, enum cpu_idle_type idle) -{ - return; -} - -static inline void update_sd_power_savings_stats(struct sched_group *group, - struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) -{ - return; -} - -static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, - int this_cpu, unsigned long *imbalance) -{ - return 0; -} -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ - - -unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) -{ - return SCHED_POWER_SCALE; -} - -unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) -{ - return default_scale_freq_power(sd, cpu); -} - -unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; unsigned long smt_gain = sd->smt_gain; @@ -3710,87 +5628,146 @@ unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) return smt_gain; } -unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) { - return default_scale_smt_power(sd, cpu); + return default_scale_smt_capacity(sd, cpu); } -unsigned long scale_rt_power(int cpu) +static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); - u64 total, available; + u64 total, available, age_stamp, avg; + s64 delta; - total = sched_avg_period() + (rq->clock - rq->age_stamp); + /* + * Since we're reading these variables without serialization make sure + * we read them once before doing sanity checks on them. + */ + age_stamp = ACCESS_ONCE(rq->age_stamp); + avg = ACCESS_ONCE(rq->rt_avg); - if (unlikely(total < rq->rt_avg)) { - /* Ensures that power won't end up being negative */ + delta = rq_clock(rq) - age_stamp; + if (unlikely(delta < 0)) + delta = 0; + + total = sched_avg_period() + delta; + + if (unlikely(total < avg)) { + /* Ensures that capacity won't end up being negative */ available = 0; } else { - available = total - rq->rt_avg; + available = total - avg; } - if (unlikely((s64)total < SCHED_POWER_SCALE)) - total = SCHED_POWER_SCALE; + if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) + total = SCHED_CAPACITY_SCALE; - total >>= SCHED_POWER_SHIFT; + total >>= SCHED_CAPACITY_SHIFT; return div_u64(available, total); } -static void update_cpu_power(struct sched_domain *sd, int cpu) +static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; - unsigned long power = SCHED_POWER_SCALE; + unsigned long capacity = SCHED_CAPACITY_SCALE; struct sched_group *sdg = sd->groups; - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - if (sched_feat(ARCH_POWER)) - power *= arch_scale_smt_power(sd, cpu); + if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { + if (sched_feat(ARCH_CAPACITY)) + capacity *= arch_scale_smt_capacity(sd, cpu); else - power *= default_scale_smt_power(sd, cpu); + capacity *= default_scale_smt_capacity(sd, cpu); - power >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; } - sdg->sgp->power_orig = power; + sdg->sgc->capacity_orig = capacity; - if (sched_feat(ARCH_POWER)) - power *= arch_scale_freq_power(sd, cpu); + if (sched_feat(ARCH_CAPACITY)) + capacity *= arch_scale_freq_capacity(sd, cpu); else - power *= default_scale_freq_power(sd, cpu); + capacity *= default_scale_capacity(sd, cpu); - power >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; - power *= scale_rt_power(cpu); - power >>= SCHED_POWER_SHIFT; + capacity *= scale_rt_capacity(cpu); + capacity >>= SCHED_CAPACITY_SHIFT; - if (!power) - power = 1; + if (!capacity) + capacity = 1; - cpu_rq(cpu)->cpu_power = power; - sdg->sgp->power = power; + cpu_rq(cpu)->cpu_capacity = capacity; + sdg->sgc->capacity = capacity; } -void update_group_power(struct sched_domain *sd, int cpu) +void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long power; + unsigned long capacity, capacity_orig; + unsigned long interval; + + interval = msecs_to_jiffies(sd->balance_interval); + interval = clamp(interval, 1UL, max_load_balance_interval); + sdg->sgc->next_update = jiffies + interval; if (!child) { - update_cpu_power(sd, cpu); + update_cpu_capacity(sd, cpu); return; } - power = 0; + capacity_orig = capacity = 0; - group = child->groups; - do { - power += group->sgp->power; - group = group->next; - } while (group != child->groups); + if (child->flags & SD_OVERLAP) { + /* + * SD_OVERLAP domains cannot assume that child groups + * span the current group. + */ + + for_each_cpu(cpu, sched_group_cpus(sdg)) { + struct sched_group_capacity *sgc; + struct rq *rq = cpu_rq(cpu); + + /* + * build_sched_domains() -> init_sched_groups_capacity() + * gets here before we've attached the domains to the + * runqueues. + * + * Use capacity_of(), which is set irrespective of domains + * in update_cpu_capacity(). + * + * This avoids capacity/capacity_orig from being 0 and + * causing divide-by-zero issues on boot. + * + * Runtime updates will correct capacity_orig. + */ + if (unlikely(!rq->sd)) { + capacity_orig += capacity_of(cpu); + capacity += capacity_of(cpu); + continue; + } + + sgc = rq->sd->groups->sgc; + capacity_orig += sgc->capacity_orig; + capacity += sgc->capacity; + } + } else { + /* + * !SD_OVERLAP domains can assume that child groups + * span the current group. + */ - sdg->sgp->power = power; + group = child->groups; + do { + capacity_orig += group->sgc->capacity_orig; + capacity += group->sgc->capacity; + group = group->next; + } while (group != child->groups); + } + + sdg->sgc->capacity_orig = capacity_orig; + sdg->sgc->capacity = capacity; } /* @@ -3804,142 +5781,157 @@ static inline int fix_small_capacity(struct sched_domain *sd, struct sched_group *group) { /* - * Only siblings can have significantly less than SCHED_POWER_SCALE + * Only siblings can have significantly less than SCHED_CAPACITY_SCALE */ - if (!(sd->flags & SD_SHARE_CPUPOWER)) + if (!(sd->flags & SD_SHARE_CPUCAPACITY)) return 0; /* - * If ~90% of the cpu_power is still there, we're good. + * If ~90% of the cpu_capacity is still there, we're good. */ - if (group->sgp->power * 32 > group->sgp->power_orig * 29) + if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) return 1; return 0; } +/* + * Group imbalance indicates (and tries to solve) the problem where balancing + * groups is inadequate due to tsk_cpus_allowed() constraints. + * + * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a + * cpumask covering 1 cpu of the first group and 3 cpus of the second group. + * Something like: + * + * { 0 1 2 3 } { 4 5 6 7 } + * * * * * + * + * If we were to balance group-wise we'd place two tasks in the first group and + * two tasks in the second group. Clearly this is undesired as it will overload + * cpu 3 and leave one of the cpus in the second group unused. + * + * The current solution to this issue is detecting the skew in the first group + * by noticing the lower domain failed to reach balance and had difficulty + * moving tasks due to affinity constraints. + * + * When this is so detected; this group becomes a candidate for busiest; see + * update_sd_pick_busiest(). And calculate_imbalance() and + * find_busiest_group() avoid some of the usual balance conditions to allow it + * to create an effective group imbalance. + * + * This is a somewhat tricky proposition since the next run might not find the + * group imbalance and decide the groups need to be balanced again. A most + * subtle and fragile situation. + */ + +static inline int sg_imbalanced(struct sched_group *group) +{ + return group->sgc->imbalance; +} + +/* + * Compute the group capacity factor. + * + * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by + * first dividing out the smt factor and computing the actual number of cores + * and limit unit capacity with that. + */ +static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) +{ + unsigned int capacity_factor, smt, cpus; + unsigned int capacity, capacity_orig; + + capacity = group->sgc->capacity; + capacity_orig = group->sgc->capacity_orig; + cpus = group->group_weight; + + /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ + smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); + capacity_factor = cpus / smt; /* cores */ + + capacity_factor = min_t(unsigned, + capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); + if (!capacity_factor) + capacity_factor = fix_small_capacity(env->sd, group); + + return capacity_factor; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. - * @sd: The sched_domain whose statistics are to be updated. + * @env: The load balancing environment. * @group: sched_group whose statistics are to be updated. - * @this_cpu: Cpu for which load balance is currently performed. - * @idle: Idle status of this_cpu * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. - * @cpus: Set of cpus considered for load balancing. - * @balance: Should we balance. * @sgs: variable to hold the statistics for this group. */ -static inline void update_sg_lb_stats(struct sched_domain *sd, - struct sched_group *group, int this_cpu, - enum cpu_idle_type idle, int load_idx, - int local_group, const struct cpumask *cpus, - int *balance, struct sg_lb_stats *sgs) +static inline void update_sg_lb_stats(struct lb_env *env, + struct sched_group *group, int load_idx, + int local_group, struct sg_lb_stats *sgs) { - unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; + unsigned long load; int i; - unsigned int balance_cpu = -1, first_idle_cpu = 0; - unsigned long avg_load_per_task = 0; - if (local_group) - balance_cpu = group_first_cpu(group); + memset(sgs, 0, sizeof(*sgs)); - /* Tally up the load of all CPUs in the group */ - max_cpu_load = 0; - min_cpu_load = ~0UL; - max_nr_running = 0; - - for_each_cpu_and(i, sched_group_cpus(group), cpus) { + for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); /* Bias balancing toward cpus of our domain */ - if (local_group) { - if (idle_cpu(i) && !first_idle_cpu) { - first_idle_cpu = 1; - balance_cpu = i; - } - + if (local_group) load = target_load(i, load_idx); - } else { + else load = source_load(i, load_idx); - if (load > max_cpu_load) { - max_cpu_load = load; - max_nr_running = rq->nr_running; - } - if (min_cpu_load > load) - min_cpu_load = load; - } sgs->group_load += load; sgs->sum_nr_running += rq->nr_running; +#ifdef CONFIG_NUMA_BALANCING + sgs->nr_numa_running += rq->nr_numa_running; + sgs->nr_preferred_running += rq->nr_preferred_running; +#endif sgs->sum_weighted_load += weighted_cpuload(i); if (idle_cpu(i)) sgs->idle_cpus++; } - /* - * First idle cpu or the first cpu(busiest) in this sched group - * is eligible for doing load balancing at this and above - * domains. In the newly idle case, we will allow all the cpu's - * to do the newly idle load balance. - */ - if (idle != CPU_NEWLY_IDLE && local_group) { - if (balance_cpu != this_cpu) { - *balance = 0; - return; - } - update_group_power(sd, this_cpu); - } - - /* Adjust by relative CPU power of the group */ - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; + /* Adjust by relative CPU capacity of the group */ + sgs->group_capacity = group->sgc->capacity; + sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; - /* - * Consider the group unbalanced when the imbalance is larger - * than the average weight of a task. - * - * APZ: with cgroup the avg task weight can vary wildly and - * might not be a suitable number - should we keep a - * normalized nr_running number somewhere that negates - * the hierarchy? - */ if (sgs->sum_nr_running) - avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; + sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) - sgs->group_imb = 1; - - sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, - SCHED_POWER_SCALE); - if (!sgs->group_capacity) - sgs->group_capacity = fix_small_capacity(sd, group); sgs->group_weight = group->group_weight; - if (sgs->group_capacity > sgs->sum_nr_running) - sgs->group_has_capacity = 1; + sgs->group_imb = sg_imbalanced(group); + sgs->group_capacity_factor = sg_capacity_factor(env, group); + + if (sgs->group_capacity_factor > sgs->sum_nr_running) + sgs->group_has_free_capacity = 1; } /** * update_sd_pick_busiest - return 1 on busiest group - * @sd: sched_domain whose statistics are to be checked + * @env: The load balancing environment. * @sds: sched_domain statistics * @sg: sched_group candidate to be checked for being the busiest * @sgs: sched_group statistics - * @this_cpu: the current cpu * * Determine if @sg is a busier group than the previously selected * busiest group. + * + * Return: %true if @sg is a busier group than the previously selected + * busiest group. %false otherwise. */ -static bool update_sd_pick_busiest(struct sched_domain *sd, +static bool update_sd_pick_busiest(struct lb_env *env, struct sd_lb_stats *sds, struct sched_group *sg, - struct sg_lb_stats *sgs, - int this_cpu) + struct sg_lb_stats *sgs) { - if (sgs->avg_load <= sds->max_load) + if (sgs->avg_load <= sds->busiest_stat.avg_load) return false; - if (sgs->sum_nr_running > sgs->group_capacity) + if (sgs->sum_nr_running > sgs->group_capacity_factor) return true; if (sgs->group_imb) @@ -3950,8 +5942,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, * numbered CPUs in the group, therefore mark all groups * higher than ourself as busy. */ - if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && - this_cpu < group_first_cpu(sg)) { + if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && + env->dst_cpu < group_first_cpu(sg)) { if (!sds->busiest) return true; @@ -3962,79 +5954,101 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, return false; } +#ifdef CONFIG_NUMA_BALANCING +static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) +{ + if (sgs->sum_nr_running > sgs->nr_numa_running) + return regular; + if (sgs->sum_nr_running > sgs->nr_preferred_running) + return remote; + return all; +} + +static inline enum fbq_type fbq_classify_rq(struct rq *rq) +{ + if (rq->nr_running > rq->nr_numa_running) + return regular; + if (rq->nr_running > rq->nr_preferred_running) + return remote; + return all; +} +#else +static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) +{ + return all; +} + +static inline enum fbq_type fbq_classify_rq(struct rq *rq) +{ + return regular; +} +#endif /* CONFIG_NUMA_BALANCING */ + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. - * @sd: sched_domain whose statistics are to be updated. - * @this_cpu: Cpu for which load balance is currently performed. - * @idle: Idle status of this_cpu - * @cpus: Set of cpus considered for load balancing. - * @balance: Should we balance. + * @env: The load balancing environment. * @sds: variable to hold the statistics for this sched_domain. */ -static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, - enum cpu_idle_type idle, const struct cpumask *cpus, - int *balance, struct sd_lb_stats *sds) +static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { - struct sched_domain *child = sd->child; - struct sched_group *sg = sd->groups; - struct sg_lb_stats sgs; + struct sched_domain *child = env->sd->child; + struct sched_group *sg = env->sd->groups; + struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; - init_sd_power_savings_stats(sd, sds, idle); - load_idx = get_sd_load_idx(sd, idle); + load_idx = get_sd_load_idx(env->sd, env->idle); do { + struct sg_lb_stats *sgs = &tmp_sgs; int local_group; - local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); - memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, - local_group, cpus, balance, &sgs); + local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); + if (local_group) { + sds->local = sg; + sgs = &sds->local_stat; - if (local_group && !(*balance)) - return; + if (env->idle != CPU_NEWLY_IDLE || + time_after_eq(jiffies, sg->sgc->next_update)) + update_group_capacity(env->sd, env->dst_cpu); + } + + update_sg_lb_stats(env, sg, load_idx, local_group, sgs); - sds->total_load += sgs.group_load; - sds->total_pwr += sg->sgp->power; + if (local_group) + goto next_group; /* * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity to one so that we'll try + * first, lower the sg capacity factor to one so that we'll try * and move all the excess tasks away. We lower the capacity * of a group only if the local group has the capacity to fit - * these excess tasks, i.e. nr_running < group_capacity. The + * these excess tasks, i.e. nr_running < group_capacity_factor. The * extra check prevents the case where you always pull from the * heaviest group when it is already under-utilized (possible * with a large weight task outweighs the tasks on the system). */ - if (prefer_sibling && !local_group && sds->this_has_capacity) - sgs.group_capacity = min(sgs.group_capacity, 1UL); + if (prefer_sibling && sds->local && + sds->local_stat.group_has_free_capacity) + sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); - if (local_group) { - sds->this_load = sgs.avg_load; - sds->this = sg; - sds->this_nr_running = sgs.sum_nr_running; - sds->this_load_per_task = sgs.sum_weighted_load; - sds->this_has_capacity = sgs.group_has_capacity; - sds->this_idle_cpus = sgs.idle_cpus; - } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { - sds->max_load = sgs.avg_load; + if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; - sds->busiest_nr_running = sgs.sum_nr_running; - sds->busiest_idle_cpus = sgs.idle_cpus; - sds->busiest_group_capacity = sgs.group_capacity; - sds->busiest_load_per_task = sgs.sum_weighted_load; - sds->busiest_has_capacity = sgs.group_has_capacity; - sds->busiest_group_weight = sgs.group_weight; - sds->group_imb = sgs.group_imb; + sds->busiest_stat = *sgs; } - update_sd_power_savings_stats(sg, sds, local_group, &sgs); +next_group: + /* Now, start updating sd_lb_stats */ + sds->total_load += sgs->group_load; + sds->total_capacity += sgs->group_capacity; + sg = sg->next; - } while (sg != sd->groups); + } while (sg != env->sd->groups); + + if (env->sd->flags & SD_NUMA) + env->fbq_type = fbq_classify_group(&sds->busiest_stat); } /** @@ -4054,32 +6068,30 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, * assuming lower CPU number will be equivalent to lower a SMT thread * number. * - * Returns 1 when packing is required and a task should be moved to + * Return: 1 when packing is required and a task should be moved to * this CPU. The amount of the imbalance is returned in *imbalance. * - * @sd: The sched_domain whose packing is to be checked. + * @env: The load balancing environment. * @sds: Statistics of the sched_domain which is to be packed - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: returns amount of imbalanced due to packing. */ -static int check_asym_packing(struct sched_domain *sd, - struct sd_lb_stats *sds, - int this_cpu, unsigned long *imbalance) +static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) { int busiest_cpu; - if (!(sd->flags & SD_ASYM_PACKING)) + if (!(env->sd->flags & SD_ASYM_PACKING)) return 0; if (!sds->busiest) return 0; busiest_cpu = group_first_cpu(sds->busiest); - if (this_cpu > busiest_cpu) + if (env->dst_cpu > busiest_cpu) return 0; - *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, - SCHED_POWER_SCALE); + env->imbalance = DIV_ROUND_CLOSEST( + sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, + SCHED_CAPACITY_SCALE); + return 1; } @@ -4087,110 +6099,117 @@ static int check_asym_packing(struct sched_domain *sd, * fix_small_imbalance - Calculate the minor imbalance that exists * amongst the groups of a sched_domain, during * load balancing. + * @env: The load balancing environment. * @sds: Statistics of the sched_domain whose imbalance is to be calculated. - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: Variable to store the imbalance. */ -static inline void fix_small_imbalance(struct sd_lb_stats *sds, - int this_cpu, unsigned long *imbalance) +static inline +void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { - unsigned long tmp, pwr_now = 0, pwr_move = 0; + unsigned long tmp, capa_now = 0, capa_move = 0; unsigned int imbn = 2; unsigned long scaled_busy_load_per_task; + struct sg_lb_stats *local, *busiest; - if (sds->this_nr_running) { - sds->this_load_per_task /= sds->this_nr_running; - if (sds->busiest_load_per_task > - sds->this_load_per_task) - imbn = 1; - } else - sds->this_load_per_task = - cpu_avg_load_per_task(this_cpu); + local = &sds->local_stat; + busiest = &sds->busiest_stat; + + if (!local->sum_nr_running) + local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); + else if (busiest->load_per_task > local->load_per_task) + imbn = 1; - scaled_busy_load_per_task = sds->busiest_load_per_task - * SCHED_POWER_SCALE; - scaled_busy_load_per_task /= sds->busiest->sgp->power; + scaled_busy_load_per_task = + (busiest->load_per_task * SCHED_CAPACITY_SCALE) / + busiest->group_capacity; - if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= - (scaled_busy_load_per_task * imbn)) { - *imbalance = sds->busiest_load_per_task; + if (busiest->avg_load + scaled_busy_load_per_task >= + local->avg_load + (scaled_busy_load_per_task * imbn)) { + env->imbalance = busiest->load_per_task; return; } /* * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by + * however we may be able to increase total CPU capacity used by * moving them. */ - pwr_now += sds->busiest->sgp->power * - min(sds->busiest_load_per_task, sds->max_load); - pwr_now += sds->this->sgp->power * - min(sds->this_load_per_task, sds->this_load); - pwr_now /= SCHED_POWER_SCALE; + capa_now += busiest->group_capacity * + min(busiest->load_per_task, busiest->avg_load); + capa_now += local->group_capacity * + min(local->load_per_task, local->avg_load); + capa_now /= SCHED_CAPACITY_SCALE; /* Amount of load we'd subtract */ - tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->busiest->sgp->power; - if (sds->max_load > tmp) - pwr_move += sds->busiest->sgp->power * - min(sds->busiest_load_per_task, sds->max_load - tmp); + if (busiest->avg_load > scaled_busy_load_per_task) { + capa_move += busiest->group_capacity * + min(busiest->load_per_task, + busiest->avg_load - scaled_busy_load_per_task); + } /* Amount of load we'd add */ - if (sds->max_load * sds->busiest->sgp->power < - sds->busiest_load_per_task * SCHED_POWER_SCALE) - tmp = (sds->max_load * sds->busiest->sgp->power) / - sds->this->sgp->power; - else - tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->this->sgp->power; - pwr_move += sds->this->sgp->power * - min(sds->this_load_per_task, sds->this_load + tmp); - pwr_move /= SCHED_POWER_SCALE; + if (busiest->avg_load * busiest->group_capacity < + busiest->load_per_task * SCHED_CAPACITY_SCALE) { + tmp = (busiest->avg_load * busiest->group_capacity) / + local->group_capacity; + } else { + tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) / + local->group_capacity; + } + capa_move += local->group_capacity * + min(local->load_per_task, local->avg_load + tmp); + capa_move /= SCHED_CAPACITY_SCALE; /* Move if we gain throughput */ - if (pwr_move > pwr_now) - *imbalance = sds->busiest_load_per_task; + if (capa_move > capa_now) + env->imbalance = busiest->load_per_task; } /** * calculate_imbalance - Calculate the amount of imbalance present within the * groups of a given sched_domain during load balance. + * @env: load balance environment * @sds: statistics of the sched_domain whose imbalance is to be calculated. - * @this_cpu: Cpu for which currently load balance is being performed. - * @imbalance: The variable to store the imbalance. */ -static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, - unsigned long *imbalance) +static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { unsigned long max_pull, load_above_capacity = ~0UL; + struct sg_lb_stats *local, *busiest; - sds->busiest_load_per_task /= sds->busiest_nr_running; - if (sds->group_imb) { - sds->busiest_load_per_task = - min(sds->busiest_load_per_task, sds->avg_load); + local = &sds->local_stat; + busiest = &sds->busiest_stat; + + if (busiest->group_imb) { + /* + * In the group_imb case we cannot rely on group-wide averages + * to ensure cpu-load equilibrium, look at wider averages. XXX + */ + busiest->load_per_task = + min(busiest->load_per_task, sds->avg_load); } /* * In the presence of smp nice balancing, certain scenarios can have * max load less than avg load(as we skip the groups at or below - * its cpu_power, while calculating max_load..) + * its cpu_capacity, while calculating max_load..) */ - if (sds->max_load < sds->avg_load) { - *imbalance = 0; - return fix_small_imbalance(sds, this_cpu, imbalance); + if (busiest->avg_load <= sds->avg_load || + local->avg_load >= sds->avg_load) { + env->imbalance = 0; + return fix_small_imbalance(env, sds); } - if (!sds->group_imb) { + if (!busiest->group_imb) { /* * Don't want to pull so many tasks that a group would go idle. + * Except of course for the group_imb case, since then we might + * have to drop below capacity to reach cpu-load equilibrium. */ - load_above_capacity = (sds->busiest_nr_running - - sds->busiest_group_capacity); + load_above_capacity = + (busiest->sum_nr_running - busiest->group_capacity_factor); - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - - load_above_capacity /= sds->busiest->sgp->power; + load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); + load_above_capacity /= busiest->group_capacity; } /* @@ -4200,15 +6219,14 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, * we also don't want to reduce the group load below the group capacity * (so that we can implement power-savings policies etc). Thus we look * for the minimum possible imbalance. - * Be careful of negative numbers as they'll appear as very large values - * with unsigned longs. */ - max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); + max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * sds->busiest->sgp->power, - (sds->avg_load - sds->this_load) * sds->this->sgp->power) - / SCHED_POWER_SCALE; + env->imbalance = min( + max_pull * busiest->group_capacity, + (sds->avg_load - local->avg_load) * local->group_capacity + ) / SCHED_CAPACITY_SCALE; /* * if *imbalance is less than the average load per runnable task @@ -4216,9 +6234,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance < sds->busiest_load_per_task) - return fix_small_imbalance(sds, this_cpu, imbalance); - + if (env->imbalance < busiest->load_per_task) + return fix_small_imbalance(env, sds); } /******* find_busiest_group() helpers end here *********************/ @@ -4233,159 +6250,163 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, * Also calculates the amount of weighted load which should be moved * to restore balance. * - * @sd: The sched_domain whose busiest group is to be returned. - * @this_cpu: The cpu for which load balancing is currently being performed. - * @imbalance: Variable which stores amount of weighted load which should - * be moved to restore balance/put a group to idle. - * @idle: The idle status of this_cpu. - * @cpus: The set of CPUs under consideration for load-balancing. - * @balance: Pointer to a variable indicating if this_cpu - * is the appropriate cpu to perform load balancing at this_level. + * @env: The load balancing environment. * - * Returns: - the busiest group if imbalance exists. + * Return: - The busiest group if imbalance exists. * - If no imbalance and user has opted for power-savings balance, * return the least loaded group whose CPUs can be * put to idle by rebalancing its tasks onto our group. */ -static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum cpu_idle_type idle, - const struct cpumask *cpus, int *balance) +static struct sched_group *find_busiest_group(struct lb_env *env) { + struct sg_lb_stats *local, *busiest; struct sd_lb_stats sds; - memset(&sds, 0, sizeof(sds)); + init_sd_lb_stats(&sds); /* * Compute the various statistics relavent for load balancing at * this level. */ - update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); + update_sd_lb_stats(env, &sds); + local = &sds.local_stat; + busiest = &sds.busiest_stat; - /* - * this_cpu is not the appropriate cpu to perform load balancing at - * this level. - */ - if (!(*balance)) - goto ret; - - if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && - check_asym_packing(sd, &sds, this_cpu, imbalance)) + if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && + check_asym_packing(env, &sds)) return sds.busiest; /* There is no busy sibling group to pull tasks from */ - if (!sds.busiest || sds.busiest_nr_running == 0) + if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; - sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; + sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) + / sds.total_capacity; /* * If the busiest group is imbalanced the below checks don't - * work because they assumes all things are equal, which typically + * work because they assume all things are equal, which typically * isn't true due to cpus_allowed constraints and the like. */ - if (sds.group_imb) + if (busiest->group_imb) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && - !sds.busiest_has_capacity) + if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && + !busiest->group_has_free_capacity) goto force_balance; /* * If the local group is more busy than the selected busiest group * don't try and pull any tasks. */ - if (sds.this_load >= sds.max_load) + if (local->avg_load >= busiest->avg_load) goto out_balanced; /* * Don't pull any tasks if this group is already above the domain * average load. */ - if (sds.this_load >= sds.avg_load) + if (local->avg_load >= sds.avg_load) goto out_balanced; - if (idle == CPU_IDLE) { + if (env->idle == CPU_IDLE) { /* * This cpu is idle. If the busiest group load doesn't * have more tasks than the number of available cpu's and * there is no imbalance between this and busiest group * wrt to idle cpu's, it is balanced. */ - if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && - sds.busiest_nr_running <= sds.busiest_group_weight) + if ((local->idle_cpus < busiest->idle_cpus) && + busiest->sum_nr_running <= busiest->group_weight) goto out_balanced; } else { /* * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use * imbalance_pct to be conservative. */ - if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) + if (100 * busiest->avg_load <= + env->sd->imbalance_pct * local->avg_load) goto out_balanced; } force_balance: /* Looks like there is an imbalance. Compute it */ - calculate_imbalance(&sds, this_cpu, imbalance); + calculate_imbalance(env, &sds); return sds.busiest; out_balanced: - /* - * There is no obvious imbalance. But check if we can do some balancing - * to save power. - */ - if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) - return sds.busiest; -ret: - *imbalance = 0; + env->imbalance = 0; return NULL; } /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ -static struct rq * -find_busiest_queue(struct sched_domain *sd, struct sched_group *group, - enum cpu_idle_type idle, unsigned long imbalance, - const struct cpumask *cpus) +static struct rq *find_busiest_queue(struct lb_env *env, + struct sched_group *group) { struct rq *busiest = NULL, *rq; - unsigned long max_load = 0; + unsigned long busiest_load = 0, busiest_capacity = 1; int i; - for_each_cpu(i, sched_group_cpus(group)) { - unsigned long power = power_of(i); - unsigned long capacity = DIV_ROUND_CLOSEST(power, - SCHED_POWER_SCALE); - unsigned long wl; + for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { + unsigned long capacity, capacity_factor, wl; + enum fbq_type rt; - if (!capacity) - capacity = fix_small_capacity(sd, group); + rq = cpu_rq(i); + rt = fbq_classify_rq(rq); - if (!cpumask_test_cpu(i, cpus)) + /* + * We classify groups/runqueues into three groups: + * - regular: there are !numa tasks + * - remote: there are numa tasks that run on the 'wrong' node + * - all: there is no distinction + * + * In order to avoid migrating ideally placed numa tasks, + * ignore those when there's better options. + * + * If we ignore the actual busiest queue to migrate another + * task, the next balance pass can still reduce the busiest + * queue by moving tasks around inside the node. + * + * If we cannot move enough load due to this classification + * the next pass will adjust the group classification and + * allow migration of more tasks. + * + * Both cases only affect the total convergence complexity. + */ + if (rt > env->fbq_type) continue; - rq = cpu_rq(i); + capacity = capacity_of(i); + capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); + if (!capacity_factor) + capacity_factor = fix_small_capacity(env->sd, group); + wl = weighted_cpuload(i); /* * When comparing with imbalance, use weighted_cpuload() - * which is not scaled with the cpu power. + * which is not scaled with the cpu capacity. */ - if (capacity && rq->nr_running == 1 && wl > imbalance) + if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) continue; /* * For the load comparisons with the other cpu's, consider - * the weighted_cpuload() scaled with the cpu power, so that - * the load can be moved away from the cpu that is potentially - * running at a lower capacity. + * the weighted_cpuload() scaled with the cpu capacity, so + * that the load can be moved away from the cpu that is + * potentially running at a lower capacity. + * + * Thus we're looking for max(wl_i / capacity_i), crosswise + * multiplication to rid ourselves of the division works out + * to: wl_i * capacity_j > wl_j * capacity_i; where j is + * our previous maximum. */ - wl = (wl * SCHED_POWER_SCALE) / power; - - if (wl > max_load) { - max_load = wl; + if (wl * busiest_capacity > busiest_load * capacity) { + busiest_load = wl; + busiest_capacity = capacity; busiest = rq; } } @@ -4400,42 +6421,21 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, #define MAX_PINNED_INTERVAL 512 /* Working cpumask for load_balance and load_balance_newidle. */ -DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); -static int need_active_balance(struct sched_domain *sd, int idle, - int busiest_cpu, int this_cpu) +static int need_active_balance(struct lb_env *env) { - if (idle == CPU_NEWLY_IDLE) { + struct sched_domain *sd = env->sd; + + if (env->idle == CPU_NEWLY_IDLE) { /* * ASYM_PACKING needs to force migrate tasks from busy but * higher numbered CPUs in order to pack all tasks in the * lowest numbered CPUs. */ - if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) + if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu) return 1; - - /* - * The only task running in a non-idle cpu can be moved to this - * cpu in an attempt to completely freeup the other CPU - * package. - * - * The package power saving logic comes from - * find_busiest_group(). If there are no imbalance, then - * f_b_g() will return NULL. However when sched_mc={1,2} then - * f_b_g() will select a group from which a running task may be - * pulled to this cpu in order to make the other package idle. - * If there is no opportunity to make a package idle and if - * there are no imbalance, then f_b_g() will return NULL and no - * action will be taken in load_balance_newidle(). - * - * Under normal task pull operation due to imbalance, there - * will be more than one task in the source run queue and - * move_tasks() will succeed. ld_moved will be true and this - * active balance code will not be triggered. - */ - if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) - return 0; } return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); @@ -4443,46 +6443,98 @@ static int need_active_balance(struct sched_domain *sd, int idle, static int active_load_balance_cpu_stop(void *data); +static int should_we_balance(struct lb_env *env) +{ + struct sched_group *sg = env->sd->groups; + struct cpumask *sg_cpus, *sg_mask; + int cpu, balance_cpu = -1; + + /* + * In the newly idle case, we will allow all the cpu's + * to do the newly idle load balance. + */ + if (env->idle == CPU_NEWLY_IDLE) + return 1; + + sg_cpus = sched_group_cpus(sg); + sg_mask = sched_group_mask(sg); + /* Try to find first idle cpu */ + for_each_cpu_and(cpu, sg_cpus, env->cpus) { + if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) + continue; + + balance_cpu = cpu; + break; + } + + if (balance_cpu == -1) + balance_cpu = group_balance_cpu(sg); + + /* + * First idle cpu or the first cpu(busiest) in this sched group + * is eligible for doing load balancing at this and above domains. + */ + return balance_cpu == env->dst_cpu; +} + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance) + int *continue_balancing) { - int ld_moved, lb_flags = 0, active_balance = 0; + int ld_moved, cur_ld_moved, active_balance = 0; + struct sched_domain *sd_parent = sd->parent; struct sched_group *group; - unsigned long imbalance; struct rq *busiest; unsigned long flags; - struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); + struct cpumask *cpus = __get_cpu_var(load_balance_mask); + + struct lb_env env = { + .sd = sd, + .dst_cpu = this_cpu, + .dst_rq = this_rq, + .dst_grpmask = sched_group_cpus(sd->groups), + .idle = idle, + .loop_break = sched_nr_migrate_break, + .cpus = cpus, + .fbq_type = all, + }; + + /* + * For NEWLY_IDLE load_balancing, we don't need to consider + * other cpus in our group + */ + if (idle == CPU_NEWLY_IDLE) + env.dst_grpmask = NULL; cpumask_copy(cpus, cpu_active_mask); schedstat_inc(sd, lb_count[idle]); redo: - group = find_busiest_group(sd, this_cpu, &imbalance, idle, - cpus, balance); - - if (*balance == 0) + if (!should_we_balance(&env)) { + *continue_balancing = 0; goto out_balanced; + } + group = find_busiest_group(&env); if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; } - busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); + busiest = find_busiest_queue(&env, group); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; } - BUG_ON(busiest == this_rq); + BUG_ON(busiest == env.dst_rq); - schedstat_add(sd, lb_imbalance[idle], imbalance); + schedstat_add(sd, lb_imbalance[idle], env.imbalance); ld_moved = 0; if (busiest->nr_running > 1) { @@ -4492,35 +6544,92 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ - lb_flags |= LBF_ALL_PINNED; + env.flags |= LBF_ALL_PINNED; + env.src_cpu = busiest->cpu; + env.src_rq = busiest; + env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); + +more_balance: local_irq_save(flags); - double_rq_lock(this_rq, busiest); - ld_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle, &lb_flags); - double_rq_unlock(this_rq, busiest); + double_rq_lock(env.dst_rq, busiest); + + /* + * cur_ld_moved - load moved in current iteration + * ld_moved - cumulative load moved across iterations + */ + cur_ld_moved = move_tasks(&env); + ld_moved += cur_ld_moved; + double_rq_unlock(env.dst_rq, busiest); local_irq_restore(flags); /* * some other cpu did the load balance for us. */ - if (ld_moved && this_cpu != smp_processor_id()) - resched_cpu(this_cpu); + if (cur_ld_moved && env.dst_cpu != smp_processor_id()) + resched_cpu(env.dst_cpu); - if (lb_flags & LBF_ABORT) - goto out_balanced; + if (env.flags & LBF_NEED_BREAK) { + env.flags &= ~LBF_NEED_BREAK; + goto more_balance; + } - if (lb_flags & LBF_NEED_BREAK) { - lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; - if (lb_flags & LBF_ABORT) - goto out_balanced; - goto redo; + /* + * Revisit (affine) tasks on src_cpu that couldn't be moved to + * us and move them to an alternate dst_cpu in our sched_group + * where they can run. The upper limit on how many times we + * iterate on same src_cpu is dependent on number of cpus in our + * sched_group. + * + * This changes load balance semantics a bit on who can move + * load to a given_cpu. In addition to the given_cpu itself + * (or a ilb_cpu acting on its behalf where given_cpu is + * nohz-idle), we now have balance_cpu in a position to move + * load to given_cpu. In rare situations, this may cause + * conflicts (balance_cpu and given_cpu/ilb_cpu deciding + * _independently_ and at _same_ time to move some load to + * given_cpu) causing exceess load to be moved to given_cpu. + * This however should not happen so much in practice and + * moreover subsequent load balance cycles should correct the + * excess load moved. + */ + if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { + + /* Prevent to re-select dst_cpu via env's cpus */ + cpumask_clear_cpu(env.dst_cpu, env.cpus); + + env.dst_rq = cpu_rq(env.new_dst_cpu); + env.dst_cpu = env.new_dst_cpu; + env.flags &= ~LBF_DST_PINNED; + env.loop = 0; + env.loop_break = sched_nr_migrate_break; + + /* + * Go back to "more_balance" rather than "redo" since we + * need to continue with same src_cpu. + */ + goto more_balance; + } + + /* + * We failed to reach balance because of affinity. + */ + if (sd_parent) { + int *group_imbalance = &sd_parent->groups->sgc->imbalance; + + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { + *group_imbalance = 1; + } else if (*group_imbalance) + *group_imbalance = 0; } /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(lb_flags & LBF_ALL_PINNED)) { + if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) + if (!cpumask_empty(cpus)) { + env.loop = 0; + env.loop_break = sched_nr_migrate_break; goto redo; + } goto out_balanced; } } @@ -4536,7 +6645,7 @@ redo: if (idle != CPU_NEWLY_IDLE) sd->nr_balance_failed++; - if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { + if (need_active_balance(&env)) { raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, @@ -4547,7 +6656,7 @@ redo: tsk_cpus_allowed(busiest->curr))) { raw_spin_unlock_irqrestore(&busiest->lock, flags); - lb_flags |= LBF_ALL_PINNED; + env.flags |= LBF_ALL_PINNED; goto out_one_pinned; } @@ -4563,10 +6672,11 @@ redo: } raw_spin_unlock_irqrestore(&busiest->lock, flags); - if (active_balance) + if (active_balance) { stop_one_cpu_nowait(cpu_of(busiest), active_load_balance_cpu_stop, busiest, &busiest->active_balance_work); + } /* * We've kicked active balancing, reset the failure @@ -4600,7 +6710,7 @@ out_balanced: out_one_pinned: /* tune up the balancing interval */ - if (((lb_flags & LBF_ALL_PINNED) && + if (((env.flags & LBF_ALL_PINNED) && sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; @@ -4610,60 +6720,135 @@ out: return ld_moved; } +static inline unsigned long +get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) +{ + unsigned long interval = sd->balance_interval; + + if (cpu_busy) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + interval = clamp(interval, 1UL, max_load_balance_interval); + + return interval; +} + +static inline void +update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) +{ + unsigned long interval, next; + + interval = get_sd_balance_interval(sd, cpu_busy); + next = sd->last_balance + interval; + + if (time_after(*next_balance, next)) + *next_balance = next; +} + /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -void idle_balance(int this_cpu, struct rq *this_rq) +static int idle_balance(struct rq *this_rq) { + unsigned long next_balance = jiffies + HZ; + int this_cpu = this_rq->cpu; struct sched_domain *sd; int pulled_task = 0; - unsigned long next_balance = jiffies + HZ; + u64 curr_cost = 0; - this_rq->idle_stamp = this_rq->clock; + idle_enter_fair(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost) - return; + /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. + */ + this_rq->idle_stamp = rq_clock(this_rq); + + if (this_rq->avg_idle < sysctl_sched_migration_cost) { + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(this_rq->sd); + if (sd) + update_next_balance(sd, 0, &next_balance); + rcu_read_unlock(); + + goto out; + } /* * Drop the rq->lock, but keep IRQ/preempt disabled. */ raw_spin_unlock(&this_rq->lock); - update_shares(this_cpu); + update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { - unsigned long interval; - int balance = 1; + int continue_balancing = 1; + u64 t0, domain_cost; if (!(sd->flags & SD_LOAD_BALANCE)) continue; + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { + update_next_balance(sd, 0, &next_balance); + break; + } + if (sd->flags & SD_BALANCE_NEWIDLE) { - /* If we've pulled tasks over stop searching: */ + t0 = sched_clock_cpu(this_cpu); + pulled_task = load_balance(this_cpu, this_rq, - sd, CPU_NEWLY_IDLE, &balance); + sd, CPU_NEWLY_IDLE, + &continue_balancing); + + domain_cost = sched_clock_cpu(this_cpu) - t0; + if (domain_cost > sd->max_newidle_lb_cost) + sd->max_newidle_lb_cost = domain_cost; + + curr_cost += domain_cost; } - interval = msecs_to_jiffies(sd->balance_interval); - if (time_after(next_balance, sd->last_balance + interval)) - next_balance = sd->last_balance + interval; - if (pulled_task) { - this_rq->idle_stamp = 0; + update_next_balance(sd, 0, &next_balance); + + /* + * Stop searching for tasks to pull if there are + * now runnable tasks on this rq. + */ + if (pulled_task || this_rq->nr_running > 0) break; - } } rcu_read_unlock(); raw_spin_lock(&this_rq->lock); - if (pulled_task || time_after(jiffies, this_rq->next_balance)) { - /* - * We are going idle. next_balance may be set based on - * a busy processor. So reset next_balance. - */ + if (curr_cost > this_rq->max_idle_balance_cost) + this_rq->max_idle_balance_cost = curr_cost; + + /* + * While browsing the domains, we released the rq lock, a task could + * have been enqueued in the meantime. Since we're not going idle, + * pretend we pulled a task. + */ + if (this_rq->cfs.h_nr_running && !pulled_task) + pulled_task = 1; + +out: + /* Move the next balance forward */ + if (time_after(this_rq->next_balance, next_balance)) this_rq->next_balance = next_balance; + + /* Is there a task of a high priority class? */ + if (this_rq->nr_running != this_rq->cfs.h_nr_running) + pulled_task = -1; + + if (pulled_task) { + idle_exit_fair(this_rq); + this_rq->idle_stamp = 0; } + + return pulled_task; } /* @@ -4710,10 +6895,18 @@ static int active_load_balance_cpu_stop(void *data) } if (likely(sd)) { + struct lb_env env = { + .sd = sd, + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, + .idle = CPU_IDLE, + }; + schedstat_inc(sd, alb_count); - if (move_one_task(target_rq, target_cpu, busiest_rq, - sd, CPU_IDLE)) + if (move_one_task(&env)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); @@ -4726,7 +6919,12 @@ out_unlock: return 0; } -#ifdef CONFIG_NO_HZ +static inline int on_null_domain(struct rq *rq) +{ + return unlikely(!rcu_dereference_sched(rq->sd)); +} + +#ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details * - When one of the busy CPUs notice that there may be an idle rebalancing @@ -4739,117 +6937,28 @@ static struct { unsigned long next_balance; /* in jiffy units */ } nohz ____cacheline_aligned; -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * lowest_flag_domain - Return lowest sched_domain containing flag. - * @cpu: The cpu whose lowest level of sched domain is to - * be returned. - * @flag: The flag to check for the lowest sched_domain - * for the given cpu. - * - * Returns the lowest sched_domain of a cpu which contains the given flag. - */ -static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) -{ - struct sched_domain *sd; - - for_each_domain(cpu, sd) - if (sd->flags & flag) - break; - - return sd; -} - -/** - * for_each_flag_domain - Iterates over sched_domains containing the flag. - * @cpu: The cpu whose domains we're iterating over. - * @sd: variable holding the value of the power_savings_sd - * for cpu. - * @flag: The flag to filter the sched_domains to be iterated. - * - * Iterates over all the scheduler domains for a given cpu that has the 'flag' - * set, starting from the lowest sched_domain to the highest. - */ -#define for_each_flag_domain(cpu, sd, flag) \ - for (sd = lowest_flag_domain(cpu, flag); \ - (sd && (sd->flags & flag)); sd = sd->parent) - -/** - * find_new_ilb - Finds the optimum idle load balancer for nomination. - * @cpu: The cpu which is nominating a new idle_load_balancer. - * - * Returns: Returns the id of the idle load balancer if it exists, - * Else, returns >= nr_cpu_ids. - * - * This algorithm picks the idle load balancer such that it belongs to a - * semi-idle powersavings sched_domain. The idea is to try and avoid - * completely idle packages/cores just for the purpose of idle load balancing - * when there are other idle cpu's which are better suited for that job. - */ -static int find_new_ilb(int cpu) +static inline int find_new_ilb(void) { int ilb = cpumask_first(nohz.idle_cpus_mask); - struct sched_group *ilbg; - struct sched_domain *sd; - - /* - * Have idle load balancer selection from semi-idle packages only - * when power-aware load balancing is enabled - */ - if (!(sched_smt_power_savings || sched_mc_power_savings)) - goto out_done; - - /* - * Optimize for the case when we have no idle CPUs or only one - * idle CPU. Don't walk the sched_domain hierarchy in such cases - */ - if (cpumask_weight(nohz.idle_cpus_mask) < 2) - goto out_done; - - rcu_read_lock(); - for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { - ilbg = sd->groups; - - do { - if (ilbg->group_weight != - atomic_read(&ilbg->sgp->nr_busy_cpus)) { - ilb = cpumask_first_and(nohz.idle_cpus_mask, - sched_group_cpus(ilbg)); - goto unlock; - } - - ilbg = ilbg->next; - } while (ilbg != sd->groups); - } -unlock: - rcu_read_unlock(); - -out_done: if (ilb < nr_cpu_ids && idle_cpu(ilb)) return ilb; return nr_cpu_ids; } -#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ -static inline int find_new_ilb(int call_cpu) -{ - return nr_cpu_ids; -} -#endif /* * Kick a CPU to do the nohz balancing, if it is time for it. We pick the * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle * CPU (if there is one). */ -static void nohz_balancer_kick(int cpu) +static void nohz_balancer_kick(void) { int ilb_cpu; nohz.next_balance++; - ilb_cpu = find_new_ilb(cpu); + ilb_cpu = find_new_ilb(); if (ilb_cpu >= nr_cpu_ids) return; @@ -4866,11 +6975,16 @@ static void nohz_balancer_kick(int cpu) return; } -static inline void clear_nohz_tick_stopped(int cpu) +static inline void nohz_balance_exit_idle(int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); + /* + * Completely isolated CPUs don't ever set, so we must test. + */ + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + } clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } } @@ -4880,13 +6994,15 @@ static inline void set_cpu_sd_state_busy(void) struct sched_domain *sd; int cpu = smp_processor_id(); - if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) - return; - clear_bit(NOHZ_IDLE, nohz_flags(cpu)); - rcu_read_lock(); - for_each_domain(cpu, sd) - atomic_inc(&sd->groups->sgp->nr_busy_cpus); + sd = rcu_dereference(per_cpu(sd_busy, cpu)); + + if (!sd || !sd->nohz_idle) + goto unlock; + sd->nohz_idle = 0; + + atomic_inc(&sd->groups->sgc->nr_busy_cpus); +unlock: rcu_read_unlock(); } @@ -4895,47 +7011,50 @@ void set_cpu_sd_state_idle(void) struct sched_domain *sd; int cpu = smp_processor_id(); - if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) - return; - set_bit(NOHZ_IDLE, nohz_flags(cpu)); - rcu_read_lock(); - for_each_domain(cpu, sd) - atomic_dec(&sd->groups->sgp->nr_busy_cpus); + sd = rcu_dereference(per_cpu(sd_busy, cpu)); + + if (!sd || sd->nohz_idle) + goto unlock; + sd->nohz_idle = 1; + + atomic_dec(&sd->groups->sgc->nr_busy_cpus); +unlock: rcu_read_unlock(); } /* - * This routine will record that this cpu is going idle with tick stopped. + * This routine will record that the cpu is going idle with tick stopped. * This info will be used in performing idle load balancing in the future. */ -void select_nohz_load_balancer(int stop_tick) +void nohz_balance_enter_idle(int cpu) { - int cpu = smp_processor_id(); - /* * If this cpu is going down, then nothing needs to be done. */ if (!cpu_active(cpu)) return; - if (stop_tick) { - if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) - return; + if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) + return; - cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - atomic_inc(&nohz.nr_cpus); - set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); - } - return; + /* + * If we're a completely isolated CPU, we don't play. + */ + if (on_null_domain(cpu_rq(cpu))) + return; + + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); + atomic_inc(&nohz.nr_cpus); + set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } -static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, +static int sched_ilb_notifier(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { case CPU_DYING: - clear_nohz_tick_stopped(smp_processor_id()); + nohz_balance_exit_idle(smp_processor_id()); return NOTIFY_OK; default: return NOTIFY_DONE; @@ -4945,8 +7064,6 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, static DEFINE_SPINLOCK(balancing); -static unsigned long __read_mostly max_load_balance_interval = HZ/10; - /* * Scale the max load_balance interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. @@ -4960,50 +7077,69 @@ void update_max_interval(void) * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * - * Balancing parameters are set up in arch_init_sched_domains. + * Balancing parameters are set up in init_sched_domains. */ -static void rebalance_domains(int cpu, enum cpu_idle_type idle) +static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) { - int balance = 1; - struct rq *rq = cpu_rq(cpu); + int continue_balancing = 1; + int cpu = rq->cpu; unsigned long interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; - int need_serialize; + int need_serialize, need_decay = 0; + u64 max_cost = 0; - update_shares(cpu); + update_blocked_averages(cpu); rcu_read_lock(); for_each_domain(cpu, sd) { + /* + * Decay the newidle max times here because this is a regular + * visit to all the domains. Decay ~1% per second. + */ + if (time_after(jiffies, sd->next_decay_max_lb_cost)) { + sd->max_newidle_lb_cost = + (sd->max_newidle_lb_cost * 253) / 256; + sd->next_decay_max_lb_cost = jiffies + HZ; + need_decay = 1; + } + max_cost += sd->max_newidle_lb_cost; + if (!(sd->flags & SD_LOAD_BALANCE)) continue; - interval = sd->balance_interval; - if (idle != CPU_IDLE) - interval *= sd->busy_factor; + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. + */ + if (!continue_balancing) { + if (need_decay) + continue; + break; + } - /* scale ms to jiffies */ - interval = msecs_to_jiffies(interval); - interval = clamp(interval, 1UL, max_load_balance_interval); + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); need_serialize = sd->flags & SD_SERIALIZE; - if (need_serialize) { if (!spin_trylock(&balancing)) goto out; } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance)) { + if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { /* - * We've pulled tasks over so either we're no - * longer idle. + * The LBF_DST_PINNED logic could have changed + * env->dst_cpu, so we can't know our idle + * state even if we migrated tasks. Update it. */ - idle = CPU_NOT_IDLE; + idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; } sd->last_balance = jiffies; + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); } if (need_serialize) spin_unlock(&balancing); @@ -5012,14 +7148,14 @@ out: next_balance = sd->last_balance + interval; update_next_balance = 1; } - + } + if (need_decay) { /* - * Stop the load balance at this level. There is another - * CPU in our sched group which is doing load balancing more - * actively. + * Ensure the rq-wide value also decays but keep it at a + * reasonable floor to avoid funnies with rq->avg_idle. */ - if (!balance) - break; + rq->max_idle_balance_cost = + max((u64)sysctl_sched_migration_cost, max_cost); } rcu_read_unlock(); @@ -5032,14 +7168,14 @@ out: rq->next_balance = next_balance; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* - * In CONFIG_NO_HZ case, the idle balance kickee will do the + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. */ -static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) +static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { - struct rq *this_rq = cpu_rq(this_cpu); + int this_cpu = this_rq->cpu; struct rq *rq; int balance_cpu; @@ -5059,14 +7195,20 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) if (need_resched()) break; - raw_spin_lock_irq(&this_rq->lock); - update_rq_clock(this_rq); - update_cpu_load(this_rq); - raw_spin_unlock_irq(&this_rq->lock); + rq = cpu_rq(balance_cpu); - rebalance_domains(balance_cpu, CPU_IDLE); + /* + * If time for next balance is due, + * do the balance. + */ + if (time_after_eq(jiffies, rq->next_balance)) { + raw_spin_lock_irq(&rq->lock); + update_rq_clock(rq); + update_idle_cpu_load(rq); + raw_spin_unlock_irq(&rq->lock); + rebalance_domains(rq, CPU_IDLE); + } - rq = cpu_rq(balance_cpu); if (time_after(this_rq->next_balance, rq->next_balance)) this_rq->next_balance = rq->next_balance; } @@ -5080,16 +7222,18 @@ end: * of an idle cpu is the system. * - This rq has more than one task. * - At any scheduler domain level, this cpu's scheduler group has multiple - * busy cpu's exceeding the group's power. + * busy cpu's exceeding the group's capacity. * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline int nohz_kick_needed(struct rq *rq, int cpu) +static inline int nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; + struct sched_group_capacity *sgc; + int nr_busy, cpu = rq->cpu; - if (unlikely(idle_cpu(cpu))) + if (unlikely(rq->idle_balance)) return 0; /* @@ -5097,7 +7241,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) * busy tick after returning from idle, we will update the busy stats. */ set_cpu_sd_state_busy(); - clear_nohz_tick_stopped(cpu); + nohz_balance_exit_idle(cpu); /* * None are in tickless mode and hence no need for NOHZ idle load @@ -5113,22 +7257,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) goto need_kick; rcu_read_lock(); - for_each_domain(cpu, sd) { - struct sched_group *sg = sd->groups; - struct sched_group_power *sgp = sg->sgp; - int nr_busy = atomic_read(&sgp->nr_busy_cpus); + sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) - goto need_kick_unlock; + if (sd) { + sgc = sd->groups->sgc; + nr_busy = atomic_read(&sgc->nr_busy_cpus); - if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight - && (cpumask_first_and(nohz.idle_cpus_mask, - sched_domain_span(sd)) < cpu)) + if (nr_busy > 1) goto need_kick_unlock; - - if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) - break; } + + sd = rcu_dereference(per_cpu(sd_asym, cpu)); + + if (sd && (cpumask_first_and(nohz.idle_cpus_mask, + sched_domain_span(sd)) < cpu)) + goto need_kick_unlock; + rcu_read_unlock(); return 0; @@ -5138,7 +7282,7 @@ need_kick: return 1; } #else -static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } +static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } #endif /* @@ -5147,38 +7291,34 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } */ static void run_rebalance_domains(struct softirq_action *h) { - int this_cpu = smp_processor_id(); - struct rq *this_rq = cpu_rq(this_cpu); + struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; - rebalance_domains(this_cpu, idle); + rebalance_domains(this_rq, idle); /* * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are * stopped. */ - nohz_idle_balance(this_cpu, idle); -} - -static inline int on_null_domain(int cpu) -{ - return !rcu_dereference_sched(cpu_rq(cpu)->sd); + nohz_idle_balance(this_rq, idle); } /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ -void trigger_load_balance(struct rq *rq, int cpu) +void trigger_load_balance(struct rq *rq) { /* Don't need to rebalance while attached to NULL domain */ - if (time_after_eq(jiffies, rq->next_balance) && - likely(!on_null_domain(cpu))) + if (unlikely(on_null_domain(rq))) + return; + + if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); -#ifdef CONFIG_NO_HZ - if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) - nohz_balancer_kick(cpu); +#ifdef CONFIG_NO_HZ_COMMON + if (nohz_kick_needed(rq)) + nohz_balancer_kick(); #endif } @@ -5190,6 +7330,9 @@ static void rq_online_fair(struct rq *rq) static void rq_offline_fair(struct rq *rq) { update_sysctl(); + + /* Ensure any throttled groups are reachable by pick_next_task */ + unthrottle_offline_cfs_rqs(rq); } #endif /* CONFIG_SMP */ @@ -5206,6 +7349,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) cfs_rq = cfs_rq_of(se); entity_tick(cfs_rq, se, queued); } + + if (numabalancing_enabled) + task_tick_numa(rq, curr); + + update_rq_runnable_avg(rq, 1); } /* @@ -5228,11 +7376,15 @@ static void task_fork_fair(struct task_struct *p) cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - if (unlikely(task_cpu(p) != this_cpu)) { - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - } + /* + * Not only the cpu but also the task_group of the parent might have + * been changed after parent->se.parent,cfs_rq were copied to + * child->se.parent,cfs_rq. So call __set_task_cpu() to make those + * of child point to valid ones. + */ + rcu_read_lock(); + __set_task_cpu(p, this_cpu); + rcu_read_unlock(); update_curr(cfs_rq); @@ -5282,15 +7434,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when its + * Ensure the task's vruntime is normalized, so that when it's * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it was on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it was !on_rq, then only when + * If it's on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !on_rq, then only when * the task is sleeping will it still have non-normalized vruntime. */ - if (!se->on_rq && p->state != TASK_RUNNING) { + if (!p->on_rq && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. @@ -5298,6 +7450,18 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) place_entity(cfs_rq, se, 0); se->vruntime -= cfs_rq->min_vruntime; } + +#ifdef CONFIG_SMP + /* + * Remove our load from contribution when we leave sched_fair + * and ensure we don't carry in an old decay_count if we + * switch back. + */ + if (se->avg.decay_count) { + __synchronize_entity_decay(se); + subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); + } +#endif } /* @@ -5305,7 +7469,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { - if (!p->se.on_rq) + struct sched_entity *se = &p->se; +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Since the real-depth could have been changed (only FAIR + * class maintain depth value), reset depth properly. + */ + se->depth = se->parent ? se->parent->depth + 1 : 0; +#endif + if (!se->on_rq) return; /* @@ -5340,16 +7512,22 @@ static void set_curr_task_fair(struct rq *rq) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT; - INIT_LIST_HEAD(&cfs_rq->tasks); cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif +#ifdef CONFIG_SMP + atomic64_set(&cfs_rq->decay_counter, 1); + atomic_long_set(&cfs_rq->removed_load, 0); +#endif } #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int on_rq) { + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq; + /* * If the task was not on the rq at the time of this cgroup movement * it must have been asleep, sleeping tasks keep their ->vruntime @@ -5375,14 +7553,26 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * To prevent boost or penalty in the new cfs_rq caused by delta * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. */ - if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) + if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) on_rq = 1; if (!on_rq) - p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; + se->vruntime -= cfs_rq_of(se)->min_vruntime; set_task_rq(p, task_cpu(p)); - if (!on_rq) - p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; + se->depth = se->parent ? se->parent->depth + 1 : 0; + if (!on_rq) { + cfs_rq = cfs_rq_of(se); + se->vruntime += cfs_rq->min_vruntime; +#ifdef CONFIG_SMP + /* + * migrate_task_rq_fair() will have removed our previous + * contribution, but we must synchronize for ongoing future + * decay. + */ + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; +#endif + } } void free_fair_sched_group(struct task_group *tg) @@ -5467,10 +7657,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, cfs_rq->tg = tg; cfs_rq->rq = rq; -#ifdef CONFIG_SMP - /* allow initial update_cfs_load() to truncate */ - cfs_rq->load_stamp = 1; -#endif init_cfs_rq_runtime(cfs_rq); tg->cfs_rq[cpu] = cfs_rq; @@ -5480,13 +7666,17 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, if (!se) return; - if (!parent) + if (!parent) { se->cfs_rq = &rq->cfs; - else + se->depth = 0; + } else { se->cfs_rq = parent->my_q; + se->depth = parent->depth + 1; + } se->my_q = cfs_rq; - update_load_set(&se->load, 0); + /* guarantee group entities always have weight */ + update_load_set(&se->load, NICE_0_LOAD); se->parent = parent; } @@ -5517,6 +7707,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) se = tg->se[i]; /* Propagate contribution to hierarchy */ raw_spin_lock_irqsave(&rq->lock, flags); + + /* Possible calls to update_curr() need rq clock */ + update_rq_clock(rq); for_each_sched_entity(se) update_cfs_shares(group_cfs_rq(se)); raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -5550,7 +7743,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * idle runqueue: */ if (rq->cfs.load.weight) - rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); + rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); return rr_interval; } @@ -5572,6 +7765,7 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, + .migrate_task_rq = migrate_task_rq_fair, .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, @@ -5611,7 +7805,8 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_SMP open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON + nohz.next_balance = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); cpu_notifier(sched_ilb_notifier, 0); #endif diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e61fd73913d..90284d117fe 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -12,14 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) SCHED_FEAT(START_DEBIT, true) /* - * Based on load and program behaviour, see if it makes sense to place - * a newly woken task on the same cpu as the task that woke it -- - * improve cache locality. Typically used with SYNC wakeups as - * generated by pipes and the like, see also SYNC_WAKEUPS. - */ -SCHED_FEAT(AFFINE_WAKEUPS, true) - -/* * Prefer to schedule the task we woke last (assuming it failed * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. @@ -40,25 +32,23 @@ SCHED_FEAT(LAST_BUDDY, true) SCHED_FEAT(CACHE_HOT_BUDDY, true) /* - * Use arch dependent cpu power functions + * Allow wakeup-time preemption of the current task: */ -SCHED_FEAT(ARCH_POWER, false) +SCHED_FEAT(WAKEUP_PREEMPTION, true) + +/* + * Use arch dependent cpu capacity functions + */ +SCHED_FEAT(ARCH_CAPACITY, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) /* - * Spin-wait on mutex acquisition when the mutex owner is running on - * another cpu -- assumes that when the owner is running, it will soon - * release the lock. Decreases scheduling overhead. - */ -SCHED_FEAT(OWNER_SPIN, true) - -/* - * Decrement CPU power based on time not spent running tasks + * Decrement CPU capacity based on time not spent running tasks */ -SCHED_FEAT(NONTASK_POWER, true) +SCHED_FEAT(NONTASK_CAPACITY, true) /* * Queue remote wakeups on the target CPU and process them @@ -68,3 +58,28 @@ SCHED_FEAT(TTWU_QUEUE, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) +SCHED_FEAT(LB_MIN, false) + +/* + * Apply the automatic NUMA scheduling policy. Enabled automatically + * at runtime if running on a NUMA machine. Can be controlled via + * numa_balancing= + */ +#ifdef CONFIG_NUMA_BALANCING +SCHED_FEAT(NUMA, false) + +/* + * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a + * higher number of hinting faults are recorded during active load + * balancing. + */ +SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) + +/* + * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a + * lower number of hinting faults have been recorded. As this has + * the potential to prevent a task ever migrating to a new node + * due to CPU overload it is disabled by default. + */ +SCHED_FEAT(NUMA_RESIST_LOWER, false) +#endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c new file mode 100644 index 00000000000..cf009fb0bc2 --- /dev/null +++ b/kernel/sched/idle.c @@ -0,0 +1,273 @@ +/* + * Generic entry point for the idle threads + */ +#include <linux/sched.h> +#include <linux/cpu.h> +#include <linux/cpuidle.h> +#include <linux/tick.h> +#include <linux/mm.h> +#include <linux/stackprotector.h> + +#include <asm/tlb.h> + +#include <trace/events/power.h> + +#include "sched.h" + +static int __read_mostly cpu_idle_force_poll; + +void cpu_idle_poll_ctrl(bool enable) +{ + if (enable) { + cpu_idle_force_poll++; + } else { + cpu_idle_force_poll--; + WARN_ON_ONCE(cpu_idle_force_poll < 0); + } +} + +#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP +static int __init cpu_idle_poll_setup(char *__unused) +{ + cpu_idle_force_poll = 1; + return 1; +} +__setup("nohlt", cpu_idle_poll_setup); + +static int __init cpu_idle_nopoll_setup(char *__unused) +{ + cpu_idle_force_poll = 0; + return 1; +} +__setup("hlt", cpu_idle_nopoll_setup); +#endif + +static inline int cpu_idle_poll(void) +{ + rcu_idle_enter(); + trace_cpu_idle_rcuidle(0, smp_processor_id()); + local_irq_enable(); + while (!tif_need_resched()) + cpu_relax(); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + rcu_idle_exit(); + return 1; +} + +/* Weak implementations for optional arch specific functions */ +void __weak arch_cpu_idle_prepare(void) { } +void __weak arch_cpu_idle_enter(void) { } +void __weak arch_cpu_idle_exit(void) { } +void __weak arch_cpu_idle_dead(void) { } +void __weak arch_cpu_idle(void) +{ + cpu_idle_force_poll = 1; + local_irq_enable(); +} + +/** + * cpuidle_idle_call - the main idle function + * + * NOTE: no locks or semaphores should be used here + * + * On archs that support TIF_POLLING_NRFLAG, is called with polling + * set, and it returns with polling set. If it ever stops polling, it + * must clear the polling bit. + */ +static void cpuidle_idle_call(void) +{ + struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + int next_state, entered_state; + bool broadcast; + + /* + * Check if the idle task must be rescheduled. If it is the + * case, exit the function after re-enabling the local irq. + */ + if (need_resched()) { + local_irq_enable(); + return; + } + + /* + * During the idle period, stop measuring the disabled irqs + * critical sections latencies + */ + stop_critical_timings(); + + /* + * Tell the RCU framework we are entering an idle section, + * so no more rcu read side critical sections and one more + * step to the grace period + */ + rcu_idle_enter(); + + /* + * Ask the cpuidle framework to choose a convenient idle state. + * Fall back to the default arch idle method on errors. + */ + next_state = cpuidle_select(drv, dev); + if (next_state < 0) { +use_default: + /* + * We can't use the cpuidle framework, let's use the default + * idle routine. + */ + if (current_clr_polling_and_test()) + local_irq_enable(); + else + arch_cpu_idle(); + + goto exit_idle; + } + + + /* + * The idle task must be scheduled, it is pointless to + * go to idle, just update no idle residency and get + * out of this function + */ + if (current_clr_polling_and_test()) { + dev->last_residency = 0; + entered_state = next_state; + local_irq_enable(); + goto exit_idle; + } + + broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); + + /* + * Tell the time framework to switch to a broadcast timer + * because our local timer will be shutdown. If a local timer + * is used from another cpu as a broadcast timer, this call may + * fail if it is not available + */ + if (broadcast && + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) + goto use_default; + + trace_cpu_idle_rcuidle(next_state, dev->cpu); + + /* + * Enter the idle state previously returned by the governor decision. + * This function will block until an interrupt occurs and will take + * care of re-enabling the local interrupts + */ + entered_state = cpuidle_enter(drv, dev, next_state); + + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); + + if (broadcast) + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); + + /* + * Give the governor an opportunity to reflect on the outcome + */ + cpuidle_reflect(dev, entered_state); + +exit_idle: + __current_set_polling(); + + /* + * It is up to the idle functions to reenable local interrupts + */ + if (WARN_ON_ONCE(irqs_disabled())) + local_irq_enable(); + + rcu_idle_exit(); + start_critical_timings(); +} + +/* + * Generic idle loop implementation + * + * Called with polling cleared. + */ +static void cpu_idle_loop(void) +{ + while (1) { + /* + * If the arch has a polling bit, we maintain an invariant: + * + * Our polling bit is clear if we're not scheduled (i.e. if + * rq->curr != rq->idle). This means that, if rq->idle has + * the polling bit set, then setting need_resched is + * guaranteed to cause the cpu to reschedule. + */ + + __current_set_polling(); + tick_nohz_idle_enter(); + + while (!need_resched()) { + check_pgt_cache(); + rmb(); + + if (cpu_is_offline(smp_processor_id())) + arch_cpu_idle_dead(); + + local_irq_disable(); + arch_cpu_idle_enter(); + + /* + * In poll mode we reenable interrupts and spin. + * + * Also if we detected in the wakeup from idle + * path that the tick broadcast device expired + * for us, we don't want to go deep idle as we + * know that the IPI is going to arrive right + * away + */ + if (cpu_idle_force_poll || tick_check_broadcast_expired()) + cpu_idle_poll(); + else + cpuidle_idle_call(); + + arch_cpu_idle_exit(); + } + + /* + * Since we fell out of the loop above, we know + * TIF_NEED_RESCHED must be set, propagate it into + * PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will + * not have had an IPI to fold the state for us. + */ + preempt_set_need_resched(); + tick_nohz_idle_exit(); + __current_clr_polling(); + + /* + * We promise to call sched_ttwu_pending and reschedule + * if need_resched is set while polling is set. That + * means that clearing polling needs to be visible + * before doing these things. + */ + smp_mb__after_atomic(); + + sched_ttwu_pending(); + schedule_preempt_disabled(); + } +} + +void cpu_startup_entry(enum cpuhp_state state) +{ + /* + * This #ifdef needs to die, but it's too late in the cycle to + * make this generic (arm and sh have never invoked the canary + * init for the non boot cpus!). Will be fixed in 3.11 + */ +#ifdef CONFIG_X86 + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. The boot CPU already has it initialized but no harm + * in doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); +#endif + arch_cpu_idle_prepare(); + cpu_idle_loop(); +} diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 91b4c957f28..879f2b75266 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -4,16 +4,17 @@ * idle-task scheduling class. * * (NOTE: these are not related to SCHED_IDLE tasks which are - * handled in sched_fair.c) + * handled in sched/fair.c) */ #ifdef CONFIG_SMP static int -select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } #endif /* CONFIG_SMP */ + /* * Idle tasks are unconditionally rescheduled: */ @@ -22,10 +23,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_task(rq->idle); } -static struct task_struct *pick_next_task_idle(struct rq *rq) +static struct task_struct * +pick_next_task_idle(struct rq *rq, struct task_struct *prev) { + put_prev_task(rq, prev); + schedstat_inc(rq, sched_goidle); - calc_load_account_idle(rq); return rq->idle; } @@ -44,6 +47,8 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { + idle_exit_fair(rq); + rq_last_tick_reset(rq); } static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c new file mode 100644 index 00000000000..16f5a30f9c8 --- /dev/null +++ b/kernel/sched/proc.c @@ -0,0 +1,591 @@ +/* + * kernel/sched/proc.c + * + * Kernel load calculations, forked from sched/core.c + */ + +#include <linux/export.h> + +#include "sched.h" + +unsigned long this_cpu_load(void) +{ + struct rq *this = this_rq(); + return this->cpu_load[0]; +} + + +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + * nr_active = 0; + * for_each_possible_cpu(cpu) + * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + * - for_each_possible_cpu() is prohibitively expensive on machines with + * serious number of cpus, therefore we need to take a distributed approach + * to calculating nr_active. + * + * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + * So assuming nr_active := 0 when we start out -- true per definition, we + * can simply take per-cpu deltas and fold those into a global accumulate + * to obtain the same result. See calc_load_fold_active(). + * + * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * across the machine, we assume 10 ticks is sufficient time for every + * cpu to have completed this task. + * + * This places an upper-bound on the IRQ-off latency of the machine. Then + * again, being late doesn't loose the delta, just wrecks the sample. + * + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + * this would add another cross-cpu cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever cpu the task ran + * when it went into uninterruptible state and decrement on whatever cpu + * did the wakeup. This means that only the sum of nr_uninterruptible over + * all cpus yields the correct result. + * + * This covers the NO_HZ=n code, for extra head-aches, see the comment below. + */ + +/* Variables and functions for calc_load */ +atomic_long_t calc_load_tasks; +unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} + +long calc_load_fold_active(struct rq *this_rq) +{ + long nr_active, delta = 0; + + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + } + + return delta; +} + +/* + * a1 = a0 * e + a * (1 - e) + */ +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + load *= exp; + load += active * (FIXED_1 - exp); + load += 1UL << (FSHIFT - 1); + return load >> FSHIFT; +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + * - When we go NO_HZ idle during the window, we can negate our sample + * contribution, causing under-accounting. + * + * We avoid this by keeping two idle-delta counters and flipping them + * when the window starts, thus separating old and new NO_HZ load. + * + * The only trick is the slight shift in index flip for read vs write. + * + * 0s 5s 10s 15s + * +10 +10 +10 +10 + * |-|-----------|-|-----------|-|-----------|-| + * r:0 0 1 1 0 0 1 1 0 + * w:0 1 1 0 0 1 1 0 0 + * + * This ensures we'll fold the old idle contribution in this window while + * accumlating the new one. + * + * - When we wake up from NO_HZ idle during the window, we push up our + * contribution, since we effectively move our sample point to a known + * busy state. + * + * This is solved by pushing the window forward, and thus skipping the + * sample, for this cpu (effectively using the idle-delta for this cpu which + * was in effect at the time the window opened). This also solves the issue + * of having to deal with a cpu having been in NOHZ idle for multiple + * LOAD_FREQ intervals. + * + * When making the ILB scale, we should try to pull this in as well. + */ +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; + +static inline int calc_load_write_idx(void) +{ + int idx = calc_load_idx; + + /* + * See calc_global_nohz(), if we observe the new index, we also + * need to observe the new update time. + */ + smp_rmb(); + + /* + * If the folding window started, make sure we start writing in the + * next idle-delta. + */ + if (!time_before(jiffies, calc_load_update)) + idx++; + + return idx & 1; +} + +static inline int calc_load_read_idx(void) +{ + return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ + struct rq *this_rq = this_rq(); + long delta; + + /* + * We're going into NOHZ mode, if there's any pending delta, fold it + * into the pending idle delta. + */ + delta = calc_load_fold_active(this_rq); + if (delta) { + int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); + } +} + +void calc_load_exit_idle(void) +{ + struct rq *this_rq = this_rq(); + + /* + * If we're still before the sample window, we're done. + */ + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + /* + * We woke inside or after the sample window, this means we're already + * accounted through the nohz accounting, so skip the entire deal and + * sync up for the next window. + */ + this_rq->calc_load_update = calc_load_update; + if (time_before(jiffies, this_rq->calc_load_update + 10)) + this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ + int idx = calc_load_read_idx(); + long delta = 0; + + if (atomic_long_read(&calc_load_idle[idx])) + delta = atomic_long_xchg(&calc_load_idle[idx], 0); + + return delta; +} + +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. + */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + + return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +static unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + +/* + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary. + * + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed. + */ +static void calc_global_nohz(void) +{ + long delta, active, n; + + if (!time_before(jiffies, calc_load_update + 10)) { + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + + calc_load_update += n * LOAD_FREQ; + } + + /* + * Flip the idle index... + * + * Make sure we first write the new time then flip the index, so that + * calc_load_write_idx() will see the new time when it reads the new + * index, this avoids a double flip messing things up. + */ + smp_wmb(); + calc_load_idx++; +} +#else /* !CONFIG_NO_HZ_COMMON */ + +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } + +#endif /* CONFIG_NO_HZ_COMMON */ + +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(unsigned long ticks) +{ + long active, delta; + + if (time_before(jiffies, calc_load_update + 10)) + return; + + /* + * Fold the 'old' idle-delta to include all NO_HZ cpus. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; + + /* + * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. + */ + calc_global_nohz(); +} + +/* + * Called from update_cpu_load() to periodically update this CPU's + * active count. + */ +static void calc_load_account_active(struct rq *this_rq) +{ + long delta; + + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + delta = calc_load_fold_active(this_rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + this_rq->calc_load_update += LOAD_FREQ; +} + +/* + * End of global load-average stuff + */ + +/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. + */ +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) +{ + int i, scale; + + this_rq->nr_load_updates++; + + /* Update our load: */ + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; + } + + sched_avg_update(this_rq); +} + +#ifdef CONFIG_SMP +static inline unsigned long get_rq_runnable_load(struct rq *rq) +{ + return rq->cfs.runnable_load_avg; +} +#else +static inline unsigned long get_rq_runnable_load(struct rq *rq) +{ + return rq->load.weight; +} +#endif + +#ifdef CONFIG_NO_HZ_COMMON +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + +/* + * Called from nohz_idle_balance() to update the load ratings before doing the + * idle balance. + */ +void update_idle_cpu_load(struct rq *this_rq) +{ + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long load = get_rq_runnable_load(this_rq); + unsigned long pending_updates; + + /* + * bail if there's load or we're actually up-to-date. + */ + if (load || curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + + __update_cpu_load(this_rq, load, pending_updates); +} + +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + +/* + * Called from scheduler_tick() + */ +void update_cpu_load_active(struct rq *this_rq) +{ + unsigned long load = get_rq_runnable_load(this_rq); + /* + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + */ + this_rq->last_load_update_tick = jiffies; + __update_cpu_load(this_rq, load, 1); + + calc_load_account_active(this_rq); +} diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f42ae7fb5ec..a49083192c6 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -7,6 +7,8 @@ #include <linux/slab.h> +int sched_rr_timeslice = RR_TIMESLICE; + static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); struct rt_bandwidth def_rt_bandwidth; @@ -77,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); #endif + /* We start is dequeued state, because no RT tasks are queued */ + rt_rq->rt_queued = 0; rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; @@ -110,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) return rt_se->rt_rq; } +static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) +{ + struct rt_rq *rt_rq = rt_se->rt_rq; + + return rt_rq->rq; +} + void free_rt_sched_group(struct task_group *tg) { int i; @@ -209,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) return container_of(rt_rq, struct rq, rt); } -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) { struct task_struct *p = rt_task_of(rt_se); - struct rq *rq = task_rq(p); + + return task_rq(p); +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ + struct rq *rq = rq_of_rt_se(rt_se); return &rq->rt; } @@ -227,6 +244,14 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) #ifdef CONFIG_SMP +static int pull_rt_task(struct rq *this_rq); + +static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) +{ + /* Try to pull RT tasks here if we lower this rq's prio */ + return rq->rt.highest_prio.curr > prev->prio; +} + static inline int rt_overloaded(struct rq *rq) { return atomic_read(&rq->rd->rto_count); @@ -244,8 +269,10 @@ static inline void rt_set_overload(struct rq *rq) * if we should look at the mask. It would be a shame * if we looked at the mask, but the mask was not * updated yet. + * + * Matched by the barrier in pull_rt_task(). */ - wmb(); + smp_wmb(); atomic_inc(&rq->rd->rto_count); } @@ -274,13 +301,16 @@ static void update_rt_migration(struct rt_rq *rt_rq) static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + struct task_struct *p; + if (!rt_entity_is_task(rt_se)) return; + p = rt_task_of(rt_se); rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total++; - if (rt_se->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory++; update_rt_migration(rt_rq); @@ -288,13 +318,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + struct task_struct *p; + if (!rt_entity_is_task(rt_se)) return; + p = rt_task_of(rt_se); rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total--; - if (rt_se->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory--; update_rt_migration(rt_rq); @@ -305,6 +338,15 @@ static inline int has_pushable_tasks(struct rq *rq) return !plist_head_empty(&rq->rt.pushable_tasks); } +static inline void set_post_schedule(struct rq *rq) +{ + /* + * We detect this state here so that we can avoid taking the RQ + * lock again later if there is no need to push + */ + rq->post_schedule = has_pushable_tasks(rq); +} + static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) { plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); @@ -349,8 +391,24 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { } +static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) +{ + return false; +} + +static inline int pull_rt_task(struct rq *this_rq) +{ + return 0; +} + +static inline void set_post_schedule(struct rq *rq) +{ +} #endif /* CONFIG_SMP */ +static void enqueue_top_rt_rq(struct rt_rq *rt_rq); +static void dequeue_top_rt_rq(struct rt_rq *rt_rq); + static inline int on_rt_rq(struct sched_rt_entity *rt_se) { return !list_empty(&rt_se->run_list); @@ -391,20 +449,6 @@ static inline struct task_group *next_task_group(struct task_group *tg) (iter = next_task_group(iter)) && \ (rt_rq = iter->rt_rq[cpu_of(rq)]);) -static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) -{ - list_add_rcu(&rt_rq->leaf_rt_rq_list, - &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); -} - -static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) -{ - list_del_rcu(&rt_rq->leaf_rt_rq_list); -} - -#define for_each_leaf_rt_rq(rt_rq, rq) \ - list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) - #define for_each_sched_rt_entity(rt_se) \ for (; rt_se; rt_se = rt_se->parent) @@ -426,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; if (rt_rq->rt_nr_running) { - if (rt_se && !on_rt_rq(rt_se)) + if (!rt_se) + enqueue_top_rt_rq(rt_rq); + else if (!on_rt_rq(rt_se)) enqueue_rt_entity(rt_se, false); + if (rt_rq->highest_prio.curr < curr->prio) resched_task(curr); } @@ -440,7 +487,9 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; - if (rt_se && on_rt_rq(rt_se)) + if (!rt_se) + dequeue_top_rt_rq(rt_rq); + else if (on_rt_rq(rt_se)) dequeue_rt_entity(rt_se); } @@ -464,7 +513,7 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) #ifdef CONFIG_SMP static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_rq(smp_processor_id())->rd->span; + return this_rq()->rd->span; } #else static inline const struct cpumask *sched_rt_period_mask(void) @@ -501,17 +550,6 @@ typedef struct rt_rq *rt_rq_iter_t; #define for_each_rt_rq(rt_rq, iter, rq) \ for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) -static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) -{ -} - -static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) -{ -} - -#define for_each_leaf_rt_rq(rt_rq, rq) \ - for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) - #define for_each_sched_rt_entity(rt_se) \ for (; rt_se; rt_se = NULL) @@ -522,12 +560,18 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { - if (rt_rq->rt_nr_running) - resched_task(rq_of_rt_rq(rt_rq)->curr); + struct rq *rq = rq_of_rt_rq(rt_rq); + + if (!rt_rq->rt_nr_running) + return; + + enqueue_top_rt_rq(rt_rq); + resched_task(rq->curr); } static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { + dequeue_top_rt_rq(rt_rq); } static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -553,6 +597,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif /* CONFIG_RT_GROUP_SCHED */ +bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) +{ + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + return (hrtimer_active(&rt_b->rt_period_timer) || + rt_rq->rt_time < rt_b->rt_runtime); +} + #ifdef CONFIG_SMP /* * We ran out of runtime, see if we can borrow some from our neighbours. @@ -560,7 +612,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) static int do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); - struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; int i, weight, more = 0; u64 rt_period; @@ -685,20 +737,12 @@ balanced: * runtime - in which case borrowing doesn't make sense. */ rt_rq->rt_runtime = RUNTIME_INF; + rt_rq->rt_throttled = 0; raw_spin_unlock(&rt_rq->rt_runtime_lock); raw_spin_unlock(&rt_b->rt_runtime_lock); } } -static void disable_runtime(struct rq *rq) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - __disable_runtime(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - static void __enable_runtime(struct rq *rq) { rt_rq_iter_t iter; @@ -723,37 +767,6 @@ static void __enable_runtime(struct rq *rq) } } -static void enable_runtime(struct rq *rq) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - __enable_runtime(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - disable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - enable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - static int balance_runtime(struct rt_rq *rt_rq) { int more = 0; @@ -778,13 +791,23 @@ static inline int balance_runtime(struct rt_rq *rt_rq) static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { - int i, idle = 1; + int i, idle = 1, throttled = 0; const struct cpumask *span; - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return 1; - span = sched_rt_period_mask(); +#ifdef CONFIG_RT_GROUP_SCHED + /* + * FIXME: isolated CPUs should really leave the root task group, + * whether they are isolcpus or were isolated via cpusets, lest + * the timer run on a CPU which does not service all runqueues, + * potentially leaving other CPUs indefinitely throttled. If + * isolation is really required, the user will turn the throttle + * off to kill the perturbations it causes anyway. Meanwhile, + * this maintains functionality for boot and/or troubleshooting. + */ + if (rt_b == &root_task_group.rt_bandwidth) + span = cpu_online_mask; +#endif for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); @@ -818,12 +841,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (!rt_rq_throttled(rt_rq)) enqueue = 1; } + if (rt_rq->rt_throttled) + throttled = 1; if (enqueue) sched_rt_rq_enqueue(rt_rq); raw_spin_unlock(&rq->lock); } + if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) + return 1; + return idle; } @@ -855,8 +883,24 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; if (rt_rq->rt_time > runtime) { - rt_rq->rt_throttled = 1; - printk_once(KERN_WARNING "sched: RT throttling activated\n"); + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + /* + * Don't actually throttle groups that have no runtime assigned + * but accrue some time due to boosting. + */ + if (likely(rt_b->rt_runtime)) { + rt_rq->rt_throttled = 1; + printk_deferred_once("sched: RT throttling activated\n"); + } else { + /* + * In case we did anyway, make it go away, + * replenishment is a joke, since it will replenish us + * with exactly 0 ns. + */ + rt_rq->rt_time = 0; + } + if (rt_rq_throttled(rt_rq)) { sched_rt_rq_dequeue(rt_rq); return 1; @@ -874,22 +918,22 @@ static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_rt_entity *rt_se = &curr->rt; - struct rt_rq *rt_rq = rt_rq_of_se(rt_se); u64 delta_exec; if (curr->sched_class != &rt_sched_class) return; - delta_exec = rq->clock_task - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec = 0; + delta_exec = rq_clock_task(rq) - curr->se.exec_start; + if (unlikely((s64)delta_exec <= 0)) + return; - schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq->clock_task; + curr->se.exec_start = rq_clock_task(rq); cpuacct_charge(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); @@ -898,7 +942,7 @@ static void update_curr_rt(struct rq *rq) return; for_each_sched_rt_entity(rt_se) { - rt_rq = rt_rq_of_se(rt_se); + struct rt_rq *rt_rq = rt_rq_of_se(rt_se); if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { raw_spin_lock(&rt_rq->rt_runtime_lock); @@ -910,6 +954,38 @@ static void update_curr_rt(struct rq *rq) } } +static void +dequeue_top_rt_rq(struct rt_rq *rt_rq) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + BUG_ON(&rq->rt != rt_rq); + + if (!rt_rq->rt_queued) + return; + + BUG_ON(!rq->nr_running); + + sub_nr_running(rq, rt_rq->rt_nr_running); + rt_rq->rt_queued = 0; +} + +static void +enqueue_top_rt_rq(struct rt_rq *rt_rq) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + BUG_ON(&rq->rt != rt_rq); + + if (rt_rq->rt_queued) + return; + if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) + return; + + add_nr_running(rq, rt_rq->rt_nr_running); + rt_rq->rt_queued = 1; +} + #if defined CONFIG_SMP static void @@ -917,6 +993,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Change rq's cpupri only if rt_rq is the top queue. + */ + if (&rq->rt != rt_rq) + return; +#endif if (rq->online && prio < prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, prio); } @@ -926,6 +1009,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Change rq's cpupri only if rt_rq is the top queue. + */ + if (&rq->rt != rt_rq) + return; +#endif if (rq->online && rt_rq->highest_prio.curr != prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } @@ -1019,12 +1109,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} #endif /* CONFIG_RT_GROUP_SCHED */ static inline +unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) +{ + struct rt_rq *group_rq = group_rt_rq(rt_se); + + if (group_rq) + return group_rq->rt_nr_running; + else + return 1; +} + +static inline void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { int prio = rt_se_prio(rt_se); WARN_ON(!rt_prio(prio)); - rt_rq->rt_nr_running++; + rt_rq->rt_nr_running += rt_se_nr_running(rt_se); inc_rt_prio(rt_rq, prio); inc_rt_migration(rt_se, rt_rq); @@ -1036,7 +1137,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); - rt_rq->rt_nr_running--; + rt_rq->rt_nr_running -= rt_se_nr_running(rt_se); dec_rt_prio(rt_rq, rt_se_prio(rt_se)); dec_rt_migration(rt_se, rt_rq); @@ -1059,9 +1160,6 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) return; - if (!rt_rq->rt_nr_running) - list_add_leaf_rt_rq(rt_rq); - if (head) list_add(&rt_se->run_list, queue); else @@ -1081,8 +1179,6 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) __clear_bit(rt_se_prio(rt_se), array->bitmap); dec_rt_tasks(rt_se, rt_rq); - if (!rt_rq->rt_nr_running) - list_del_leaf_rt_rq(rt_rq); } /* @@ -1098,6 +1194,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) back = rt_se; } + dequeue_top_rt_rq(rt_rq_of_se(back)); + for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) __dequeue_rt_entity(rt_se); @@ -1106,13 +1204,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) { + struct rq *rq = rq_of_rt_se(rt_se); + dequeue_rt_stack(rt_se); for_each_sched_rt_entity(rt_se) __enqueue_rt_entity(rt_se, head); + enqueue_top_rt_rq(&rq->rt); } static void dequeue_rt_entity(struct sched_rt_entity *rt_se) { + struct rq *rq = rq_of_rt_se(rt_se); + dequeue_rt_stack(rt_se); for_each_sched_rt_entity(rt_se) { @@ -1121,6 +1224,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) if (rt_rq && rt_rq->rt_nr_running) __enqueue_rt_entity(rt_se, false); } + enqueue_top_rt_rq(&rq->rt); } /* @@ -1136,10 +1240,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); - if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); - - inc_nr_running(rq); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) @@ -1150,8 +1252,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) dequeue_rt_entity(rt_se); dequeue_pushable_task(rq, p); - - dec_nr_running(rq); } /* @@ -1192,15 +1292,12 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; struct rq *rq; - int cpu; - - cpu = task_cpu(p); - if (p->rt.nr_cpus_allowed == 1) + if (p->nr_cpus_allowed == 1) goto out; /* For anything but wake ups, just return the task_cpu */ @@ -1235,9 +1332,8 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) * will have to sort it out. */ if (curr && unlikely(rt_task(curr)) && - (curr->rt.nr_cpus_allowed < 2 || - curr->prio <= p->prio) && - (p->rt.nr_cpus_allowed > 1)) { + (curr->nr_cpus_allowed < 2 || + curr->prio <= p->prio)) { int target = find_lowest_rq(p); if (target != -1) @@ -1251,10 +1347,10 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - if (rq->curr->rt.nr_cpus_allowed == 1) + if (rq->curr->nr_cpus_allowed == 1) return; - if (p->rt.nr_cpus_allowed != 1 + if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; @@ -1321,15 +1417,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) { struct sched_rt_entity *rt_se; struct task_struct *p; - struct rt_rq *rt_rq; - - rt_rq = &rq->rt; - - if (!rt_rq->rt_nr_running) - return NULL; - - if (rt_rq_throttled(rt_rq)) - return NULL; + struct rt_rq *rt_rq = &rq->rt; do { rt_se = pick_next_rt_entity(rq, rt_rq); @@ -1338,26 +1426,48 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) } while (rt_rq); p = rt_task_of(rt_se); - p->se.exec_start = rq->clock_task; + p->se.exec_start = rq_clock_task(rq); return p; } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct * +pick_next_task_rt(struct rq *rq, struct task_struct *prev) { - struct task_struct *p = _pick_next_task_rt(rq); + struct task_struct *p; + struct rt_rq *rt_rq = &rq->rt; + + if (need_pull_rt_task(rq, prev)) { + pull_rt_task(rq); + /* + * pull_rt_task() can drop (and re-acquire) rq->lock; this + * means a dl or stop task can slip in, in which case we need + * to re-start task selection. + */ + if (unlikely((rq->stop && rq->stop->on_rq) || + rq->dl.dl_nr_running)) + return RETRY_TASK; + } + + /* + * We may dequeue prev's rt_rq in put_prev_task(). + * So, we update time before rt_nr_running check. + */ + if (prev->sched_class == &rt_sched_class) + update_curr_rt(rq); + + if (!rt_rq->rt_queued) + return NULL; + + put_prev_task(rq, prev); + + p = _pick_next_task_rt(rq); /* The running task is never eligible for pushing */ if (p) dequeue_pushable_task(rq, p); -#ifdef CONFIG_SMP - /* - * We detect this state here so that we can avoid taking the RQ - * lock again later if there is no need to push - */ - rq->post_schedule = has_pushable_tasks(rq); -#endif + set_post_schedule(rq); return p; } @@ -1370,7 +1480,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1382,48 +1492,29 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && - (p->rt.nr_cpus_allowed > 1)) + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) return 1; return 0; } -/* Return the second highest RT task, NULL otherwise */ -static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) +/* + * Return the highest pushable rq's task, which is suitable to be executed + * on the cpu, NULL otherwise + */ +static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) { - struct task_struct *next = NULL; - struct sched_rt_entity *rt_se; - struct rt_prio_array *array; - struct rt_rq *rt_rq; - int idx; - - for_each_leaf_rt_rq(rt_rq, rq) { - array = &rt_rq->active; - idx = sched_find_first_bit(array->bitmap); -next_idx: - if (idx >= MAX_RT_PRIO) - continue; - if (next && next->prio < idx) - continue; - list_for_each_entry(rt_se, array->queue + idx, run_list) { - struct task_struct *p; + struct plist_head *head = &rq->rt.pushable_tasks; + struct task_struct *p; - if (!rt_entity_is_task(rt_se)) - continue; + if (!has_pushable_tasks(rq)) + return NULL; - p = rt_task_of(rt_se); - if (pick_rt_task(rq, p, cpu)) { - next = p; - break; - } - } - if (!next) { - idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); - goto next_idx; - } + plist_for_each_entry(p, head, pushable_tasks) { + if (pick_rt_task(rq, p, cpu)) + return p; } - return next; + return NULL; } static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); @@ -1439,7 +1530,7 @@ static int find_lowest_rq(struct task_struct *task) if (unlikely(!lowest_mask)) return -1; - if (task->rt.nr_cpus_allowed == 1) + if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) @@ -1531,7 +1622,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) task_running(rq, task) || !task->on_rq)) { - raw_spin_unlock(&lowest_rq->lock); + double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; break; } @@ -1561,7 +1652,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->rt.nr_cpus_allowed <= 1); + BUG_ON(p->nr_cpus_allowed <= 1); BUG_ON(!p->on_rq); BUG_ON(!rt_task(p)); @@ -1587,11 +1678,6 @@ static int push_rt_task(struct rq *rq) if (!next_task) return 0; -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - if (unlikely(task_running(rq, next_task))) - return 0; -#endif - retry: if (unlikely(next_task == rq->curr)) { WARN_ON(1); @@ -1677,6 +1763,12 @@ static int pull_rt_task(struct rq *this_rq) if (likely(!rt_overloaded(this_rq))) return 0; + /* + * Match the barrier from rt_set_overloaded; this guarantees that if we + * see overloaded we must also see the rto_mask bit. + */ + smp_rmb(); + for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; @@ -1702,12 +1794,10 @@ static int pull_rt_task(struct rq *this_rq) double_lock_balance(this_rq, src_rq); /* - * Are there still pullable RT tasks? + * We can pull only a task, which is pushable + * on its rq, and no others. */ - if (src_rq->rt.rt_nr_running <= 1) - goto skip; - - p = pick_next_highest_task_rt(src_rq, this_cpu); + p = pick_highest_pushable_task(src_rq, this_cpu); /* * Do we have an RT task that preempts @@ -1747,13 +1837,6 @@ skip: return ret; } -static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) -{ - /* Try to pull RT tasks here if we lower this rq's prio */ - if (rq->rt.highest_prio.curr > prev->prio) - pull_rt_task(rq); -} - static void post_schedule_rt(struct rq *rq) { push_rt_tasks(rq); @@ -1768,9 +1851,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && has_pushable_tasks(rq) && - p->rt.nr_cpus_allowed > 1 && - rt_task(rq->curr) && - (rq->curr->rt.nr_cpus_allowed < 2 || + p->nr_cpus_allowed > 1 && + (dl_task(rq->curr) || rt_task(rq->curr)) && + (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); } @@ -1778,44 +1861,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) static void set_cpus_allowed_rt(struct task_struct *p, const struct cpumask *new_mask) { - int weight = cpumask_weight(new_mask); + struct rq *rq; + int weight; BUG_ON(!rt_task(p)); - /* - * Update the migration status of the RQ if we have an RT task - * which is running AND changing its weight value. - */ - if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { - struct rq *rq = task_rq(p); - - if (!task_current(rq, p)) { - /* - * Make sure we dequeue this task from the pushable list - * before going further. It will either remain off of - * the list because we are no longer pushable, or it - * will be requeued. - */ - if (p->rt.nr_cpus_allowed > 1) - dequeue_pushable_task(rq, p); + if (!p->on_rq) + return; - /* - * Requeue if our weight is changing and still > 1 - */ - if (weight > 1) - enqueue_pushable_task(rq, p); + weight = cpumask_weight(new_mask); - } + /* + * Only update if the process changes its state from whether it + * can migrate or not. + */ + if ((p->nr_cpus_allowed > 1) == (weight > 1)) + return; - if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { - rq->rt.rt_nr_migratory++; - } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { - BUG_ON(!rq->rt.rt_nr_migratory); - rq->rt.rt_nr_migratory--; - } + rq = task_rq(p); - update_rt_migration(&rq->rt); + /* + * The process used to be able to migrate OR it can now migrate + */ + if (weight <= 1) { + if (!task_current(rq, p)) + dequeue_pushable_task(rq, p); + BUG_ON(!rq->rt.rt_nr_migratory); + rq->rt.rt_nr_migratory--; + } else { + if (!task_current(rq, p)) + enqueue_pushable_task(rq, p); + rq->rt.rt_nr_migratory++; } + + update_rt_migration(&rq->rt); } /* Assumes rq->lock is held */ @@ -1853,11 +1932,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (p->on_rq && !rq->rt.rt_nr_running) - pull_rt_task(rq); + if (!p->on_rq || rq->rt.rt_nr_running) + return; + + if (pull_rt_task(rq)) + resched_task(rq->curr); } -void init_sched_rt_class(void) +void __init init_sched_rt_class(void) { unsigned int i; @@ -1886,9 +1968,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (p->on_rq && rq->curr != p) { #ifdef CONFIG_SMP - if (rq->rt.overloaded && push_rt_task(rq) && + if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && /* Don't resched if we changed runqueues */ - rq != task_rq(p)) + push_rt_task(rq) && rq != task_rq(p)) check_resched = 0; #endif /* CONFIG_SMP */ if (check_resched && p->prio < rq->curr->prio) @@ -1949,7 +2031,11 @@ static void watchdog(struct rq *rq, struct task_struct *p) if (soft != RLIM_INFINITY) { unsigned long next; - p->rt.timeout++; + if (p->rt.watchdog_stamp != jiffies) { + p->rt.timeout++; + p->rt.watchdog_stamp = jiffies; + } + next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); if (p->rt.timeout > next) p->cputime_expires.sched_exp = p->se.sum_exec_runtime; @@ -1958,6 +2044,8 @@ static void watchdog(struct rq *rq, struct task_struct *p) static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) { + struct sched_rt_entity *rt_se = &p->rt; + update_curr_rt(rq); watchdog(rq, p); @@ -1972,15 +2060,18 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) if (--p->rt.time_slice) return; - p->rt.time_slice = DEF_TIMESLICE; + p->rt.time_slice = sched_rr_timeslice; /* - * Requeue to the end of queue if we are not the only element - * on the queue: + * Requeue to the end of queue if we (and all of our ancestors) are not + * the only element on the queue */ - if (p->rt.run_list.prev != p->rt.run_list.next) { - requeue_task_rt(rq, p, 0); - set_tsk_need_resched(p); + for_each_sched_rt_entity(rt_se) { + if (rt_se->run_list.prev != rt_se->run_list.next) { + requeue_task_rt(rq, p, 0); + set_tsk_need_resched(p); + return; + } } } @@ -1988,7 +2079,7 @@ static void set_curr_task_rt(struct rq *rq) { struct task_struct *p = rq->curr; - p->se.exec_start = rq->clock_task; + p->se.exec_start = rq_clock_task(rq); /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); @@ -2000,7 +2091,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) * Time slice is 0 for SCHED_FIFO tasks */ if (task->policy == SCHED_RR) - return DEF_TIMESLICE; + return sched_rr_timeslice; else return 0; } @@ -2022,7 +2113,6 @@ const struct sched_class rt_sched_class = { .set_cpus_allowed = set_cpus_allowed_rt, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, - .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, .task_woken = task_woken_rt, .switched_from = switched_from_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98c0c2623db..31cc02ebc54 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,57 +1,90 @@ #include <linux/sched.h> +#include <linux/sched/sysctl.h> +#include <linux/sched/rt.h> +#include <linux/sched/deadline.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/stop_machine.h> +#include <linux/tick.h> +#include <linux/slab.h> #include "cpupri.h" +#include "cpudeadline.h" +#include "cpuacct.h" + +struct rq; extern __read_mostly int scheduler_running; -/* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], - * and back. - */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) +extern unsigned long calc_load_update; +extern atomic_long_t calc_load_tasks; -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. - */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) +extern long calc_load_fold_active(struct rq *this_rq); +extern void update_cpu_load_active(struct rq *this_rq); /* * Helpers for converting nanosecond timing to jiffy resolution */ #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) +/* + * Increase resolution of nice-level calculations for 64-bit architectures. + * The extra resolution improves shares distribution and load balancing of + * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * hierarchies, especially on larger systems. This is not a user-visible change + * and does not change the user-interface for setting shares/weights. + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution + * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the + * increased costs. + */ +#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ +# define SCHED_LOAD_RESOLUTION 10 +# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) +# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) +#else +# define SCHED_LOAD_RESOLUTION 0 +# define scale_load(w) (w) +# define scale_load_down(w) (w) +#endif + +#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) +#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) + #define NICE_0_LOAD SCHED_LOAD_SCALE #define NICE_0_SHIFT SCHED_LOAD_SHIFT /* + * Single value that decides SCHED_DEADLINE internal math precision. + * 10 -> just above 1us + * 9 -> just above 0.5us + */ +#define DL_SCALE (10) + +/* * These are the 'tuning knobs' of the scheduler: - * - * default timeslice is 100 msecs (used only for SCHED_RR tasks). - * Timeslices get refilled after they expire. */ -#define DEF_TIMESLICE (100 * HZ / 1000) /* * single value that denotes runtime == period, ie unlimited time. */ #define RUNTIME_INF ((u64)~0ULL) +static inline int fair_policy(int policy) +{ + return policy == SCHED_NORMAL || policy == SCHED_BATCH; +} + static inline int rt_policy(int policy) { - if (policy == SCHED_FIFO || policy == SCHED_RR) - return 1; - return 0; + return policy == SCHED_FIFO || policy == SCHED_RR; +} + +static inline int dl_policy(int policy) +{ + return policy == SCHED_DEADLINE; } static inline int task_has_rt_policy(struct task_struct *p) @@ -59,6 +92,25 @@ static inline int task_has_rt_policy(struct task_struct *p) return rt_policy(p->policy); } +static inline int task_has_dl_policy(struct task_struct *p) +{ + return dl_policy(p->policy); +} + +static inline bool dl_time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +/* + * Tells if entity @a should preempt entity @b. + */ +static inline bool +dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) +{ + return dl_time_before(a->deadline, b->deadline); +} + /* * This is the priority-queue data structure of the RT scheduling class: */ @@ -74,6 +126,47 @@ struct rt_bandwidth { u64 rt_runtime; struct hrtimer rt_period_timer; }; +/* + * To keep the bandwidth of -deadline tasks and groups under control + * we need some place where: + * - store the maximum -deadline bandwidth of the system (the group); + * - cache the fraction of that bandwidth that is currently allocated. + * + * This is all done in the data structure below. It is similar to the + * one used for RT-throttling (rt_bandwidth), with the main difference + * that, since here we are only interested in admission control, we + * do not decrease any runtime while the group "executes", neither we + * need a timer to replenish it. + * + * With respect to SMP, the bandwidth is given on a per-CPU basis, + * meaning that: + * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; + * - dl_total_bw array contains, in the i-eth element, the currently + * allocated bandwidth on the i-eth CPU. + * Moreover, groups consume bandwidth on each CPU, while tasks only + * consume bandwidth on the CPU they're running on. + * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw + * that will be shown the next time the proc or cgroup controls will + * be red. It on its turn can be changed by writing on its own + * control. + */ +struct dl_bandwidth { + raw_spinlock_t dl_runtime_lock; + u64 dl_runtime; + u64 dl_period; +}; + +static inline int dl_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +extern struct dl_bw *dl_bw_of(int i); + +struct dl_bw { + raw_spinlock_t lock; + u64 bw, total_bw; +}; extern struct mutex sched_domains_mutex; @@ -84,7 +177,7 @@ extern struct mutex sched_domains_mutex; struct cfs_rq; struct rt_rq; -static LIST_HEAD(task_groups); +extern struct list_head task_groups; struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH @@ -115,7 +208,10 @@ struct task_group { struct cfs_rq **cfs_rq; unsigned long shares; - atomic_t load_weight; +#ifdef CONFIG_SMP + atomic_long_t load_avg; + atomic_t runnable_avg; +#endif #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -154,11 +250,6 @@ struct task_group { #define MAX_SHARES (1UL << 18) #endif -/* Default task group. - * Every task in system belong to this group at bootup. - */ -extern struct task_group root_task_group; - typedef int (*tg_visitor)(struct task_group *, void *); extern int walk_tg_tree_from(struct task_group *from, @@ -187,7 +278,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); -extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); extern void free_rt_sched_group(struct task_group *tg); @@ -196,6 +287,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent); +extern struct task_group *sched_create_group(struct task_group *parent); +extern void sched_online_group(struct task_group *tg, + struct task_group *parent); +extern void sched_destroy_group(struct task_group *tg); +extern void sched_offline_group(struct task_group *tg); + +extern void sched_move_task(struct task_struct *tsk); + +#ifdef CONFIG_FAIR_GROUP_SCHED +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +#endif + #else /* CONFIG_CGROUP_SCHED */ struct cfs_bandwidth { }; @@ -205,7 +308,7 @@ struct cfs_bandwidth { }; /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; - unsigned long nr_running, h_nr_running; + unsigned int nr_running, h_nr_running; u64 exec_clock; u64 min_vruntime; @@ -216,9 +319,6 @@ struct cfs_rq { struct rb_root tasks_timeline; struct rb_node *rb_leftmost; - struct list_head tasks; - struct list_head *balance_iterator; - /* * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). @@ -229,26 +329,22 @@ struct cfs_rq { unsigned int nr_spread_over; #endif -#ifdef CONFIG_FAIR_GROUP_SCHED - struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ - - /* - * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in - * a hierarchy). Non-leaf lrqs hold other higher schedulable entities - * (like users, containers etc.) - * - * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This - * list is used during load balance. - */ - int on_list; - struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ - #ifdef CONFIG_SMP /* - * the part of load.weight contributed by tasks + * CFS Load tracking + * Under CFS, load is tracked on a per-entity basis and aggregated up. + * This allows for the description of both thread and group usage (in + * the FAIR_GROUP_SCHED case). */ - unsigned long task_weight; + unsigned long runnable_load_avg, blocked_load_avg; + atomic64_t decay_counter; + u64 last_decay; + atomic_long_t removed_load; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* Required to track per-cpu representation of a task_group */ + u32 tg_runnable_contrib; + unsigned long tg_load_contrib; /* * h_load = weight * f(tg) @@ -257,26 +353,33 @@ struct cfs_rq { * this group. */ unsigned long h_load; + u64 last_h_load_update; + struct sched_entity *h_load_next; +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ /* - * Maintaining per-cpu shares distribution for group scheduling + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) * - * load_stamp is the last time we updated the load average - * load_last is the last time we updated the load average and saw load - * load_unacc_exec_time is currently unaccounted execution time + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. */ - u64 load_avg; - u64 load_period; - u64 load_stamp, load_last, load_unacc_exec_time; + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ - unsigned long load_contribution; -#endif /* CONFIG_SMP */ #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; u64 runtime_expires; s64 runtime_remaining; - u64 throttled_timestamp; + u64 throttled_clock, throttled_clock_task; + u64 throttled_clock_task_time; int throttled, throttle_count; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ @@ -291,7 +394,7 @@ static inline int rt_bandwidth_enabled(void) /* Real-Time classes' related field in a runqueue: */ struct rt_rq { struct rt_prio_array active; - unsigned long rt_nr_running; + unsigned int rt_nr_running; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { int curr; /* highest queued rt task prio */ @@ -306,6 +409,8 @@ struct rt_rq { int overloaded; struct plist_head pushable_tasks; #endif + int rt_queued; + int rt_throttled; u64 rt_time; u64 rt_runtime; @@ -316,11 +421,45 @@ struct rt_rq { unsigned long rt_nr_boosted; struct rq *rq; - struct list_head leaf_rt_rq_list; struct task_group *tg; #endif }; +/* Deadline class' related fields in a runqueue */ +struct dl_rq { + /* runqueue is an rbtree, ordered by deadline */ + struct rb_root rb_root; + struct rb_node *rb_leftmost; + + unsigned long dl_nr_running; + +#ifdef CONFIG_SMP + /* + * Deadline values of the currently executing and the + * earliest ready task on this rq. Caching these facilitates + * the decision wether or not a ready but not running task + * should migrate somewhere else. + */ + struct { + u64 curr; + u64 next; + } earliest_dl; + + unsigned long dl_nr_migratory; + int overloaded; + + /* + * Tasks on this rq that can be pushed away. They are kept in + * an rb-tree, ordered by tasks' deadlines, with caching + * of the leftmost (earliest deadline) element. + */ + struct rb_root pushable_dl_tasks_root; + struct rb_node *pushable_dl_tasks_leftmost; +#else + struct dl_bw dl_bw; +#endif +}; + #ifdef CONFIG_SMP /* @@ -339,6 +478,15 @@ struct root_domain { cpumask_var_t online; /* + * The bit corresponding to a CPU gets set here if such CPU has more + * than one runnable -deadline task (as it is below for RT tasks). + */ + cpumask_var_t dlo_mask; + atomic_t dlo_count; + struct dl_bw dl_bw; + struct cpudl cpudl; + + /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. */ @@ -365,14 +513,21 @@ struct rq { * nr_running and cpu_load should be in the same cacheline because * remote CPUs use both these fields when doing load calculation. */ - unsigned long nr_running; + unsigned int nr_running; +#ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; + unsigned int nr_preferred_running; +#endif #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON u64 nohz_stamp; unsigned long nohz_flags; #endif +#ifdef CONFIG_NO_HZ_FULL + unsigned long last_sched_tick; +#endif int skip_clock_update; /* capture load from *all* tasks on this cpu: */ @@ -382,14 +537,14 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; + struct dl_rq dl; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; -#endif -#ifdef CONFIG_RT_GROUP_SCHED - struct list_head leaf_rt_rq_list; -#endif + + struct sched_avg avg; +#endif /* CONFIG_FAIR_GROUP_SCHED */ /* * This is part of a global counter where only the total sum @@ -412,7 +567,7 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; - unsigned long cpu_power; + unsigned long cpu_capacity; unsigned char idle_balance; /* For active balancing */ @@ -424,10 +579,15 @@ struct rq { int cpu; int online; + struct list_head cfs_tasks; + u64 rt_avg; u64 age_stamp; u64 idle_stamp; u64 avg_idle; + + /* This is used to determine avg_idle's max value */ + u64 max_idle_balance_cost; #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -462,7 +622,6 @@ struct rq { unsigned int yld_count; /* schedule() stats */ - unsigned int sched_switch; unsigned int sched_count; unsigned int sched_goidle; @@ -493,8 +652,26 @@ DECLARE_PER_CPU(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() (&__raw_get_cpu_var(runqueues)) +static inline u64 rq_clock(struct rq *rq) +{ + return rq->clock; +} + +static inline u64 rq_clock_task(struct rq *rq) +{ + return rq->clock_task; +} + +#ifdef CONFIG_NUMA_BALANCING +extern void sched_setnuma(struct task_struct *p, int node); +extern int migrate_task_to(struct task_struct *p, int cpu); +extern int migrate_swap(struct task_struct *, struct task_struct *); +#endif /* CONFIG_NUMA_BALANCING */ + #ifdef CONFIG_SMP +extern void sched_ttwu_pending(void); + #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ lockdep_is_held(&sched_domains_mutex)) @@ -534,8 +711,87 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) return hsd; } +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) { + if (sd->flags & flag) + break; + } + + return sd; +} + DECLARE_PER_CPU(struct sched_domain *, sd_llc); +DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(struct sched_domain *, sd_numa); +DECLARE_PER_CPU(struct sched_domain *, sd_busy); +DECLARE_PER_CPU(struct sched_domain *, sd_asym); + +struct sched_group_capacity { + atomic_t ref; + /* + * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity + * for a single CPU. + */ + unsigned int capacity, capacity_orig; + unsigned long next_update; + int imbalance; /* XXX unrelated to capacity but shared group state */ + /* + * Number of busy cpus in this group. + */ + atomic_t nr_busy_cpus; + + unsigned long cpumask[0]; /* iteration mask */ +}; + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + atomic_t ref; + + unsigned int group_weight; + struct sched_group_capacity *sgc; + + /* + * The CPUs this group covers. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + */ + unsigned long cpumask[0]; +}; + +static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +{ + return to_cpumask(sg->cpumask); +} + +/* + * cpumask masking which cpus in the group are allowed to iterate up the domain + * tree. + */ +static inline struct cpumask *sched_group_mask(struct sched_group *sg) +{ + return to_cpumask(sg->sgc->cpumask); +} + +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_cpus(group)); +} + +extern int group_balance_cpu(struct sched_group *sg); + +#else + +static inline void sched_ttwu_pending(void) { } #endif /* CONFIG_SMP */ @@ -547,22 +803,19 @@ DECLARE_PER_CPU(int, sd_llc_id); /* * Return the group to which this tasks belongs. * - * We use task_subsys_state_check() and extend the RCU verification with - * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each - * task it moves into the cgroup. Therefore by holding either of those locks, - * we pin the task to the current cgroup. + * We cannot use task_css() and friends because the cgroup subsystem + * changes that value before the cgroup_subsys::attach() method is called, + * therefore we cannot pin it and might observe the wrong value. + * + * The same is true for autogroup's p->signal->autogroup->tg, the autogroup + * core changes this before calling sched_move_task(). + * + * Instead we use a 'copy' which is updated from sched_move_task() while + * holding both task_struct::pi_lock and rq::lock. */ static inline struct task_group *task_group(struct task_struct *p) { - struct task_group *tg; - struct cgroup_subsys_state *css; - - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock)); - tg = container_of(css, struct task_group, css); - - return autogroup_task_group(p, tg); + return p->sched_task_group; } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ @@ -604,6 +857,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) */ smp_wmb(); task_thread_info(p)->cpu = cpu; + p->wake_cpu = cpu; #endif } @@ -611,7 +865,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ #ifdef CONFIG_SCHED_DEBUG -# include <linux/jump_label.h> +# include <linux/static_key.h> # define const_debug __read_mostly #else # define const_debug const @@ -630,18 +884,18 @@ enum { #undef SCHED_FEAT #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) -static __always_inline bool static_branch__true(struct jump_label_key *key) +static __always_inline bool static_branch__true(struct static_key *key) { - return likely(static_branch(key)); /* Not out of line branch. */ + return static_key_true(key); /* Not out of line branch. */ } -static __always_inline bool static_branch__false(struct jump_label_key *key) +static __always_inline bool static_branch__false(struct static_key *key) { - return unlikely(static_branch(key)); /* Out of line branch. */ + return static_key_false(key); /* Out of line branch. */ } #define SCHED_FEAT(name, enabled) \ -static __always_inline bool static_branch_##name(struct jump_label_key *key) \ +static __always_inline bool static_branch_##name(struct static_key *key) \ { \ return static_branch__##enabled(key); \ } @@ -650,12 +904,24 @@ static __always_inline bool static_branch_##name(struct jump_label_key *key) \ #undef SCHED_FEAT -extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; +extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ +#ifdef CONFIG_NUMA_BALANCING +#define sched_feat_numa(x) sched_feat(x) +#ifdef CONFIG_SCHED_DEBUG +#define numabalancing_enabled sched_feat_numa(NUMA) +#else +extern bool numabalancing_enabled; +#endif /* CONFIG_SCHED_DEBUG */ +#else +#define sched_feat_numa(x) (0) +#define numabalancing_enabled (0) +#endif /* CONFIG_NUMA_BALANCING */ + static inline u64 global_rt_period(void) { return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; @@ -669,8 +935,6 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } - - static inline int task_current(struct rq *rq, struct task_struct *p) { return rq->curr == p; @@ -692,6 +956,9 @@ static inline int task_running(struct rq *rq, struct task_struct *p) #ifndef finish_arch_switch # define finish_arch_switch(prev) do { } while (0) #endif +#ifndef finish_arch_post_lock_switch +# define finish_arch_post_lock_switch() do { } while (0) +#endif #ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) @@ -742,11 +1009,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) */ next->on_cpu = 1; #endif -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - raw_spin_unlock_irq(&rq->lock); -#else raw_spin_unlock(&rq->lock); -#endif } static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) @@ -760,30 +1023,16 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) smp_wmb(); prev->on_cpu = 0; #endif -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); -#endif } #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ - -static inline void update_load_add(struct load_weight *lw, unsigned long inc) -{ - lw->weight += inc; - lw->inv_weight = 0; -} - -static inline void update_load_sub(struct load_weight *lw, unsigned long dec) -{ - lw->weight -= dec; - lw->inv_weight = 0; -} - -static inline void update_load_set(struct load_weight *lw, unsigned long w) -{ - lw->weight = w; - lw->inv_weight = 0; -} +/* + * wake flags + */ +#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* child wakeup after fork */ +#define WF_MIGRATED 0x4 /* internal use, task got migrated */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -838,20 +1087,85 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -/* Time spent by the tasks of the cpu accounting group executing in ... */ -enum cpuacct_stat_index { - CPUACCT_STAT_USER, /* ... user mode */ - CPUACCT_STAT_SYSTEM, /* ... kernel mode */ +#define ENQUEUE_WAKEUP 1 +#define ENQUEUE_HEAD 2 +#ifdef CONFIG_SMP +#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ +#else +#define ENQUEUE_WAKING 0 +#endif +#define ENQUEUE_REPLENISH 8 + +#define DEQUEUE_SLEEP 1 + +#define RETRY_TASK ((void *)-1UL) + +struct sched_class { + const struct sched_class *next; + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); + + void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); + + /* + * It is the responsibility of the pick_next_task() method that will + * return the next task to call put_prev_task() on the @prev task or + * something equivalent. + * + * May return RETRY_TASK when it finds a higher prio class has runnable + * tasks. + */ + struct task_struct * (*pick_next_task) (struct rq *rq, + struct task_struct *prev); + void (*put_prev_task) (struct rq *rq, struct task_struct *p); + +#ifdef CONFIG_SMP + int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); + void (*migrate_task_rq)(struct task_struct *p, int next_cpu); + + void (*post_schedule) (struct rq *this_rq); + void (*task_waking) (struct task_struct *task); + void (*task_woken) (struct rq *this_rq, struct task_struct *task); - CPUACCT_STAT_NSTATS, + void (*set_cpus_allowed)(struct task_struct *p, + const struct cpumask *newmask); + + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); +#endif + + void (*set_curr_task) (struct rq *rq); + void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); + void (*task_fork) (struct task_struct *p); + void (*task_dead) (struct task_struct *p); + + void (*switched_from) (struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + int oldprio); + + unsigned int (*get_rr_interval) (struct rq *rq, + struct task_struct *task); + +#ifdef CONFIG_FAIR_GROUP_SCHED + void (*task_move_group) (struct task_struct *p, int on_rq); +#endif }; +static inline void put_prev_task(struct rq *rq, struct task_struct *prev) +{ + prev->sched_class->put_prev_task(rq, prev); +} #define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) extern const struct sched_class stop_sched_class; +extern const struct sched_class dl_sched_class; extern const struct sched_class rt_sched_class; extern const struct sched_class fair_sched_class; extern const struct sched_class idle_sched_class; @@ -859,24 +1173,28 @@ extern const struct sched_class idle_sched_class; #ifdef CONFIG_SMP -extern void trigger_load_balance(struct rq *rq, int cpu); -extern void idle_balance(int this_cpu, struct rq *this_rq); +extern void update_group_capacity(struct sched_domain *sd, int cpu); -#else /* CONFIG_SMP */ +extern void trigger_load_balance(struct rq *rq); -static inline void idle_balance(int cpu, struct rq *rq) -{ -} +extern void idle_enter_fair(struct rq *this_rq); +extern void idle_exit_fair(struct rq *this_rq); + +#else + +static inline void idle_enter_fair(struct rq *rq) { } +static inline void idle_exit_fair(struct rq *rq) { } #endif extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); -extern void update_group_power(struct sched_domain *sd, int cpu); -extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); + +extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); +extern void init_sched_dl_class(void); extern void resched_task(struct task_struct *p); extern void resched_cpu(int cpu); @@ -884,52 +1202,43 @@ extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); -extern void update_cpu_load(struct rq *this_rq); +extern struct dl_bandwidth def_dl_bandwidth; +extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); +extern void init_dl_task_timer(struct sched_dl_entity *dl_se); -#ifdef CONFIG_CGROUP_CPUACCT -#include <linux/cgroup.h> -/* track cpu usage of a group of tasks and its child groups */ -struct cpuacct { - struct cgroup_subsys_state css; - /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 __percpu *cpuusage; - struct kernel_cpustat __percpu *cpustat; -}; +unsigned long to_ratio(u64 period, u64 runtime); -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), - struct cpuacct, css); -} +extern void update_idle_cpu_load(struct rq *this_rq); -/* return cpu accounting group to which this task belongs */ -static inline struct cpuacct *task_ca(struct task_struct *tsk) -{ - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), - struct cpuacct, css); -} +extern void init_task_runnable_average(struct task_struct *p); -static inline struct cpuacct *parent_ca(struct cpuacct *ca) +static inline void add_nr_running(struct rq *rq, unsigned count) { - if (!ca || !ca->css.cgroup->parent) - return NULL; - return cgroup_ca(ca->css.cgroup->parent); -} + unsigned prev_nr = rq->nr_running; -extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -#else -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} + rq->nr_running = prev_nr + count; + +#ifdef CONFIG_NO_HZ_FULL + if (prev_nr < 2 && rq->nr_running >= 2) { + if (tick_nohz_full_cpu(rq->cpu)) { + /* Order rq->nr_running write against the IPI */ + smp_wmb(); + smp_send_reschedule(rq->cpu); + } + } #endif +} -static inline void inc_nr_running(struct rq *rq) +static inline void sub_nr_running(struct rq *rq, unsigned count) { - rq->nr_running++; + rq->nr_running -= count; } -static inline void dec_nr_running(struct rq *rq) +static inline void rq_last_tick_reset(struct rq *rq) { - rq->nr_running--; +#ifdef CONFIG_NO_HZ_FULL + rq->last_sched_tick = jiffies; +#endif } extern void update_rq_clock(struct rq *rq); @@ -948,8 +1257,6 @@ static inline u64 sched_avg_period(void) return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; } -void calc_load_account_idle(struct rq *this_rq); - #ifdef CONFIG_SCHED_HRTICK /* @@ -1067,6 +1374,33 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); } +static inline void double_lock(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock_irq(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + raw_spin_lock(l1); + raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + /* * double_rq_lock - safely lock two runqueues * @@ -1151,16 +1485,65 @@ extern void print_rt_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); -extern void unthrottle_offline_cfs_rqs(struct rq *rq); +extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); -extern void account_cfs_bandwidth_used(int enabled, int was_enabled); +extern void cfs_bandwidth_usage_inc(void); +extern void cfs_bandwidth_usage_dec(void); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON enum rq_nohz_flag_bits { NOHZ_TICK_STOPPED, NOHZ_BALANCE_KICK, - NOHZ_IDLE, }; #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) #endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +DECLARE_PER_CPU(u64, cpu_hardirq_time); +DECLARE_PER_CPU(u64, cpu_softirq_time); + +#ifndef CONFIG_64BIT +DECLARE_PER_CPU(seqcount_t, irq_time_seq); + +static inline void irq_time_write_begin(void) +{ + __this_cpu_inc(irq_time_seq.sequence); + smp_wmb(); +} + +static inline void irq_time_write_end(void) +{ + smp_wmb(); + __this_cpu_inc(irq_time_seq.sequence); +} + +static inline u64 irq_time_read(int cpu) +{ + u64 irq_time; + unsigned seq; + + do { + seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); + irq_time = per_cpu(cpu_softirq_time, cpu) + + per_cpu(cpu_hardirq_time, cpu); + } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + + return irq_time; +} +#else /* CONFIG_64BIT */ +static inline void irq_time_write_begin(void) +{ +} + +static inline void irq_time_write_end(void) +{ +} + +static inline u64 irq_time_read(int cpu) +{ + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} +#endif /* CONFIG_64BIT */ +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 2a581ba8e19..a476bea17fb 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -21,20 +21,23 @@ static int show_schedstat(struct seq_file *seq, void *v) if (mask_str == NULL) return -ENOMEM; - seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); - seq_printf(seq, "timestamp %lu\n", jiffies); - for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); + if (v == (void *)1) { + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + } else { + struct rq *rq; #ifdef CONFIG_SMP struct sched_domain *sd; int dcount = 0; #endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); /* runqueue-specific stats */ seq_printf(seq, - "cpu%d %u %u %u %u %u %u %llu %llu %lu", + "cpu%d %u 0 %u %u %u %u %llu %llu %lu", cpu, rq->yld_count, - rq->sched_switch, rq->sched_count, rq->sched_goidle, + rq->sched_count, rq->sched_goidle, rq->ttwu_count, rq->ttwu_local, rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); @@ -77,30 +80,61 @@ static int show_schedstat(struct seq_file *seq, void *v) return 0; } -static int schedstat_open(struct inode *inode, struct file *file) +/* + * This itererator needs some explanation. + * It returns 1 for the header position. + * This means 2 is cpu 0. + * In a hotplugged system some cpus, including cpu 0, may be missing so we have + * to use cpumask_* to iterate over the cpus. + */ +static void *schedstat_start(struct seq_file *file, loff_t *offset) { - unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); - char *buf = kmalloc(size, GFP_KERNEL); - struct seq_file *m; - int res; + unsigned long n = *offset; - if (!buf) - return -ENOMEM; - res = single_open(file, show_schedstat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = size; - } else - kfree(buf); - return res; + if (n == 0) + return (void *) 1; + + n--; + + if (n > 0) + n = cpumask_next(n - 1, cpu_online_mask); + else + n = cpumask_first(cpu_online_mask); + + *offset = n + 1; + + if (n < nr_cpu_ids) + return (void *)(unsigned long)(n + 2); + return NULL; +} + +static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) +{ + (*offset)++; + return schedstat_start(file, offset); +} + +static void schedstat_stop(struct seq_file *file, void *data) +{ +} + +static const struct seq_operations schedstat_sops = { + .start = schedstat_start, + .next = schedstat_next, + .stop = schedstat_stop, + .show = show_schedstat, +}; + +static int schedstat_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &schedstat_sops); } static const struct file_operations proc_schedstat_operations = { .open = schedstat_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = seq_release, }; static int __init proc_schedstat_init(void) @@ -108,4 +142,4 @@ static int __init proc_schedstat_init(void) proc_create("schedstat", 0, NULL, &proc_schedstat_operations); return 0; } -module_init(proc_schedstat_init); +subsys_initcall(proc_schedstat_init); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 2ef90a51ec5..4ab70433965 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) * from dequeue_task() to account for possible rq->clock skew across cpus. The * delta taken on each cpu would annul the skew. */ -static inline void sched_info_dequeued(struct task_struct *t) +static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) { - unsigned long long now = task_rq(t)->clock, delta = 0; + unsigned long long now = rq_clock(rq), delta = 0; if (unlikely(sched_info_on())) if (t->sched_info.last_queued) @@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t) sched_info_reset_dequeued(t); t->sched_info.run_delay += delta; - rq_sched_info_dequeued(task_rq(t), delta); + rq_sched_info_dequeued(rq, delta); } /* @@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t) * long it was waiting to run. We also note when it began so that we * can keep stats on how long its timeslice is. */ -static void sched_info_arrive(struct task_struct *t) +static void sched_info_arrive(struct rq *rq, struct task_struct *t) { - unsigned long long now = task_rq(t)->clock, delta = 0; + unsigned long long now = rq_clock(rq), delta = 0; if (t->sched_info.last_queued) delta = now - t->sched_info.last_queued; @@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t) t->sched_info.last_arrival = now; t->sched_info.pcount++; - rq_sched_info_arrive(task_rq(t), delta); + rq_sched_info_arrive(rq, delta); } /* @@ -96,29 +96,30 @@ static void sched_info_arrive(struct task_struct *t) * the timestamp if it is already not set. It's assumed that * sched_info_dequeued() will clear that stamp when appropriate. */ -static inline void sched_info_queued(struct task_struct *t) +static inline void sched_info_queued(struct rq *rq, struct task_struct *t) { if (unlikely(sched_info_on())) if (!t->sched_info.last_queued) - t->sched_info.last_queued = task_rq(t)->clock; + t->sched_info.last_queued = rq_clock(rq); } /* - * Called when a process ceases being the active-running process, either - * voluntarily or involuntarily. Now we can calculate how long we ran. + * Called when a process ceases being the active-running process involuntarily + * due, typically, to expiring its time slice (this may also be called when + * switching to the idle task). Now we can calculate how long we ran. * Also, if the process is still in the TASK_RUNNING state, call * sched_info_queued() to mark that it has now again started waiting on * the runqueue. */ -static inline void sched_info_depart(struct task_struct *t) +static inline void sched_info_depart(struct rq *rq, struct task_struct *t) { - unsigned long long delta = task_rq(t)->clock - + unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival; - rq_sched_info_depart(task_rq(t), delta); + rq_sched_info_depart(rq, delta); if (t->state == TASK_RUNNING) - sched_info_queued(t); + sched_info_queued(rq, t); } /* @@ -127,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t) * the idle task.) We are only called when prev != next. */ static inline void -__sched_info_switch(struct task_struct *prev, struct task_struct *next) +__sched_info_switch(struct rq *rq, + struct task_struct *prev, struct task_struct *next) { - struct rq *rq = task_rq(prev); - /* * prev now departs the cpu. It's not interesting to record * stats about how efficient we were at scheduling the idle * process, however. */ if (prev != rq->idle) - sched_info_depart(prev); + sched_info_depart(rq, prev); if (next != rq->idle) - sched_info_arrive(next); + sched_info_arrive(rq, next); } static inline void -sched_info_switch(struct task_struct *prev, struct task_struct *next) +sched_info_switch(struct rq *rq, + struct task_struct *prev, struct task_struct *next) { if (unlikely(sched_info_on())) - __sched_info_switch(prev, next); + __sched_info_switch(rq, prev, next); } #else -#define sched_info_queued(t) do { } while (0) +#define sched_info_queued(rq, t) do { } while (0) #define sched_info_reset_dequeued(t) do { } while (0) -#define sched_info_dequeued(t) do { } while (0) -#define sched_info_switch(t, next) do { } while (0) +#define sched_info_dequeued(rq, t) do { } while (0) +#define sched_info_depart(rq, t) do { } while (0) +#define sched_info_arrive(rq, next) do { } while (0) +#define sched_info_switch(rq, t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ /* @@ -162,6 +165,39 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) */ /** + * cputimer_running - return true if cputimer is running + * + * @tsk: Pointer to target task. + */ +static inline bool cputimer_running(struct task_struct *tsk) + +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + + if (!cputimer->running) + return false; + + /* + * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime + * in __exit_signal(), we won't account to the signal struct further + * cputime consumed by that task, even though the task can still be + * ticking after __exit_signal(). + * + * In order to keep a consistent behaviour between thread group cputime + * and thread group cputimer accounting, lets also ignore the cputime + * elapsing after __exit_signal() in any thread group timer running. + * + * This makes sure that POSIX CPU clocks and timers are synchronized, so + * that a POSIX CPU timer won't expire while the corresponding POSIX CPU + * clock delta is behind the expiring timer value. + */ + if (unlikely(!tsk->sighand)) + return false; + + return true; +} + +/** * account_group_user_time - Maintain utime for a thread group. * * @tsk: Pointer to task structure. @@ -176,7 +212,7 @@ static inline void account_group_user_time(struct task_struct *tsk, { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + if (!cputimer_running(tsk)) return; raw_spin_lock(&cputimer->lock); @@ -199,7 +235,7 @@ static inline void account_group_system_time(struct task_struct *tsk, { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + if (!cputimer_running(tsk)) return; raw_spin_lock(&cputimer->lock); @@ -222,7 +258,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + if (!cputimer_running(tsk)) return; raw_spin_lock(&cputimer->lock); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 7b386e86fd2..bfe0edadbfb 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -11,7 +11,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* stop tasks as never migrate */ } @@ -23,26 +23,31 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) /* we're never preempted */ } -static struct task_struct *pick_next_task_stop(struct rq *rq) +static struct task_struct * +pick_next_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *stop = rq->stop; - if (stop && stop->on_rq) - return stop; + if (!stop || !stop->on_rq) + return NULL; - return NULL; + put_prev_task(rq, prev); + + stop->se.exec_start = rq_clock_task(rq); + + return stop; } static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { - inc_nr_running(rq); + add_nr_running(rq, 1); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { - dec_nr_running(rq); + sub_nr_running(rq, 1); } static void yield_task_stop(struct rq *rq) @@ -52,6 +57,21 @@ static void yield_task_stop(struct rq *rq) static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) { + struct task_struct *curr = rq->curr; + u64 delta_exec; + + delta_exec = rq_clock_task(rq) - curr->se.exec_start; + if (unlikely((s64)delta_exec < 0)) + delta_exec = 0; + + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); + + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = rq_clock_task(rq); + cpuacct_charge(curr, delta_exec); } static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) @@ -60,6 +80,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) static void set_curr_task_stop(struct rq *rq) { + struct task_struct *stop = rq->stop; + + stop->se.exec_start = rq_clock_task(rq); } static void switched_to_stop(struct rq *rq, struct task_struct *p) @@ -83,7 +106,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) * Simple, special scheduling class for the per-CPU stop tasks: */ const struct sched_class stop_sched_class = { - .next = &rt_sched_class, + .next = &dl_sched_class, .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c new file mode 100644 index 00000000000..0ffa20ae657 --- /dev/null +++ b/kernel/sched/wait.c @@ -0,0 +1,504 @@ +/* + * Generic waiting primitives. + * + * (C) 2004 Nadia Yvette Chambers, Oracle + */ +#include <linux/init.h> +#include <linux/export.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/wait.h> +#include <linux/hash.h> + +void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) +{ + spin_lock_init(&q->lock); + lockdep_set_class_and_name(&q->lock, key, name); + INIT_LIST_HEAD(&q->task_list); +} + +EXPORT_SYMBOL(__init_waitqueue_head); + +void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue); + +void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue_tail(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue_exclusive); + +void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __remove_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(remove_wait_queue); + + +/* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. + */ +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int wake_flags, void *key) +{ + wait_queue_t *curr, *next; + + list_for_each_entry_safe(curr, next, &q->task_list, task_list) { + unsigned flags = curr->flags; + + if (curr->func(curr, mode, wake_flags, key) && + (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + break; + } +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void __wake_up(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(__wake_up); + +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. + */ +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) +{ + __wake_up_common(q, mode, nr, 0, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_locked); + +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +{ + __wake_up_common(q, mode, 1, 0, key); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_key); + +/** + * __wake_up_sync_key - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: opaque value to be passed to wakeup targets + * + * The sync wakeup differs that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + int wake_flags = 1; /* XXX WF_SYNC */ + + if (unlikely(!q)) + return; + + if (unlikely(nr_exclusive != 1)) + wake_flags = 0; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, wake_flags, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(__wake_up_sync_key); + +/* + * __wake_up_sync - see __wake_up_sync_key() + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + __wake_up_sync_key(q, mode, nr_exclusive, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ + +/* + * Note: we use "set_current_state()" _after_ the wait-queue add, + * because we need a memory barrier there on SMP, so that any + * wake-function that tests for the wait-queue being active + * will be guaranteed to see waitqueue addition _or_ subsequent + * tests in this thread will see the wakeup having taken place. + * + * The spin_unlock() itself is semi-permeable and only protects + * one way (it only protects stuff inside the critical region and + * stops them from bleeding out - it would still allow subsequent + * loads to move into the critical region). + */ +void +prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue(q, wait); + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait); + +void +prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue_tail(q, wait); + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait_exclusive); + +long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + if (signal_pending_state(state, current)) + return -ERESTARTSYS; + + wait->private = current; + wait->func = autoremove_wake_function; + + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) { + if (wait->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_tail(q, wait); + else + __add_wait_queue(q, wait); + } + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); + + return 0; +} +EXPORT_SYMBOL(prepare_to_wait_event); + +/** + * finish_wait - clean up after waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + */ +void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + /* + * We can check for list emptiness outside the lock + * IFF: + * - we use the "careful" check that verifies both + * the next and prev pointers, so that there cannot + * be any half-pending updates in progress on other + * CPU's that we haven't seen yet (and that might + * still change the stack area. + * and + * - all other users take the lock (ie we can only + * have _one_ other CPU that looks at or modifies + * the list). + */ + if (!list_empty_careful(&wait->task_list)) { + spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + spin_unlock_irqrestore(&q->lock, flags); + } +} +EXPORT_SYMBOL(finish_wait); + +/** + * abort_exclusive_wait - abort exclusive waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * @mode: runstate of the waiter to be woken + * @key: key to identify a wait bit queue or %NULL + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + * + * Wakes up the next waiter if the caller is concurrently + * woken up through the queue. + * + * This prevents waiter starvation where an exclusive waiter + * aborts and is woken up concurrently and no one wakes up + * the next waiter. + */ +void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, + unsigned int mode, void *key) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + spin_lock_irqsave(&q->lock, flags); + if (!list_empty(&wait->task_list)) + list_del_init(&wait->task_list); + else if (waitqueue_active(q)) + __wake_up_locked_key(q, mode, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(abort_exclusive_wait); + +int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + int ret = default_wake_function(wait, mode, sync, key); + + if (ret) + list_del_init(&wait->task_list); + return ret; +} +EXPORT_SYMBOL(autoremove_wake_function); + +int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + test_bit(key->bit_nr, key->flags)) + return 0; + else + return autoremove_wake_function(wait, mode, sync, key); +} +EXPORT_SYMBOL(wake_bit_function); + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) + * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are + * permitted return codes. Nonzero return codes halt waiting and return. + */ +int __sched +__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(void *), unsigned mode) +{ + int ret = 0; + + do { + prepare_to_wait(wq, &q->wait, mode); + if (test_bit(q->key.bit_nr, q->key.flags)) + ret = (*action)(q->key.flags); + } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); + finish_wait(wq, &q->wait); + return ret; +} +EXPORT_SYMBOL(__wait_on_bit); + +int __sched out_of_line_wait_on_bit(void *word, int bit, + int (*action)(void *), unsigned mode) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + return __wait_on_bit(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit); + +int __sched +__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(void *), unsigned mode) +{ + do { + int ret; + + prepare_to_wait_exclusive(wq, &q->wait, mode); + if (!test_bit(q->key.bit_nr, q->key.flags)) + continue; + ret = action(q->key.flags); + if (!ret) + continue; + abort_exclusive_wait(wq, &q->wait, mode, &q->key); + return ret; + } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); + finish_wait(wq, &q->wait); + return 0; +} +EXPORT_SYMBOL(__wait_on_bit_lock); + +int __sched out_of_line_wait_on_bit_lock(void *word, int bit, + int (*action)(void *), unsigned mode) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + return __wait_on_bit_lock(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); + +void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) +{ + struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); + if (waitqueue_active(wq)) + __wake_up(wq, TASK_NORMAL, 1, &key); +} +EXPORT_SYMBOL(__wake_up_bit); + +/** + * wake_up_bit - wake up a waiter on a bit + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * + * There is a standard hashed waitqueue table for generic use. This + * is the part of the hashtable's accessor API that wakes up waiters + * on a bit. For instance, if one were to have waiters on a bitflag, + * one would call wake_up_bit() after clearing the bit. + * + * In order for this to function properly, as it uses waitqueue_active() + * internally, some kind of memory barrier must be done prior to calling + * this. Typically, this will be smp_mb__after_atomic(), but in some + * cases where bitflags are manipulated non-atomically under a lock, one + * may need to use a less regular barrier, such fs/inode.c's smp_mb(), + * because spin_unlock() does not guarantee a memory barrier. + */ +void wake_up_bit(void *word, int bit) +{ + __wake_up_bit(bit_waitqueue(word, bit), word, bit); +} +EXPORT_SYMBOL(wake_up_bit); + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + const struct zone *zone = page_zone(virt_to_page(word)); + unsigned long val = (unsigned long)word << shift | bit; + + return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; +} +EXPORT_SYMBOL(bit_waitqueue); + +/* + * Manipulate the atomic_t address to produce a better bit waitqueue table hash + * index (we're keying off bit -1, but that would produce a horrible hash + * value). + */ +static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) +{ + if (BITS_PER_LONG == 64) { + unsigned long q = (unsigned long)p; + return bit_waitqueue((void *)(q & ~1), q & 1); + } + return bit_waitqueue(p, 0); +} + +static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, + void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); + atomic_t *val = key->flags; + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + atomic_read(val) != 0) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, + * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero + * return codes halt waiting and return. + */ +static __sched +int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(atomic_t *), unsigned mode) +{ + atomic_t *val; + int ret = 0; + + do { + prepare_to_wait(wq, &q->wait, mode); + val = q->key.flags; + if (atomic_read(val) == 0) + break; + ret = (*action)(val); + } while (!ret && atomic_read(val) != 0); + finish_wait(wq, &q->wait); + return ret; +} + +#define DEFINE_WAIT_ATOMIC_T(name, p) \ + struct wait_bit_queue name = { \ + .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ + .wait = { \ + .private = current, \ + .func = wake_atomic_t_function, \ + .task_list = \ + LIST_HEAD_INIT((name).wait.task_list), \ + }, \ + } + +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), + unsigned mode) +{ + wait_queue_head_t *wq = atomic_t_waitqueue(p); + DEFINE_WAIT_ATOMIC_T(wait, p); + + return __wait_on_atomic_t(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); + +/** + * wake_up_atomic_t - Wake up a waiter on a atomic_t + * @p: The atomic_t being waited on, a kernel virtual address + * + * Wake up anyone waiting for the atomic_t to go to zero. + * + * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t + * check is done by the waiter's wake function, not the by the waker itself). + */ +void wake_up_atomic_t(atomic_t *p) +{ + __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); +} +EXPORT_SYMBOL(wake_up_atomic_t); |
