Diffstat (limited to 'kernel/sched')

 kernel/sched/Makefile       |    6
 kernel/sched/auto_group.c   |    2
 kernel/sched/clock.c        |  110
 kernel/sched/completion.c   |  299
 kernel/sched/core.c         | 2376
 kernel/sched/cpuacct.c      |   26
 kernel/sched/cpudeadline.c  |  229
 kernel/sched/cpudeadline.h  |   33
 kernel/sched/cpupri.c       |   16
 kernel/sched/cpupri.h       |    2
 kernel/sched/cputime.c      |   52
 kernel/sched/deadline.c     | 1676
 kernel/sched/debug.c        |   76
 kernel/sched/fair.c         | 2602
 kernel/sched/features.h     |   27
 kernel/sched/idle.c         |  273
 kernel/sched/idle_task.c    |   27
 kernel/sched/rt.c           |  251
 kernel/sched/sched.h        |  307
 kernel/sched/stats.c        |    2
 kernel/sched/stats.h        |   46
 kernel/sched/stop_task.c    |   23
 kernel/sched/wait.c         |  504

 23 files changed, 7209 insertions, 1756 deletions
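The headline addition in this range is the SCHED_DEADLINE scheduling class (deadline.c and cpudeadline.c, plus the hooks threaded through core.c below). Two short userspace-style sketches follow before the raw patch; neither is part of the diff itself.

First, a minimal sketch of how a task could request a deadline reservation through the sched_setattr() interface this series builds on. The struct layout matches the kernel's sched_attr, but the syscall number is the x86-64 value and is an assumption here; check your own headers, and note the call normally requires CAP_SYS_NICE or root.

/*
 * Illustrative only: ask for 10ms of runtime every 100ms period.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE		6
#endif
#ifndef __NR_sched_setattr
#define __NR_sched_setattr	314	/* x86-64; assumption, arch-specific */
#endif

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL, SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO, SCHED_RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE, nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = SCHED_DEADLINE;
	attr.sched_runtime  =  10 * 1000 * 1000;	/*  10 ms */
	attr.sched_deadline = 100 * 1000 * 1000;	/* 100 ms */
	attr.sched_period   = 100 * 1000 * 1000;

	if (syscall(__NR_sched_setattr, 0, &attr, 0)) {	/* pid 0 == self */
		perror("sched_setattr");
		return 1;
	}
	/* ... do one job per period, then yield until the next one ... */
	return 0;
}

Second, a standalone sketch of the fixed-point admission arithmetic that to_ratio() and dl_overflow() perform in the core.c hunk further down: utilization is runtime/period scaled into 20-bit fixed point, and a new reservation is refused when the summed utilization would exceed the configured bandwidth. This mirrors the math only; it is not kernel code.

#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT	20	/* same scale as the "<< 20" in to_ratio() */

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	if (period == 0)
		return 0;
	return (runtime << BW_SHIFT) / period;
}

int main(void)
{
	/* 10ms every 100ms -> 10% of one CPU, i.e. ~104857 in Q20 */
	uint64_t bw = to_ratio(100 * 1000 * 1000ULL, 10 * 1000 * 1000ULL);

	printf("bw = %llu (%.2f%% of one CPU)\n",
	       (unsigned long long)bw, 100.0 * bw / (1 << BW_SHIFT));
	return 0;
}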
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 54adcf35f49..ab32b7b0db5 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,8 +11,10 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)  CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer  endif -obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o -obj-$(CONFIG_SMP) += cpupri.o +obj-y += core.o proc.o clock.o cputime.o +obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o +obj-y += wait.o completion.o idle.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o  obj-$(CONFIG_SCHEDSTATS) += stats.o  obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 4a073539c58..e73efba9830 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -203,7 +203,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)  	struct autogroup *ag;  	int err; -	if (nice < -20 || nice > 19) +	if (nice < MIN_NICE || nice > MAX_NICE)  		return -EINVAL;  	err = security_task_setnice(current, nice); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c3ae1446461..3ef6451e972 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -26,9 +26,10 @@   * at 0 on boot (but people really shouldn't rely on that).   *   * cpu_clock(i)       -- can be used from any context, including NMI. - * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)   * local_clock()      -- is cpu_clock() on the current cpu.   * + * sched_clock_cpu(i) + *   * How:   *   * The implementation either uses sched_clock() when @@ -50,15 +51,6 @@   * Furthermore, explicit sleep and wakeup hooks allow us to account for time   * that is otherwise invisible (TSC gets stopped).   * - * - * Notes: - * - * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things - * like cpufreq interrupts that can change the base clock (TSC) multiplier - * and cause funny jumps in time -- although the filtering provided by - * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it - * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on - * sched_clock().   */  #include <linux/spinlock.h>  #include <linux/hardirq.h> @@ -66,13 +58,16 @@  #include <linux/percpu.h>  #include <linux/ktime.h>  #include <linux/sched.h> +#include <linux/static_key.h> +#include <linux/workqueue.h> +#include <linux/compiler.h>  /*   * Scheduler clock - returns current time in nanosec units.   * This is default implementation.   * Architectures and sub-architectures can override this.   
*/ -unsigned long long __attribute__((weak)) sched_clock(void) +unsigned long long __weak sched_clock(void)  {  	return (unsigned long long)(jiffies - INITIAL_JIFFIES)  					* (NSEC_PER_SEC / HZ); @@ -82,7 +77,52 @@ EXPORT_SYMBOL_GPL(sched_clock);  __read_mostly int sched_clock_running;  #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -__read_mostly int sched_clock_stable; +static struct static_key __sched_clock_stable = STATIC_KEY_INIT; +static int __sched_clock_stable_early; + +int sched_clock_stable(void) +{ +	return static_key_false(&__sched_clock_stable); +} + +static void __set_sched_clock_stable(void) +{ +	if (!sched_clock_stable()) +		static_key_slow_inc(&__sched_clock_stable); +} + +void set_sched_clock_stable(void) +{ +	__sched_clock_stable_early = 1; + +	smp_mb(); /* matches sched_clock_init() */ + +	if (!sched_clock_running) +		return; + +	__set_sched_clock_stable(); +} + +static void __clear_sched_clock_stable(struct work_struct *work) +{ +	/* XXX worry about clock continuity */ +	if (sched_clock_stable()) +		static_key_slow_dec(&__sched_clock_stable); +} + +static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); + +void clear_sched_clock_stable(void) +{ +	__sched_clock_stable_early = 0; + +	smp_mb(); /* matches sched_clock_init() */ + +	if (!sched_clock_running) +		return; + +	schedule_work(&sched_clock_work); +}  struct sched_clock_data {  	u64			tick_raw; @@ -116,6 +156,20 @@ void sched_clock_init(void)  	}  	sched_clock_running = 1; + +	/* +	 * Ensure that it is impossible to not do a static_key update. +	 * +	 * Either {set,clear}_sched_clock_stable() must see sched_clock_running +	 * and do the update, or we must see their __sched_clock_stable_early +	 * and do the update, or both. +	 */ +	smp_mb(); /* matches {set,clear}_sched_clock_stable() */ + +	if (__sched_clock_stable_early) +		__set_sched_clock_stable(); +	else +		__clear_sched_clock_stable(NULL);  }  /* @@ -242,20 +296,20 @@ u64 sched_clock_cpu(int cpu)  	struct sched_clock_data *scd;  	u64 clock; -	WARN_ON_ONCE(!irqs_disabled()); - -	if (sched_clock_stable) +	if (sched_clock_stable())  		return sched_clock();  	if (unlikely(!sched_clock_running))  		return 0ull; +	preempt_disable_notrace();  	scd = cpu_sdc(cpu);  	if (cpu != smp_processor_id())  		clock = sched_clock_remote(scd);  	else  		clock = sched_clock_local(scd); +	preempt_enable_notrace();  	return clock;  } @@ -265,7 +319,7 @@ void sched_clock_tick(void)  	struct sched_clock_data *scd;  	u64 now, now_gtod; -	if (sched_clock_stable) +	if (sched_clock_stable())  		return;  	if (unlikely(!sched_clock_running)) @@ -316,14 +370,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);   */  u64 cpu_clock(int cpu)  { -	u64 clock; -	unsigned long flags; - -	local_irq_save(flags); -	clock = sched_clock_cpu(cpu); -	local_irq_restore(flags); +	if (!sched_clock_stable()) +		return sched_clock_cpu(cpu); -	return clock; +	return sched_clock();  }  /* @@ -335,14 +385,10 @@ u64 cpu_clock(int cpu)   */  u64 local_clock(void)  { -	u64 clock; -	unsigned long flags; +	if (!sched_clock_stable()) +		return sched_clock_cpu(raw_smp_processor_id()); -	local_irq_save(flags); -	clock = sched_clock_cpu(smp_processor_id()); -	local_irq_restore(flags); - -	return clock; +	return sched_clock();  }  #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ @@ -362,12 +408,12 @@ u64 sched_clock_cpu(int cpu)  u64 cpu_clock(int cpu)  { -	return sched_clock_cpu(cpu); +	return sched_clock();  }  u64 local_clock(void)  { -	return sched_clock_cpu(0); +	return sched_clock();  }  #endif /* 
CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c new file mode 100644 index 00000000000..a63f4dc2790 --- /dev/null +++ b/kernel/sched/completion.c @@ -0,0 +1,299 @@ +/* + * Generic wait-for-completion handler; + * + * It differs from semaphores in that their default case is the opposite, + * wait_for_completion default blocks whereas semaphore default non-block. The + * interface also makes it easy to 'complete' multiple waiting threads, + * something which isn't entirely natural for semaphores. + * + * But more importantly, the primitive documents the usage. Semaphores would + * typically be used for exclusion which gives rise to priority inversion. + * Waiting for completion is a typically sync point, but not an exclusion point. + */ + +#include <linux/sched.h> +#include <linux/completion.h> + +/** + * complete: - signals a single thread waiting on this completion + * @x:  holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void complete(struct completion *x) +{ +	unsigned long flags; + +	spin_lock_irqsave(&x->wait.lock, flags); +	x->done++; +	__wake_up_locked(&x->wait, TASK_NORMAL, 1); +	spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete); + +/** + * complete_all: - signals all threads waiting on this completion + * @x:  holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. 
+ */ +void complete_all(struct completion *x) +{ +	unsigned long flags; + +	spin_lock_irqsave(&x->wait.lock, flags); +	x->done += UINT_MAX/2; +	__wake_up_locked(&x->wait, TASK_NORMAL, 0); +	spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_all); + +static inline long __sched +do_wait_for_common(struct completion *x, +		   long (*action)(long), long timeout, int state) +{ +	if (!x->done) { +		DECLARE_WAITQUEUE(wait, current); + +		__add_wait_queue_tail_exclusive(&x->wait, &wait); +		do { +			if (signal_pending_state(state, current)) { +				timeout = -ERESTARTSYS; +				break; +			} +			__set_current_state(state); +			spin_unlock_irq(&x->wait.lock); +			timeout = action(timeout); +			spin_lock_irq(&x->wait.lock); +		} while (!x->done && timeout); +		__remove_wait_queue(&x->wait, &wait); +		if (!x->done) +			return timeout; +	} +	x->done--; +	return timeout ?: 1; +} + +static inline long __sched +__wait_for_common(struct completion *x, +		  long (*action)(long), long timeout, int state) +{ +	might_sleep(); + +	spin_lock_irq(&x->wait.lock); +	timeout = do_wait_for_common(x, action, timeout, state); +	spin_unlock_irq(&x->wait.lock); +	return timeout; +} + +static long __sched +wait_for_common(struct completion *x, long timeout, int state) +{ +	return __wait_for_common(x, schedule_timeout, timeout, state); +} + +static long __sched +wait_for_common_io(struct completion *x, long timeout, int state) +{ +	return __wait_for_common(x, io_schedule_timeout, timeout, state); +} + +/** + * wait_for_completion: - waits for completion of a task + * @x:  holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ +void __sched wait_for_completion(struct completion *x) +{ +	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion); + +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x:  holds the state of this particular completion + * @timeout:  timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + * + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_timeout(struct completion *x, unsigned long timeout) +{ +	return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_timeout); + +/** + * wait_for_completion_io: - waits for completion of a task + * @x:  holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. The caller is accounted as waiting + * for IO. + */ +void __sched wait_for_completion_io(struct completion *x) +{ +	wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io); + +/** + * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) + * @x:  holds the state of this particular completion + * @timeout:  timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. 
It is not + * interruptible. The caller is accounted as waiting for IO. + * + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) +{ +	return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io_timeout); + +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x:  holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + * + * Return: -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_interruptible(struct completion *x) +{ +	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); +	if (t == -ERESTARTSYS) +		return t; +	return 0; +} +EXPORT_SYMBOL(wait_for_completion_interruptible); + +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x:  holds the state of this particular completion + * @timeout:  timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + * + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_interruptible_timeout(struct completion *x, +					  unsigned long timeout) +{ +	return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); + +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x:  holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. + * + * Return: -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_killable(struct completion *x) +{ +	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); +	if (t == -ERESTARTSYS) +		return t; +	return 0; +} +EXPORT_SYMBOL(wait_for_completion_killable); + +/** + * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) + * @x:  holds the state of this particular completion + * @timeout:  timeout value in jiffies + * + * This waits for either a completion of a specific task to be + * signaled or for a specified timeout to expire. It can be + * interrupted by a kill signal. The timeout is in jiffies. + * + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_killable_timeout(struct completion *x, +				     unsigned long timeout) +{ +	return wait_for_common(x, timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL(wait_for_completion_killable_timeout); + +/** + *	try_wait_for_completion - try to decrement a completion without blocking + *	@x:	completion structure + * + *	Return: 0 if a decrement cannot be done without blocking + *		 1 if a decrement succeeded. + * + *	If a completion is being used as a counting completion, + *	attempt to decrement the counter without blocking. This + *	enables us to avoid waiting if the resource the completion + *	is protecting is not available. 
+ */ +bool try_wait_for_completion(struct completion *x) +{ +	unsigned long flags; +	int ret = 1; + +	spin_lock_irqsave(&x->wait.lock, flags); +	if (!x->done) +		ret = 0; +	else +		x->done--; +	spin_unlock_irqrestore(&x->wait.lock, flags); +	return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + *	completion_done - Test to see if a completion has any waiters + *	@x:	completion structure + * + *	Return: 0 if there are waiters (wait_for_completion() in progress) + *		 1 if there are no waiters. + * + */ +bool completion_done(struct completion *x) +{ +	unsigned long flags; +	int ret = 1; + +	spin_lock_irqsave(&x->wait.lock, flags); +	if (!x->done) +		ret = 0; +	spin_unlock_irqrestore(&x->wait.lock, flags); +	return ret; +} +EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5ac63c9a995..bc1638b3344 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -73,6 +73,7 @@  #include <linux/init_task.h>  #include <linux/binfmts.h>  #include <linux/context_tracking.h> +#include <linux/compiler.h>  #include <asm/switch_to.h>  #include <asm/tlb.h> @@ -89,6 +90,22 @@  #define CREATE_TRACE_POINTS  #include <trace/events/sched.h> +#ifdef smp_mb__before_atomic +void __smp_mb__before_atomic(void) +{ +	smp_mb__before_atomic(); +} +EXPORT_SYMBOL(__smp_mb__before_atomic); +#endif + +#ifdef smp_mb__after_atomic +void __smp_mb__after_atomic(void) +{ +	smp_mb__after_atomic(); +} +EXPORT_SYMBOL(__smp_mb__after_atomic); +#endif +  void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)  {  	unsigned long delta; @@ -296,8 +313,6 @@ __read_mostly int scheduler_running;   */  int sysctl_sched_rt_runtime = 950000; - -  /*   * __task_rq_lock - lock the rq @p resides on.   */ @@ -434,7 +449,7 @@ void hrtick_start(struct rq *rq, u64 delay)  	if (rq == this_rq()) {  		__hrtick_restart(rq);  	} else if (!rq->hrtick_csd_pending) { -		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); +		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);  		rq->hrtick_csd_pending = 1;  	}  } @@ -507,32 +522,98 @@ static inline void init_hrtick(void)  #endif	/* CONFIG_SCHED_HRTICK */  /* + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#define fetch_or(ptr, val)						\ +({	typeof(*(ptr)) __old, __val = *(ptr);				\ + 	for (;;) {							\ + 		__old = cmpxchg((ptr), __val, __val | (val));		\ + 		if (__old == __val)					\ + 			break;						\ + 		__val = __old;						\ + 	}								\ + 	__old;								\ +}) + +#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) +/* + * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, + * this avoids any races wrt polling state changes and thereby avoids + * spurious IPIs. + */ +static bool set_nr_and_not_polling(struct task_struct *p) +{ +	struct thread_info *ti = task_thread_info(p); +	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); +} + +/* + * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. + * + * If this returns true, then the idle task promises to call + * sched_ttwu_pending() and reschedule soon. 
+ */ +static bool set_nr_if_polling(struct task_struct *p) +{ +	struct thread_info *ti = task_thread_info(p); +	typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); + +	for (;;) { +		if (!(val & _TIF_POLLING_NRFLAG)) +			return false; +		if (val & _TIF_NEED_RESCHED) +			return true; +		old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); +		if (old == val) +			break; +		val = old; +	} +	return true; +} + +#else +static bool set_nr_and_not_polling(struct task_struct *p) +{ +	set_tsk_need_resched(p); +	return true; +} + +#ifdef CONFIG_SMP +static bool set_nr_if_polling(struct task_struct *p) +{ +	return false; +} +#endif +#endif + +/*   * resched_task - mark a task 'to be rescheduled now'.   *   * On UP this means the setting of the need_resched flag, on SMP it   * might also involve a cross-CPU call to trigger the scheduler on   * the target CPU.   */ -#ifdef CONFIG_SMP  void resched_task(struct task_struct *p)  {  	int cpu; -	assert_raw_spin_locked(&task_rq(p)->lock); +	lockdep_assert_held(&task_rq(p)->lock);  	if (test_tsk_need_resched(p))  		return; -	set_tsk_need_resched(p); -  	cpu = task_cpu(p); -	if (cpu == smp_processor_id()) + +	if (cpu == smp_processor_id()) { +		set_tsk_need_resched(p); +		set_preempt_need_resched();  		return; +	} -	/* NEED_RESCHED must be visible before we test polling */ -	smp_mb(); -	if (!tsk_is_polling(p)) +	if (set_nr_and_not_polling(p))  		smp_send_reschedule(cpu); +	else +		trace_sched_wake_idle_without_ipi(cpu);  }  void resched_cpu(int cpu) @@ -546,6 +627,7 @@ void resched_cpu(int cpu)  	raw_spin_unlock_irqrestore(&rq->lock, flags);  } +#ifdef CONFIG_SMP  #ifdef CONFIG_NO_HZ_COMMON  /*   * In the semi idle case, use the nearest busy cpu for migrating timers @@ -555,12 +637,15 @@ void resched_cpu(int cpu)   * selecting an idle cpu will add more delays to the timers than intended   * (as that cpu's timer base may not be uptodate wrt jiffies etc).   */ -int get_nohz_timer_target(void) +int get_nohz_timer_target(int pinned)  {  	int cpu = smp_processor_id();  	int i;  	struct sched_domain *sd; +	if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) +		return cpu; +  	rcu_read_lock();  	for_each_domain(cpu, sd) {  		for_each_cpu(i, sched_domain_span(sd)) { @@ -591,27 +676,10 @@ static void wake_up_idle_cpu(int cpu)  	if (cpu == smp_processor_id())  		return; -	/* -	 * This is safe, as this function is called with the timer -	 * wheel base lock of (cpu) held. When the CPU is on the way -	 * to idle and has not yet set rq->curr to idle then it will -	 * be serialized on the timer wheel base lock and take the new -	 * timer into account automatically. -	 */ -	if (rq->curr != rq->idle) -		return; - -	/* -	 * We can set TIF_RESCHED on the idle task of the other CPU -	 * lockless. 
The worst case is that the other CPU runs the -	 * idle task through an additional NOOP schedule() -	 */ -	set_tsk_need_resched(rq->idle); - -	/* NEED_RESCHED must be visible before we test polling */ -	smp_mb(); -	if (!tsk_is_polling(rq->idle)) +	if (set_nr_and_not_polling(rq->idle))  		smp_send_reschedule(cpu); +	else +		trace_sched_wake_idle_without_ipi(cpu);  }  static bool wake_up_full_nohz_cpu(int cpu) @@ -693,12 +761,6 @@ void sched_avg_update(struct rq *rq)  	}  } -#else /* !CONFIG_SMP */ -void resched_task(struct task_struct *p) -{ -	assert_raw_spin_locked(&task_rq(p)->lock); -	set_tsk_need_resched(p); -}  #endif /* CONFIG_SMP */  #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ @@ -767,14 +829,14 @@ static void set_load_weight(struct task_struct *p)  static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)  {  	update_rq_clock(rq); -	sched_info_queued(p); +	sched_info_queued(rq, p);  	p->sched_class->enqueue_task(rq, p, flags);  }  static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)  {  	update_rq_clock(rq); -	sched_info_dequeued(p); +	sched_info_dequeued(rq, p);  	p->sched_class->dequeue_task(rq, p, flags);  } @@ -829,19 +891,13 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)  #endif  #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING  	if (static_key_false((¶virt_steal_rq_enabled))) { -		u64 st; -  		steal = paravirt_steal_clock(cpu_of(rq));  		steal -= rq->prev_steal_time_rq;  		if (unlikely(steal > delta))  			steal = delta; -		st = steal_ticks(steal); -		steal = st * TICK_NSEC; -  		rq->prev_steal_time_rq += steal; -  		delta -= steal;  	}  #endif @@ -849,7 +905,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)  	rq->clock_task += delta;  #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) -	if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) +	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))  		sched_rt_avg_update(rq, irq_delta + steal);  #endif  } @@ -903,7 +959,9 @@ static inline int normal_prio(struct task_struct *p)  {  	int prio; -	if (task_has_rt_policy(p)) +	if (task_has_dl_policy(p)) +		prio = MAX_DL_PRIO-1; +	else if (task_has_rt_policy(p))  		prio = MAX_RT_PRIO-1 - p->rt_priority;  	else  		prio = __normal_prio(p); @@ -949,7 +1007,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,  		if (prev_class->switched_from)  			prev_class->switched_from(rq, p);  		p->sched_class->switched_to(rq, p); -	} else if (oldprio != p->prio) +	} else if (oldprio != p->prio || dl_task(p))  		p->sched_class->prio_changed(rq, p, oldprio);  } @@ -987,7 +1045,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)  	 * ttwu() will sort out the placement.  	 */  	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && -			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); +			!(task_preempt_count(p) & PREEMPT_ACTIVE));  #ifdef CONFIG_LOCKDEP  	/* @@ -1017,6 +1075,108 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)  	__set_task_cpu(p, new_cpu);  } +static void __migrate_swap_task(struct task_struct *p, int cpu) +{ +	if (p->on_rq) { +		struct rq *src_rq, *dst_rq; + +		src_rq = task_rq(p); +		dst_rq = cpu_rq(cpu); + +		deactivate_task(src_rq, p, 0); +		set_task_cpu(p, cpu); +		activate_task(dst_rq, p, 0); +		check_preempt_curr(dst_rq, p, 0); +	} else { +		/* +		 * Task isn't running anymore; make it appear like we migrated +		 * it before it went to sleep. 
This means on wakeup we make the +		 * previous cpu our targer instead of where it really is. +		 */ +		p->wake_cpu = cpu; +	} +} + +struct migration_swap_arg { +	struct task_struct *src_task, *dst_task; +	int src_cpu, dst_cpu; +}; + +static int migrate_swap_stop(void *data) +{ +	struct migration_swap_arg *arg = data; +	struct rq *src_rq, *dst_rq; +	int ret = -EAGAIN; + +	src_rq = cpu_rq(arg->src_cpu); +	dst_rq = cpu_rq(arg->dst_cpu); + +	double_raw_lock(&arg->src_task->pi_lock, +			&arg->dst_task->pi_lock); +	double_rq_lock(src_rq, dst_rq); +	if (task_cpu(arg->dst_task) != arg->dst_cpu) +		goto unlock; + +	if (task_cpu(arg->src_task) != arg->src_cpu) +		goto unlock; + +	if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) +		goto unlock; + +	if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) +		goto unlock; + +	__migrate_swap_task(arg->src_task, arg->dst_cpu); +	__migrate_swap_task(arg->dst_task, arg->src_cpu); + +	ret = 0; + +unlock: +	double_rq_unlock(src_rq, dst_rq); +	raw_spin_unlock(&arg->dst_task->pi_lock); +	raw_spin_unlock(&arg->src_task->pi_lock); + +	return ret; +} + +/* + * Cross migrate two tasks + */ +int migrate_swap(struct task_struct *cur, struct task_struct *p) +{ +	struct migration_swap_arg arg; +	int ret = -EINVAL; + +	arg = (struct migration_swap_arg){ +		.src_task = cur, +		.src_cpu = task_cpu(cur), +		.dst_task = p, +		.dst_cpu = task_cpu(p), +	}; + +	if (arg.src_cpu == arg.dst_cpu) +		goto out; + +	/* +	 * These three tests are all lockless; this is OK since all of them +	 * will be re-checked with proper locks held further down the line. +	 */ +	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) +		goto out; + +	if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) +		goto out; + +	if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) +		goto out; + +	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); +	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); + +out: +	return ret; +} +  struct migration_arg {  	struct task_struct *task;  	int dest_cpu; @@ -1224,7 +1384,7 @@ out:  		 * leave kernel.  		 */  		if (p->mm && printk_ratelimit()) { -			printk_sched("process %d (%s) no longer affine to cpu%d\n", +			printk_deferred("process %d (%s) no longer affine to cpu%d\n",  					task_pid_nr(p), p->comm, cpu);  		}  	} @@ -1236,9 +1396,9 @@ out:   * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.   
*/  static inline -int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)  { -	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); +	cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);  	/*  	 * In order not to call set_task_cpu() on a blocking task we need @@ -1330,12 +1490,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)  	if (rq->idle_stamp) {  		u64 delta = rq_clock(rq) - rq->idle_stamp; -		u64 max = 2*sysctl_sched_migration_cost; +		u64 max = 2*rq->max_idle_balance_cost; + +		update_avg(&rq->avg_idle, delta); -		if (delta > max) +		if (rq->avg_idle > max)  			rq->avg_idle = max; -		else -			update_avg(&rq->avg_idle, delta); +  		rq->idle_stamp = 0;  	}  #endif @@ -1377,13 +1538,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)  }  #ifdef CONFIG_SMP -static void sched_ttwu_pending(void) +void sched_ttwu_pending(void)  {  	struct rq *rq = this_rq();  	struct llist_node *llist = llist_del_all(&rq->wake_list);  	struct task_struct *p; +	unsigned long flags; -	raw_spin_lock(&rq->lock); +	if (!llist) +		return; + +	raw_spin_lock_irqsave(&rq->lock, flags);  	while (llist) {  		p = llist_entry(llist, struct task_struct, wake_entry); @@ -1391,11 +1556,18 @@ static void sched_ttwu_pending(void)  		ttwu_do_activate(rq, p, 0);  	} -	raw_spin_unlock(&rq->lock); +	raw_spin_unlock_irqrestore(&rq->lock, flags);  }  void scheduler_ipi(void)  { +	/* +	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting +	 * TIF_NEED_RESCHED remotely (for the first time) will also send +	 * this IPI. +	 */ +	preempt_fold_need_resched(); +  	if (llist_empty(&this_rq()->wake_list)  			&& !tick_nohz_full_cpu(smp_processor_id())  			&& !got_nohz_idle_kick()) @@ -1430,8 +1602,14 @@ void scheduler_ipi(void)  static void ttwu_queue_remote(struct task_struct *p, int cpu)  { -	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) -		smp_send_reschedule(cpu); +	struct rq *rq = cpu_rq(cpu); + +	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { +		if (!set_nr_if_polling(rq->idle)) +			smp_send_reschedule(cpu); +		else +			trace_sched_wake_idle_without_ipi(cpu); +	}  }  bool cpus_share_cache(int this_cpu, int that_cpu) @@ -1513,7 +1691,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)  	if (p->sched_class->task_waking)  		p->sched_class->task_waking(p); -	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); +	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);  	if (task_cpu(p) != cpu) {  		wake_flags |= WF_MIGRATED;  		set_task_cpu(p, cpu); @@ -1595,7 +1773,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)   *   * __sched_fork() is basic setup used by init_idle() too:   */ -static void __sched_fork(struct task_struct *p) +static void __sched_fork(unsigned long clone_flags, struct task_struct *p)  {  	p->on_rq			= 0; @@ -1611,6 +1789,13 @@ static void __sched_fork(struct task_struct *p)  	memset(&p->se.statistics, 0, sizeof(p->se.statistics));  #endif +	RB_CLEAR_NODE(&p->dl.rb_node); +	hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	p->dl.dl_runtime = p->dl.runtime = 0; +	p->dl.dl_deadline = p->dl.deadline = 0; +	p->dl.dl_period = 0; +	p->dl.flags = 0; +  	INIT_LIST_HEAD(&p->rt.run_list);  #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -1619,16 +1804,26 @@ static void __sched_fork(struct task_struct *p)  #ifdef CONFIG_NUMA_BALANCING  	if (p->mm && 
atomic_read(&p->mm->mm_users) == 1) { -		p->mm->numa_next_scan = jiffies; -		p->mm->numa_next_reset = jiffies; +		p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);  		p->mm->numa_scan_seq = 0;  	} +	if (clone_flags & CLONE_VM) +		p->numa_preferred_nid = current->numa_preferred_nid; +	else +		p->numa_preferred_nid = -1; +  	p->node_stamp = 0ULL;  	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; -	p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;  	p->numa_scan_period = sysctl_numa_balancing_scan_delay;  	p->numa_work.next = &p->numa_work; +	p->numa_faults_memory = NULL; +	p->numa_faults_buffer_memory = NULL; +	p->last_task_numa_placement = 0; +	p->last_sum_exec_runtime = 0; + +	INIT_LIST_HEAD(&p->numa_entry); +	p->numa_group = NULL;  #endif /* CONFIG_NUMA_BALANCING */  } @@ -1649,17 +1844,39 @@ void set_numabalancing_state(bool enabled)  	numabalancing_enabled = enabled;  }  #endif /* CONFIG_SCHED_DEBUG */ -#endif /* CONFIG_NUMA_BALANCING */ + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_numa_balancing(struct ctl_table *table, int write, +			 void __user *buffer, size_t *lenp, loff_t *ppos) +{ +	struct ctl_table t; +	int err; +	int state = numabalancing_enabled; + +	if (write && !capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	t = *table; +	t.data = &state; +	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); +	if (err < 0) +		return err; +	if (write) +		set_numabalancing_state(state); +	return err; +} +#endif +#endif  /*   * fork()/clone()-time setup:   */ -void sched_fork(struct task_struct *p) +int sched_fork(unsigned long clone_flags, struct task_struct *p)  {  	unsigned long flags;  	int cpu = get_cpu(); -	__sched_fork(p); +	__sched_fork(clone_flags, p);  	/*  	 * We mark the process as running here. This guarantees that  	 * nobody will actually run it, and a signal or other external @@ -1676,7 +1893,7 @@ void sched_fork(struct task_struct *p)  	 * Revert to default priority/policy on fork if requested.  	 */  	if (unlikely(p->sched_reset_on_fork)) { -		if (task_has_rt_policy(p)) { +		if (task_has_dl_policy(p) || task_has_rt_policy(p)) {  			p->policy = SCHED_NORMAL;  			p->static_prio = NICE_TO_PRIO(0);  			p->rt_priority = 0; @@ -1693,8 +1910,14 @@ void sched_fork(struct task_struct *p)  		p->sched_reset_on_fork = 0;  	} -	if (!rt_prio(p->prio)) +	if (dl_prio(p->prio)) { +		put_cpu(); +		return -EAGAIN; +	} else if (rt_prio(p->prio)) { +		p->sched_class = &rt_sched_class; +	} else {  		p->sched_class = &fair_sched_class; +	}  	if (p->sched_class->task_fork)  		p->sched_class->task_fork(p); @@ -1717,17 +1940,127 @@ void sched_fork(struct task_struct *p)  #if defined(CONFIG_SMP)  	p->on_cpu = 0;  #endif -#ifdef CONFIG_PREEMPT_COUNT -	/* Want to start with kernel preemption disabled. */ -	task_thread_info(p)->preempt_count = 1; -#endif +	init_task_preempt_count(p);  #ifdef CONFIG_SMP  	plist_node_init(&p->pushable_tasks, MAX_PRIO); +	RB_CLEAR_NODE(&p->pushable_dl_tasks);  #endif  	put_cpu(); +	return 0; +} + +unsigned long to_ratio(u64 period, u64 runtime) +{ +	if (runtime == RUNTIME_INF) +		return 1ULL << 20; + +	/* +	 * Doing this here saves a lot of checks in all +	 * the calling paths, and returning zero seems +	 * safe for them anyway. 
+	 */ +	if (period == 0) +		return 0; + +	return div64_u64(runtime << 20, period); +} + +#ifdef CONFIG_SMP +inline struct dl_bw *dl_bw_of(int i) +{ +	return &cpu_rq(i)->rd->dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ +	struct root_domain *rd = cpu_rq(i)->rd; +	int cpus = 0; + +	for_each_cpu_and(i, rd->span, cpu_active_mask) +		cpus++; + +	return cpus; +} +#else +inline struct dl_bw *dl_bw_of(int i) +{ +	return &cpu_rq(i)->dl.dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ +	return 1; +} +#endif + +static inline +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +{ +	dl_b->total_bw -= tsk_bw; +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +{ +	dl_b->total_bw += tsk_bw; +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ +	return dl_b->bw != -1 && +	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; +} + +/* + * We must be sure that accepting a new task (or allowing changing the + * parameters of an existing one) is consistent with the bandwidth + * constraints. If yes, this function also accordingly updates the currently + * allocated bandwidth to reflect the new situation. + * + * This function is called while holding p's rq->lock. + */ +static int dl_overflow(struct task_struct *p, int policy, +		       const struct sched_attr *attr) +{ + +	struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); +	u64 period = attr->sched_period ?: attr->sched_deadline; +	u64 runtime = attr->sched_runtime; +	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; +	int cpus, err = -1; + +	if (new_bw == p->dl.dl_bw) +		return 0; + +	/* +	 * Either if a task, enters, leave, or stays -deadline but changes +	 * its parameters, we may need to update accordingly the total +	 * allocated bandwidth of the container. +	 */ +	raw_spin_lock(&dl_b->lock); +	cpus = dl_bw_cpus(task_cpu(p)); +	if (dl_policy(policy) && !task_has_dl_policy(p) && +	    !__dl_overflow(dl_b, cpus, 0, new_bw)) { +		__dl_add(dl_b, new_bw); +		err = 0; +	} else if (dl_policy(policy) && task_has_dl_policy(p) && +		   !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { +		__dl_clear(dl_b, p->dl.dl_bw); +		__dl_add(dl_b, new_bw); +		err = 0; +	} else if (!dl_policy(policy) && task_has_dl_policy(p)) { +		__dl_clear(dl_b, p->dl.dl_bw); +		err = 0; +	} +	raw_spin_unlock(&dl_b->lock); + +	return err;  } +extern void init_dl_bw(struct dl_bw *dl_b); +  /*   * wake_up_new_task - wake up a newly created task for the first time.   
* @@ -1747,7 +2080,7 @@ void wake_up_new_task(struct task_struct *p)  	 *  - cpus_allowed can change in the fork path  	 *  - any previously selected cpu might disappear through hotplug  	 */ -	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); +	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));  #endif  	/* Initialize new task's runnable average */ @@ -1838,7 +2171,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,  		    struct task_struct *next)  {  	trace_sched_switch(prev, next); -	sched_info_switch(prev, next); +	sched_info_switch(rq, prev, next);  	perf_event_task_sched_out(prev, next);  	fire_sched_out_preempt_notifiers(prev, next);  	prepare_lock_switch(rq, next); @@ -1890,6 +2223,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  	if (mm)  		mmdrop(mm);  	if (unlikely(prev_state == TASK_DEAD)) { +		if (prev->sched_class->task_dead) +			prev->sched_class->task_dead(prev); +  		/*  		 * Remove function-return probe instances associated with this  		 * task and put them back on the free list. @@ -1903,13 +2239,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  #ifdef CONFIG_SMP -/* assumes rq->lock is held */ -static inline void pre_schedule(struct rq *rq, struct task_struct *prev) -{ -	if (prev->sched_class->pre_schedule) -		prev->sched_class->pre_schedule(rq, prev); -} -  /* rq->lock is NOT held, but preemption is disabled */  static inline void post_schedule(struct rq *rq)  { @@ -1927,10 +2256,6 @@ static inline void post_schedule(struct rq *rq)  #else -static inline void pre_schedule(struct rq *rq, struct task_struct *p) -{ -} -  static inline void post_schedule(struct rq *rq)  {  } @@ -1941,7 +2266,7 @@ static inline void post_schedule(struct rq *rq)   * schedule_tail - first thing a freshly forked thread must call.   * @prev: the thread we just switched away from.   */ -asmlinkage void schedule_tail(struct task_struct *prev) +asmlinkage __visible void schedule_tail(struct task_struct *prev)  	__releases(rq->lock)  {  	struct rq *rq = this_rq(); @@ -2073,7 +2398,7 @@ void sched_exec(void)  	int dest_cpu;  	raw_spin_lock_irqsave(&p->pi_lock, flags); -	dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); +	dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);  	if (dest_cpu == smp_processor_id())  		goto unlock; @@ -2140,6 +2465,20 @@ unsigned long long task_sched_runtime(struct task_struct *p)  	struct rq *rq;  	u64 ns = 0; +#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) +	/* +	 * 64-bit doesn't need locks to atomically read a 64bit value. +	 * So we have a optimization chance when the task's delta_exec is 0. +	 * Reading ->on_cpu is racy, but this is ok. +	 * +	 * If we race with it leaving cpu, we'll take a lock. So we're correct. +	 * If we race with it entering cpu, unaccounted time is 0. This is +	 * indistinguishable from the read occurring a few cycles earlier. 
+	 */ +	if (!p->on_cpu) +		return p->se.sum_exec_runtime; +#endif +  	rq = task_rq_lock(p, &flags);  	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);  	task_rq_unlock(rq, p, &flags); @@ -2169,7 +2508,7 @@ void scheduler_tick(void)  #ifdef CONFIG_SMP  	rq->idle_balance = idle_cpu(cpu); -	trigger_load_balance(rq, cpu); +	trigger_load_balance(rq);  #endif  	rq_last_tick_reset(rq);  } @@ -2198,7 +2537,7 @@ u64 scheduler_tick_max_deferment(void)  	if (time_before_eq(next, now))  		return 0; -	return jiffies_to_usecs(next - now) * NSEC_PER_USEC; +	return jiffies_to_nsecs(next - now);  }  #endif @@ -2215,7 +2554,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)  #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \  				defined(CONFIG_PREEMPT_TRACER)) -void __kprobes add_preempt_count(int val) +void preempt_count_add(int val)  {  #ifdef CONFIG_DEBUG_PREEMPT  	/* @@ -2224,7 +2563,7 @@ void __kprobes add_preempt_count(int val)  	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))  		return;  #endif -	preempt_count() += val; +	__preempt_count_add(val);  #ifdef CONFIG_DEBUG_PREEMPT  	/*  	 * Spinlock count overflowing soon? @@ -2232,12 +2571,18 @@ void __kprobes add_preempt_count(int val)  	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=  				PREEMPT_MASK - 10);  #endif -	if (preempt_count() == val) -		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); +	if (preempt_count() == val) { +		unsigned long ip = get_parent_ip(CALLER_ADDR1); +#ifdef CONFIG_DEBUG_PREEMPT +		current->preempt_disable_ip = ip; +#endif +		trace_preempt_off(CALLER_ADDR0, ip); +	}  } -EXPORT_SYMBOL(add_preempt_count); +EXPORT_SYMBOL(preempt_count_add); +NOKPROBE_SYMBOL(preempt_count_add); -void __kprobes sub_preempt_count(int val) +void preempt_count_sub(int val)  {  #ifdef CONFIG_DEBUG_PREEMPT  	/* @@ -2255,9 +2600,10 @@ void __kprobes sub_preempt_count(int val)  	if (preempt_count() == val)  		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); -	preempt_count() -= val; +	__preempt_count_sub(val);  } -EXPORT_SYMBOL(sub_preempt_count); +EXPORT_SYMBOL(preempt_count_sub); +NOKPROBE_SYMBOL(preempt_count_sub);  #endif @@ -2276,6 +2622,13 @@ static noinline void __schedule_bug(struct task_struct *prev)  	print_modules();  	if (irqs_disabled())  		print_irqtrace_events(prev); +#ifdef CONFIG_DEBUG_PREEMPT +	if (in_atomic_preempt_off()) { +		pr_err("Preemption disabled at:"); +		print_ip_sym(current->preempt_disable_ip); +		pr_cont("\n"); +	} +#endif  	dump_stack();  	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);  } @@ -2287,10 +2640,10 @@ static inline void schedule_debug(struct task_struct *prev)  {  	/*  	 * Test if we are atomic. Since do_exit() needs to call into -	 * schedule() atomically, we ignore that path for now. -	 * Otherwise, whine if we are scheduling when we should not be. +	 * schedule() atomically, we ignore that path. Otherwise whine +	 * if we are scheduling when we should not.  	 
*/ -	if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) +	if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))  		__schedule_bug(prev);  	rcu_sleep_check(); @@ -2299,36 +2652,40 @@ static inline void schedule_debug(struct task_struct *prev)  	schedstat_inc(this_rq(), sched_count);  } -static void put_prev_task(struct rq *rq, struct task_struct *prev) -{ -	if (prev->on_rq || rq->skip_clock_update < 0) -		update_rq_clock(rq); -	prev->sched_class->put_prev_task(rq, prev); -} -  /*   * Pick up the highest-prio task:   */  static inline struct task_struct * -pick_next_task(struct rq *rq) +pick_next_task(struct rq *rq, struct task_struct *prev)  { -	const struct sched_class *class; +	const struct sched_class *class = &fair_sched_class;  	struct task_struct *p;  	/*  	 * Optimization: we know that if all tasks are in  	 * the fair class we can call that function directly:  	 */ -	if (likely(rq->nr_running == rq->cfs.h_nr_running)) { -		p = fair_sched_class.pick_next_task(rq); -		if (likely(p)) -			return p; +	if (likely(prev->sched_class == class && +		   rq->nr_running == rq->cfs.h_nr_running)) { +		p = fair_sched_class.pick_next_task(rq, prev); +		if (unlikely(p == RETRY_TASK)) +			goto again; + +		/* assumes fair_sched_class->next == idle_sched_class */ +		if (unlikely(!p)) +			p = idle_sched_class.pick_next_task(rq, prev); + +		return p;  	} +again:  	for_each_class(class) { -		p = class->pick_next_task(rq); -		if (p) +		p = class->pick_next_task(rq, prev); +		if (p) { +			if (unlikely(p == RETRY_TASK)) +				goto again;  			return p; +		}  	}  	BUG(); /* the idle class will always have a runnable task */ @@ -2422,14 +2779,12 @@ need_resched:  		switch_count = &prev->nvcsw;  	} -	pre_schedule(rq, prev); - -	if (unlikely(!rq->nr_running)) -		idle_balance(cpu, rq); +	if (prev->on_rq || rq->skip_clock_update < 0) +		update_rq_clock(rq); -	put_prev_task(rq, prev); -	next = pick_next_task(rq); +	next = pick_next_task(rq, prev);  	clear_tsk_need_resched(prev); +	clear_preempt_need_resched();  	rq->skip_clock_update = 0;  	if (likely(prev != next)) { @@ -2468,7 +2823,7 @@ static inline void sched_submit_work(struct task_struct *tsk)  		blk_schedule_flush_plug(tsk);  } -asmlinkage void __sched schedule(void) +asmlinkage __visible void __sched schedule(void)  {  	struct task_struct *tsk = current; @@ -2478,7 +2833,7 @@ asmlinkage void __sched schedule(void)  EXPORT_SYMBOL(schedule);  #ifdef CONFIG_CONTEXT_TRACKING -asmlinkage void __sched schedule_user(void) +asmlinkage __visible void __sched schedule_user(void)  {  	/*  	 * If we come here after a random call to set_need_resched(), @@ -2510,7 +2865,7 @@ void __sched schedule_preempt_disabled(void)   * off of preempt_enable. Kernel preemptions off return from interrupt   * occur there and call schedule directly.   
*/ -asmlinkage void __sched notrace preempt_schedule(void) +asmlinkage __visible void __sched notrace preempt_schedule(void)  {  	/*  	 * If there is a non-zero preempt_count or interrupts are disabled, @@ -2520,9 +2875,9 @@ asmlinkage void __sched notrace preempt_schedule(void)  		return;  	do { -		add_preempt_count_notrace(PREEMPT_ACTIVE); +		__preempt_count_add(PREEMPT_ACTIVE);  		__schedule(); -		sub_preempt_count_notrace(PREEMPT_ACTIVE); +		__preempt_count_sub(PREEMPT_ACTIVE);  		/*  		 * Check again in case we missed a preemption opportunity @@ -2531,7 +2886,9 @@ asmlinkage void __sched notrace preempt_schedule(void)  		barrier();  	} while (need_resched());  } +NOKPROBE_SYMBOL(preempt_schedule);  EXPORT_SYMBOL(preempt_schedule); +#endif /* CONFIG_PREEMPT */  /*   * this is the entry point to schedule() from kernel preemption @@ -2539,22 +2896,21 @@ EXPORT_SYMBOL(preempt_schedule);   * Note, that this is called and return with irqs disabled. This will   * protect us against recursive calling from irq.   */ -asmlinkage void __sched preempt_schedule_irq(void) +asmlinkage __visible void __sched preempt_schedule_irq(void)  { -	struct thread_info *ti = current_thread_info();  	enum ctx_state prev_state;  	/* Catch callers which need to be fixed */ -	BUG_ON(ti->preempt_count || !irqs_disabled()); +	BUG_ON(preempt_count() || !irqs_disabled());  	prev_state = exception_enter();  	do { -		add_preempt_count(PREEMPT_ACTIVE); +		__preempt_count_add(PREEMPT_ACTIVE);  		local_irq_enable();  		__schedule();  		local_irq_disable(); -		sub_preempt_count(PREEMPT_ACTIVE); +		__preempt_count_sub(PREEMPT_ACTIVE);  		/*  		 * Check again in case we missed a preemption opportunity @@ -2566,8 +2922,6 @@ asmlinkage void __sched preempt_schedule_irq(void)  	exception_exit(prev_state);  } -#endif /* CONFIG_PREEMPT */ -  int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,  			  void *key)  { @@ -2575,439 +2929,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,  }  EXPORT_SYMBOL(default_wake_function); -/* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just - * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. - * - * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns - * zero in this (rare) case, and we handle it by continuing to scan the queue. - */ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, -			int nr_exclusive, int wake_flags, void *key) -{ -	wait_queue_t *curr, *next; - -	list_for_each_entry_safe(curr, next, &q->task_list, task_list) { -		unsigned flags = curr->flags; - -		if (curr->func(curr, mode, wake_flags, key) && -				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) -			break; -	} -} - -/** - * __wake_up - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: is directly passed to the wakeup function - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. 
- */ -void __wake_up(wait_queue_head_t *q, unsigned int mode, -			int nr_exclusive, void *key) -{ -	unsigned long flags; - -	spin_lock_irqsave(&q->lock, flags); -	__wake_up_common(q, mode, nr_exclusive, 0, key); -	spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(__wake_up); - -/* - * Same as __wake_up but called with the spinlock in wait_queue_head_t held. - */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) -{ -	__wake_up_common(q, mode, nr, 0, NULL); -} -EXPORT_SYMBOL_GPL(__wake_up_locked); - -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) -{ -	__wake_up_common(q, mode, 1, 0, key); -} -EXPORT_SYMBOL_GPL(__wake_up_locked_key); - -/** - * __wake_up_sync_key - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: opaque value to be passed to wakeup targets - * - * The sync wakeup differs that the waker knows that it will schedule - * away soon, so while the target thread will be woken up, it will not - * be migrated to another CPU - ie. the two threads are 'synchronized' - * with each other. This can prevent needless bouncing between CPUs. - * - * On UP it can prevent extra preemption. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, -			int nr_exclusive, void *key) -{ -	unsigned long flags; -	int wake_flags = WF_SYNC; - -	if (unlikely(!q)) -		return; - -	if (unlikely(nr_exclusive != 1)) -		wake_flags = 0; - -	spin_lock_irqsave(&q->lock, flags); -	__wake_up_common(q, mode, nr_exclusive, wake_flags, key); -	spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(__wake_up_sync_key); - -/* - * __wake_up_sync - see __wake_up_sync_key() - */ -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) -{ -	__wake_up_sync_key(q, mode, nr_exclusive, NULL); -} -EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */ - -/** - * complete: - signals a single thread waiting on this completion - * @x:  holds the state of this particular completion - * - * This will wake up a single thread waiting on this completion. Threads will be - * awakened in the same order in which they were queued. - * - * See also complete_all(), wait_for_completion() and related routines. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void complete(struct completion *x) -{ -	unsigned long flags; - -	spin_lock_irqsave(&x->wait.lock, flags); -	x->done++; -	__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); -	spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete); - -/** - * complete_all: - signals all threads waiting on this completion - * @x:  holds the state of this particular completion - * - * This will wake up all threads waiting on this particular completion event. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. 
- */ -void complete_all(struct completion *x) -{ -	unsigned long flags; - -	spin_lock_irqsave(&x->wait.lock, flags); -	x->done += UINT_MAX/2; -	__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); -	spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete_all); - -static inline long __sched -do_wait_for_common(struct completion *x, -		   long (*action)(long), long timeout, int state) -{ -	if (!x->done) { -		DECLARE_WAITQUEUE(wait, current); - -		__add_wait_queue_tail_exclusive(&x->wait, &wait); -		do { -			if (signal_pending_state(state, current)) { -				timeout = -ERESTARTSYS; -				break; -			} -			__set_current_state(state); -			spin_unlock_irq(&x->wait.lock); -			timeout = action(timeout); -			spin_lock_irq(&x->wait.lock); -		} while (!x->done && timeout); -		__remove_wait_queue(&x->wait, &wait); -		if (!x->done) -			return timeout; -	} -	x->done--; -	return timeout ?: 1; -} - -static inline long __sched -__wait_for_common(struct completion *x, -		  long (*action)(long), long timeout, int state) -{ -	might_sleep(); - -	spin_lock_irq(&x->wait.lock); -	timeout = do_wait_for_common(x, action, timeout, state); -	spin_unlock_irq(&x->wait.lock); -	return timeout; -} - -static long __sched -wait_for_common(struct completion *x, long timeout, int state) -{ -	return __wait_for_common(x, schedule_timeout, timeout, state); -} - -static long __sched -wait_for_common_io(struct completion *x, long timeout, int state) -{ -	return __wait_for_common(x, io_schedule_timeout, timeout, state); -} - -/** - * wait_for_completion: - waits for completion of a task - * @x:  holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It is NOT - * interruptible and there is no timeout. - * - * See also similar routines (i.e. wait_for_completion_timeout()) with timeout - * and interrupt capability. Also see complete(). - */ -void __sched wait_for_completion(struct completion *x) -{ -	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion); - -/** - * wait_for_completion_timeout: - waits for completion of a task (w/timeout) - * @x:  holds the state of this particular completion - * @timeout:  timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. - * - * Return: 0 if timed out, and positive (at least 1, or number of jiffies left - * till timeout) if completed. - */ -unsigned long __sched -wait_for_completion_timeout(struct completion *x, unsigned long timeout) -{ -	return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_timeout); - -/** - * wait_for_completion_io: - waits for completion of a task - * @x:  holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It is NOT - * interruptible and there is no timeout. The caller is accounted as waiting - * for IO. - */ -void __sched wait_for_completion_io(struct completion *x) -{ -	wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_io); - -/** - * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) - * @x:  holds the state of this particular completion - * @timeout:  timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. The timeout is in jiffies. 
It is not - * interruptible. The caller is accounted as waiting for IO. - * - * Return: 0 if timed out, and positive (at least 1, or number of jiffies left - * till timeout) if completed. - */ -unsigned long __sched -wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) -{ -	return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_io_timeout); - -/** - * wait_for_completion_interruptible: - waits for completion of a task (w/intr) - * @x:  holds the state of this particular completion - * - * This waits for completion of a specific task to be signaled. It is - * interruptible. - * - * Return: -ERESTARTSYS if interrupted, 0 if completed. - */ -int __sched wait_for_completion_interruptible(struct completion *x) -{ -	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); -	if (t == -ERESTARTSYS) -		return t; -	return 0; -} -EXPORT_SYMBOL(wait_for_completion_interruptible); - -/** - * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) - * @x:  holds the state of this particular completion - * @timeout:  timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. It is interruptible. The timeout is in jiffies. - * - * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, - * or number of jiffies left till timeout) if completed. - */ -long __sched -wait_for_completion_interruptible_timeout(struct completion *x, -					  unsigned long timeout) -{ -	return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); - -/** - * wait_for_completion_killable: - waits for completion of a task (killable) - * @x:  holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It can be - * interrupted by a kill signal. - * - * Return: -ERESTARTSYS if interrupted, 0 if completed. - */ -int __sched wait_for_completion_killable(struct completion *x) -{ -	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); -	if (t == -ERESTARTSYS) -		return t; -	return 0; -} -EXPORT_SYMBOL(wait_for_completion_killable); - -/** - * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) - * @x:  holds the state of this particular completion - * @timeout:  timeout value in jiffies - * - * This waits for either a completion of a specific task to be - * signaled or for a specified timeout to expire. It can be - * interrupted by a kill signal. The timeout is in jiffies. - * - * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, - * or number of jiffies left till timeout) if completed. - */ -long __sched -wait_for_completion_killable_timeout(struct completion *x, -				     unsigned long timeout) -{ -	return wait_for_common(x, timeout, TASK_KILLABLE); -} -EXPORT_SYMBOL(wait_for_completion_killable_timeout); - -/** - *	try_wait_for_completion - try to decrement a completion without blocking - *	@x:	completion structure - * - *	Return: 0 if a decrement cannot be done without blocking - *		 1 if a decrement succeeded. - * - *	If a completion is being used as a counting completion, - *	attempt to decrement the counter without blocking. This - *	enables us to avoid waiting if the resource the completion - *	is protecting is not available. 
- */ -bool try_wait_for_completion(struct completion *x) -{ -	unsigned long flags; -	int ret = 1; - -	spin_lock_irqsave(&x->wait.lock, flags); -	if (!x->done) -		ret = 0; -	else -		x->done--; -	spin_unlock_irqrestore(&x->wait.lock, flags); -	return ret; -} -EXPORT_SYMBOL(try_wait_for_completion); - -/** - *	completion_done - Test to see if a completion has any waiters - *	@x:	completion structure - * - *	Return: 0 if there are waiters (wait_for_completion() in progress) - *		 1 if there are no waiters. - * - */ -bool completion_done(struct completion *x) -{ -	unsigned long flags; -	int ret = 1; - -	spin_lock_irqsave(&x->wait.lock, flags); -	if (!x->done) -		ret = 0; -	spin_unlock_irqrestore(&x->wait.lock, flags); -	return ret; -} -EXPORT_SYMBOL(completion_done); - -static long __sched -sleep_on_common(wait_queue_head_t *q, int state, long timeout) -{ -	unsigned long flags; -	wait_queue_t wait; - -	init_waitqueue_entry(&wait, current); - -	__set_current_state(state); - -	spin_lock_irqsave(&q->lock, flags); -	__add_wait_queue(q, &wait); -	spin_unlock(&q->lock); -	timeout = schedule_timeout(timeout); -	spin_lock_irq(&q->lock); -	__remove_wait_queue(q, &wait); -	spin_unlock_irqrestore(&q->lock, flags); - -	return timeout; -} - -void __sched interruptible_sleep_on(wait_queue_head_t *q) -{ -	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(interruptible_sleep_on); - -long __sched -interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ -	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(interruptible_sleep_on_timeout); - -void __sched sleep_on(wait_queue_head_t *q) -{ -	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(sleep_on); - -long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ -	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(sleep_on_timeout); -  #ifdef CONFIG_RT_MUTEXES  /* @@ -3018,15 +2939,16 @@ EXPORT_SYMBOL(sleep_on_timeout);   * This function changes the 'effective' priority of a task. It does   * not touch ->normal_prio like __setscheduler().   * - * Used by the rt_mutex code to implement priority inheritance logic. + * Used by the rt_mutex code to implement priority inheritance + * logic. Call site only calls if the priority of the task changed.   */  void rt_mutex_setprio(struct task_struct *p, int prio)  { -	int oldprio, on_rq, running; +	int oldprio, on_rq, running, enqueue_flag = 0;  	struct rq *rq;  	const struct sched_class *prev_class; -	BUG_ON(prio < 0 || prio > MAX_PRIO); +	BUG_ON(prio > MAX_PRIO);  	rq = __task_rq_lock(p); @@ -3049,6 +2971,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)  	}  	trace_sched_pi_setprio(p, prio); +	p->pi_top_task = rt_mutex_get_top_task(p);  	oldprio = p->prio;  	prev_class = p->sched_class;  	on_rq = p->on_rq; @@ -3058,30 +2981,56 @@ void rt_mutex_setprio(struct task_struct *p, int prio)  	if (running)  		p->sched_class->put_prev_task(rq, p); -	if (rt_prio(prio)) +	/* +	 * Boosting condition are: +	 * 1. -rt task is running and holds mutex A +	 *      --> -dl task blocks on mutex A +	 * +	 * 2. 
-dl task is running and holds mutex A +	 *      --> -dl task blocks on mutex A and could preempt the +	 *          running task +	 */ +	if (dl_prio(prio)) { +		if (!dl_prio(p->normal_prio) || (p->pi_top_task && +			dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { +			p->dl.dl_boosted = 1; +			p->dl.dl_throttled = 0; +			enqueue_flag = ENQUEUE_REPLENISH; +		} else +			p->dl.dl_boosted = 0; +		p->sched_class = &dl_sched_class; +	} else if (rt_prio(prio)) { +		if (dl_prio(oldprio)) +			p->dl.dl_boosted = 0; +		if (oldprio < prio) +			enqueue_flag = ENQUEUE_HEAD;  		p->sched_class = &rt_sched_class; -	else +	} else { +		if (dl_prio(oldprio)) +			p->dl.dl_boosted = 0;  		p->sched_class = &fair_sched_class; +	}  	p->prio = prio;  	if (running)  		p->sched_class->set_curr_task(rq);  	if (on_rq) -		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); +		enqueue_task(rq, p, enqueue_flag);  	check_class_changed(rq, p, prev_class, oldprio);  out_unlock:  	__task_rq_unlock(rq);  }  #endif +  void set_user_nice(struct task_struct *p, long nice)  {  	int old_prio, delta, on_rq;  	unsigned long flags;  	struct rq *rq; -	if (TASK_NICE(p) == nice || nice < -20 || nice > 19) +	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)  		return;  	/*  	 * We have to be careful, if called from sys_setpriority(), @@ -3092,9 +3041,9 @@ void set_user_nice(struct task_struct *p, long nice)  	 * The RT priorities are set via sched_setscheduler(), but we still  	 * allow the 'normal' nice value to be set - but as expected  	 * it wont have any effect on scheduling until the task is -	 * SCHED_FIFO/SCHED_RR: +	 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:  	 */ -	if (task_has_rt_policy(p)) { +	if (task_has_dl_policy(p) || task_has_rt_policy(p)) {  		p->static_prio = NICE_TO_PRIO(nice);  		goto out_unlock;  	} @@ -3130,7 +3079,7 @@ EXPORT_SYMBOL(set_user_nice);  int can_nice(const struct task_struct *p, const int nice)  {  	/* convert nice value [19,-20] to rlimit style value [1,40] */ -	int nice_rlim = 20 - nice; +	int nice_rlim = nice_to_rlimit(nice);  	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||  		capable(CAP_SYS_NICE)); @@ -3154,17 +3103,10 @@ SYSCALL_DEFINE1(nice, int, increment)  	 * We don't have to worry. Conceptually one call occurs first  	 * and we have a single winner.  	 */ -	if (increment < -40) -		increment = -40; -	if (increment > 40) -		increment = 40; - -	nice = TASK_NICE(current) + increment; -	if (nice < -20) -		nice = -20; -	if (nice > 19) -		nice = 19; +	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); +	nice = task_nice(current) + increment; +	nice = clamp_val(nice, MIN_NICE, MAX_NICE);  	if (increment < 0 && !can_nice(current, nice))  		return -EPERM; @@ -3192,18 +3134,6 @@ int task_prio(const struct task_struct *p)  }  /** - * task_nice - return the nice value of a given task. - * @p: the task in question. - * - * Return: The nice value [ -20 ... 0 ... 19 ]. - */ -int task_nice(const struct task_struct *p) -{ -	return TASK_NICE(p); -} -EXPORT_SYMBOL(task_nice); - -/**   * idle_cpu - is a given cpu idle currently?   * @cpu: the processor in question.   * @@ -3249,20 +3179,126 @@ static struct task_struct *find_process_by_pid(pid_t pid)  	return pid ? find_task_by_vpid(pid) : current;  } -/* Actually do priority change: must hold rq lock. */ +/* + * This function initializes the sched_dl_entity of a newly becoming + * SCHED_DEADLINE task. 
+ * + * Only the static values are considered here, the actual runtime and the + * absolute deadline will be properly calculated when the task is enqueued + * for the first time with its new policy. + */  static void -__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +__setparam_dl(struct task_struct *p, const struct sched_attr *attr)  { +	struct sched_dl_entity *dl_se = &p->dl; + +	init_dl_task_timer(dl_se); +	dl_se->dl_runtime = attr->sched_runtime; +	dl_se->dl_deadline = attr->sched_deadline; +	dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; +	dl_se->flags = attr->sched_flags; +	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); +	dl_se->dl_throttled = 0; +	dl_se->dl_new = 1; +	dl_se->dl_yielded = 0; +} + +static void __setscheduler_params(struct task_struct *p, +		const struct sched_attr *attr) +{ +	int policy = attr->sched_policy; + +	if (policy == -1) /* setparam */ +		policy = p->policy; +  	p->policy = policy; -	p->rt_priority = prio; + +	if (dl_policy(policy)) +		__setparam_dl(p, attr); +	else if (fair_policy(policy)) +		p->static_prio = NICE_TO_PRIO(attr->sched_nice); + +	/* +	 * __sched_setscheduler() ensures attr->sched_priority == 0 when +	 * !rt_policy. Always setting this ensures that things like +	 * getparam()/getattr() don't report silly values for !rt tasks. +	 */ +	p->rt_priority = attr->sched_priority;  	p->normal_prio = normal_prio(p); -	/* we are holding p->pi_lock already */ -	p->prio = rt_mutex_getprio(p); -	if (rt_prio(p->prio)) +	set_load_weight(p); +} + +/* Actually do priority change: must hold pi & rq lock. */ +static void __setscheduler(struct rq *rq, struct task_struct *p, +			   const struct sched_attr *attr) +{ +	__setscheduler_params(p, attr); + +	/* +	 * If we get here, there was no pi waiters boosting the +	 * task. It is safe to use the normal prio. +	 */ +	p->prio = normal_prio(p); + +	if (dl_prio(p->prio)) +		p->sched_class = &dl_sched_class; +	else if (rt_prio(p->prio))  		p->sched_class = &rt_sched_class;  	else  		p->sched_class = &fair_sched_class; -	set_load_weight(p); +} + +static void +__getparam_dl(struct task_struct *p, struct sched_attr *attr) +{ +	struct sched_dl_entity *dl_se = &p->dl; + +	attr->sched_priority = p->rt_priority; +	attr->sched_runtime = dl_se->dl_runtime; +	attr->sched_deadline = dl_se->dl_deadline; +	attr->sched_period = dl_se->dl_period; +	attr->sched_flags = dl_se->flags; +} + +/* + * This function validates the new parameters of a -deadline task. + * We ask for the deadline not being zero, and greater or equal + * than the runtime, as well as the period of being zero or + * greater than deadline. Furthermore, we have to be sure that + * user parameters are above the internal resolution of 1us (we + * check sched_runtime only since it is always the smaller one) and + * below 2^63 ns (we have to check both sched_deadline and + * sched_period, as the latter can be zero). + */ +static bool +__checkparam_dl(const struct sched_attr *attr) +{ +	/* deadline != 0 */ +	if (attr->sched_deadline == 0) +		return false; + +	/* +	 * Since we truncate DL_SCALE bits, make sure we're at least +	 * that big. +	 */ +	if (attr->sched_runtime < (1ULL << DL_SCALE)) +		return false; + +	/* +	 * Since we use the MSB for wrap-around and sign issues, make +	 * sure it's not set (mind that period can be equal to zero). 
+	 */ +	if (attr->sched_deadline & (1ULL << 63) || +	    attr->sched_period & (1ULL << 63)) +		return false; + +	/* runtime <= deadline <= period (if period != 0) */ +	if ((attr->sched_period != 0 && +	     attr->sched_period < attr->sched_deadline) || +	    attr->sched_deadline < attr->sched_runtime) +		return false; + +	return true;  }  /* @@ -3281,10 +3317,14 @@ static bool check_same_owner(struct task_struct *p)  	return match;  } -static int __sched_setscheduler(struct task_struct *p, int policy, -				const struct sched_param *param, bool user) +static int __sched_setscheduler(struct task_struct *p, +				const struct sched_attr *attr, +				bool user)  { +	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : +		      MAX_RT_PRIO - 1 - attr->sched_priority;  	int retval, oldprio, oldpolicy = -1, on_rq, running; +	int policy = attr->sched_policy;  	unsigned long flags;  	const struct sched_class *prev_class;  	struct rq *rq; @@ -3298,31 +3338,40 @@ recheck:  		reset_on_fork = p->sched_reset_on_fork;  		policy = oldpolicy = p->policy;  	} else { -		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); -		policy &= ~SCHED_RESET_ON_FORK; +		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); -		if (policy != SCHED_FIFO && policy != SCHED_RR && +		if (policy != SCHED_DEADLINE && +				policy != SCHED_FIFO && policy != SCHED_RR &&  				policy != SCHED_NORMAL && policy != SCHED_BATCH &&  				policy != SCHED_IDLE)  			return -EINVAL;  	} +	if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) +		return -EINVAL; +  	/*  	 * Valid priorities for SCHED_FIFO and SCHED_RR are  	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,  	 * SCHED_BATCH and SCHED_IDLE is 0.  	 */ -	if (param->sched_priority < 0 || -	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || -	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) +	if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || +	    (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))  		return -EINVAL; -	if (rt_policy(policy) != (param->sched_priority != 0)) +	if ((dl_policy(policy) && !__checkparam_dl(attr)) || +	    (rt_policy(policy) != (attr->sched_priority != 0)))  		return -EINVAL;  	/*  	 * Allow unprivileged RT tasks to decrease priority:  	 */  	if (user && !capable(CAP_SYS_NICE)) { +		if (fair_policy(policy)) { +			if (attr->sched_nice < task_nice(p) && +			    !can_nice(p, attr->sched_nice)) +				return -EPERM; +		} +  		if (rt_policy(policy)) {  			unsigned long rlim_rtprio =  					task_rlimit(p, RLIMIT_RTPRIO); @@ -3332,17 +3381,26 @@ recheck:  				return -EPERM;  			/* can't increase priority */ -			if (param->sched_priority > p->rt_priority && -			    param->sched_priority > rlim_rtprio) +			if (attr->sched_priority > p->rt_priority && +			    attr->sched_priority > rlim_rtprio)  				return -EPERM;  		} +		 /* +		  * Can't set/change SCHED_DEADLINE policy at all for now +		  * (safest behavior); in the future we would like to allow +		  * unprivileged DL tasks to increase their relative deadline +		  * or reduce their runtime (both ways reducing utilization) +		  */ +		if (dl_policy(policy)) +			return -EPERM; +  		/*  		 * Treat SCHED_IDLE as nice 20. Only allow a switch to  		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.  		 
*/  		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { -			if (!can_nice(p, TASK_NICE(p))) +			if (!can_nice(p, task_nice(p)))  				return -EPERM;  		} @@ -3379,16 +3437,25 @@ recheck:  	}  	/* -	 * If not changing anything there's no need to proceed further: +	 * If not changing anything there's no need to proceed further, +	 * but store a possible modification of reset_on_fork.  	 */ -	if (unlikely(policy == p->policy && (!rt_policy(policy) || -			param->sched_priority == p->rt_priority))) { +	if (unlikely(policy == p->policy)) { +		if (fair_policy(policy) && attr->sched_nice != task_nice(p)) +			goto change; +		if (rt_policy(policy) && attr->sched_priority != p->rt_priority) +			goto change; +		if (dl_policy(policy)) +			goto change; + +		p->sched_reset_on_fork = reset_on_fork;  		task_rq_unlock(rq, p, &flags);  		return 0;  	} +change: -#ifdef CONFIG_RT_GROUP_SCHED  	if (user) { +#ifdef CONFIG_RT_GROUP_SCHED  		/*  		 * Do not allow realtime tasks into groups that have no runtime  		 * assigned. @@ -3399,8 +3466,24 @@ recheck:  			task_rq_unlock(rq, p, &flags);  			return -EPERM;  		} -	}  #endif +#ifdef CONFIG_SMP +		if (dl_bandwidth_enabled() && dl_policy(policy)) { +			cpumask_t *span = rq->rd->span; + +			/* +			 * Don't allow tasks with an affinity mask smaller than +			 * the entire root_domain to become SCHED_DEADLINE. We +			 * will also fail if there's no bandwidth available. +			 */ +			if (!cpumask_subset(span, &p->cpus_allowed) || +			    rq->rd->dl_bw.bw == 0) { +				task_rq_unlock(rq, p, &flags); +				return -EPERM; +			} +		} +#endif +	}  	/* recheck policy now with rq lock held */  	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { @@ -3408,6 +3491,35 @@ recheck:  		task_rq_unlock(rq, p, &flags);  		goto recheck;  	} + +	/* +	 * If setscheduling to SCHED_DEADLINE (or changing the parameters +	 * of a SCHED_DEADLINE task) we need to check if enough bandwidth +	 * is available. +	 */ +	if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { +		task_rq_unlock(rq, p, &flags); +		return -EBUSY; +	} + +	p->sched_reset_on_fork = reset_on_fork; +	oldprio = p->prio; + +	/* +	 * Special case for priority boosted tasks. +	 * +	 * If the new priority is lower or equal (user space view) +	 * than the current (boosted) priority, we just store the new +	 * normal parameters and do not touch the scheduler class and +	 * the runqueue. This will be done when the task deboost +	 * itself. +	 */ +	if (rt_mutex_check_prio(p, newprio)) { +		__setscheduler_params(p, attr); +		task_rq_unlock(rq, p, &flags); +		return 0; +	} +  	on_rq = p->on_rq;  	running = task_current(rq, p);  	if (on_rq) @@ -3415,16 +3527,18 @@ recheck:  	if (running)  		p->sched_class->put_prev_task(rq, p); -	p->sched_reset_on_fork = reset_on_fork; - -	oldprio = p->prio;  	prev_class = p->sched_class; -	__setscheduler(rq, p, policy, param->sched_priority); +	__setscheduler(rq, p, attr);  	if (running)  		p->sched_class->set_curr_task(rq); -	if (on_rq) -		enqueue_task(rq, p, 0); +	if (on_rq) { +		/* +		 * We enqueue to tail when the priority of a task is +		 * increased (user space view). +		 */ +		enqueue_task(rq, p, oldprio <= p->prio ? 
ENQUEUE_HEAD : 0); +	}  	check_class_changed(rq, p, prev_class, oldprio);  	task_rq_unlock(rq, p, &flags); @@ -3434,6 +3548,26 @@ recheck:  	return 0;  } +static int _sched_setscheduler(struct task_struct *p, int policy, +			       const struct sched_param *param, bool check) +{ +	struct sched_attr attr = { +		.sched_policy   = policy, +		.sched_priority = param->sched_priority, +		.sched_nice	= PRIO_TO_NICE(p->static_prio), +	}; + +	/* +	 * Fixup the legacy SCHED_RESET_ON_FORK hack +	 */ +	if (policy & SCHED_RESET_ON_FORK) { +		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; +		policy &= ~SCHED_RESET_ON_FORK; +		attr.sched_policy = policy; +	} + +	return __sched_setscheduler(p, &attr, check); +}  /**   * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.   * @p: the task in question. @@ -3447,10 +3581,16 @@ recheck:  int sched_setscheduler(struct task_struct *p, int policy,  		       const struct sched_param *param)  { -	return __sched_setscheduler(p, policy, param, true); +	return _sched_setscheduler(p, policy, param, true);  }  EXPORT_SYMBOL_GPL(sched_setscheduler); +int sched_setattr(struct task_struct *p, const struct sched_attr *attr) +{ +	return __sched_setscheduler(p, attr, true); +} +EXPORT_SYMBOL_GPL(sched_setattr); +  /**   * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.   * @p: the task in question. @@ -3467,7 +3607,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);  int sched_setscheduler_nocheck(struct task_struct *p, int policy,  			       const struct sched_param *param)  { -	return __sched_setscheduler(p, policy, param, false); +	return _sched_setscheduler(p, policy, param, false);  }  static int @@ -3492,6 +3632,77 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)  	return retval;  } +/* + * Mimics kernel/events/core.c perf_copy_attr(). + */ +static int sched_copy_attr(struct sched_attr __user *uattr, +			   struct sched_attr *attr) +{ +	u32 size; +	int ret; + +	if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) +		return -EFAULT; + +	/* +	 * zero the full structure, so that a short copy will be nice. +	 */ +	memset(attr, 0, sizeof(*attr)); + +	ret = get_user(size, &uattr->size); +	if (ret) +		return ret; + +	if (size > PAGE_SIZE)	/* silly large */ +		goto err_size; + +	if (!size)		/* abi compat */ +		size = SCHED_ATTR_SIZE_VER0; + +	if (size < SCHED_ATTR_SIZE_VER0) +		goto err_size; + +	/* +	 * If we're handed a bigger struct than we know of, +	 * ensure all the unknown bits are 0 - i.e. new +	 * user-space does not rely on any kernel feature +	 * extensions we dont know about yet. +	 */ +	if (size > sizeof(*attr)) { +		unsigned char __user *addr; +		unsigned char __user *end; +		unsigned char val; + +		addr = (void __user *)uattr + sizeof(*attr); +		end  = (void __user *)uattr + size; + +		for (; addr < end; addr++) { +			ret = get_user(val, addr); +			if (ret) +				return ret; +			if (val) +				goto err_size; +		} +		size = sizeof(*attr); +	} + +	ret = copy_from_user(attr, uattr, size); +	if (ret) +		return -EFAULT; + +	/* +	 * XXX: do we want to be lenient like existing syscalls; or do we want +	 * to be strict and return an error on out-of-bounds values? +	 */ +	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); + +	return 0; + +err_size: +	put_user(sizeof(*attr), &uattr->size); +	return -E2BIG; +} +  /**   * sys_sched_setscheduler - set/change the scheduler policy and RT priority   * @pid: the pid in question. 
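
Illustrative sketch (not part of the patch): sched_copy_attr() above defines the user-to-kernel ABI for the extended parameter block. User space fills sched_attr::size, any trailing bytes beyond what the kernel knows about must be zero, and sched_nice is clamped to [MIN_NICE, MAX_NICE]; the sched_setattr() syscall wired up in the next hunk is the entry point that consumes it. A minimal user-space caller might look as follows, under a few stated assumptions: libc of this era ships neither a wrapper nor the structure, so struct sched_attr is declared locally (field layout matching what sched_copy_attr() reads) and the raw syscall is invoked; SYS_sched_setattr is taken from <sys/syscall.h> where the headers are new enough, otherwise the arch-specific __NR_sched_setattr number is needed; the runtime/deadline/period values are arbitrary, chosen only to satisfy __checkparam_dl() (runtime <= deadline <= period, at least ~1us, below 2^63 ns).

	/* dl_task.c -- editor's hypothetical example, e.g. gcc -O2 -o dl_task dl_task.c */
	#define _GNU_SOURCE
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#ifndef SCHED_DEADLINE
	#define SCHED_DEADLINE	6	/* value from the uapi sched.h additions */
	#endif

	/* Local copy of the ABI structure read by sched_copy_attr(). */
	struct sched_attr {
		uint32_t size;			/* sizeof(struct sched_attr) */
		uint32_t sched_policy;
		uint64_t sched_flags;
		int32_t  sched_nice;		/* SCHED_NORMAL / SCHED_BATCH */
		uint32_t sched_priority;	/* SCHED_FIFO / SCHED_RR */
		uint64_t sched_runtime;		/* SCHED_DEADLINE, nanoseconds */
		uint64_t sched_deadline;
		uint64_t sched_period;
	};

	int main(void)
	{
		struct sched_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size           = sizeof(attr);	/* lets the kernel do fwd/bwd compat */
		attr.sched_policy   = SCHED_DEADLINE;
		attr.sched_runtime  =  10ULL * 1000 * 1000;	/* 10 ms budget        */
		attr.sched_deadline =  30ULL * 1000 * 1000;	/* 30 ms rel. deadline */
		attr.sched_period   = 100ULL * 1000 * 1000;	/* 100 ms period       */

		/* pid 0 means the calling task; the flags argument must be 0 */
		if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
			perror("sched_setattr");
			return 1;
		}

		pause();	/* now running as a -deadline task */
		return 0;
	}

Because the permission hunk above rejects SCHED_DEADLINE for unprivileged callers, the sketch has to run with CAP_SYS_NICE (for instance as root); it can also fail with -EBUSY when dl_overflow() finds insufficient bandwidth in the root domain.
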
@@ -3523,6 +3734,39 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)  }  /** + * sys_sched_setattr - same as above, but with extended sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @flags: for future extension. + */ +SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, +			       unsigned int, flags) +{ +	struct sched_attr attr; +	struct task_struct *p; +	int retval; + +	if (!uattr || pid < 0 || flags) +		return -EINVAL; + +	retval = sched_copy_attr(uattr, &attr); +	if (retval) +		return retval; + +	if ((int)attr.sched_policy < 0) +		return -EINVAL; + +	rcu_read_lock(); +	retval = -ESRCH; +	p = find_process_by_pid(pid); +	if (p != NULL) +		retval = sched_setattr(p, &attr); +	rcu_read_unlock(); + +	return retval; +} + +/**   * sys_sched_getscheduler - get the policy (scheduling class) of a thread   * @pid: the pid in question.   * @@ -3560,7 +3804,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)   */  SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)  { -	struct sched_param lp; +	struct sched_param lp = { .sched_priority = 0 };  	struct task_struct *p;  	int retval; @@ -3577,7 +3821,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)  	if (retval)  		goto out_unlock; -	lp.sched_priority = p->rt_priority; +	if (task_has_rt_policy(p)) +		lp.sched_priority = p->rt_priority;  	rcu_read_unlock();  	/* @@ -3592,19 +3837,103 @@ out_unlock:  	return retval;  } +static int sched_read_attr(struct sched_attr __user *uattr, +			   struct sched_attr *attr, +			   unsigned int usize) +{ +	int ret; + +	if (!access_ok(VERIFY_WRITE, uattr, usize)) +		return -EFAULT; + +	/* +	 * If we're handed a smaller struct than we know of, +	 * ensure all the unknown bits are 0 - i.e. old +	 * user-space does not get uncomplete information. +	 */ +	if (usize < sizeof(*attr)) { +		unsigned char *addr; +		unsigned char *end; + +		addr = (void *)attr + usize; +		end  = (void *)attr + sizeof(*attr); + +		for (; addr < end; addr++) { +			if (*addr) +				return -EFBIG; +		} + +		attr->size = usize; +	} + +	ret = copy_to_user(uattr, attr, attr->size); +	if (ret) +		return -EFAULT; + +	return 0; +} + +/** + * sys_sched_getattr - similar to sched_getparam, but with sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @size: sizeof(attr) for fwd/bwd comp. + * @flags: for future extension. 
+ */ +SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, +		unsigned int, size, unsigned int, flags) +{ +	struct sched_attr attr = { +		.size = sizeof(struct sched_attr), +	}; +	struct task_struct *p; +	int retval; + +	if (!uattr || pid < 0 || size > PAGE_SIZE || +	    size < SCHED_ATTR_SIZE_VER0 || flags) +		return -EINVAL; + +	rcu_read_lock(); +	p = find_process_by_pid(pid); +	retval = -ESRCH; +	if (!p) +		goto out_unlock; + +	retval = security_task_getscheduler(p); +	if (retval) +		goto out_unlock; + +	attr.sched_policy = p->policy; +	if (p->sched_reset_on_fork) +		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; +	if (task_has_dl_policy(p)) +		__getparam_dl(p, &attr); +	else if (task_has_rt_policy(p)) +		attr.sched_priority = p->rt_priority; +	else +		attr.sched_nice = task_nice(p); + +	rcu_read_unlock(); + +	retval = sched_read_attr(uattr, &attr, size); +	return retval; + +out_unlock: +	rcu_read_unlock(); +	return retval; +} +  long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)  {  	cpumask_var_t cpus_allowed, new_mask;  	struct task_struct *p;  	int retval; -	get_online_cpus();  	rcu_read_lock();  	p = find_process_by_pid(pid);  	if (!p) {  		rcu_read_unlock(); -		put_online_cpus();  		return -ESRCH;  	} @@ -3638,8 +3967,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)  	if (retval)  		goto out_unlock; +  	cpuset_cpus_allowed(p, cpus_allowed);  	cpumask_and(new_mask, in_mask, cpus_allowed); + +	/* +	 * Since bandwidth control happens on root_domain basis, +	 * if admission test is enabled, we only admit -deadline +	 * tasks allowed to run on all the CPUs in the task's +	 * root_domain. +	 */ +#ifdef CONFIG_SMP +	if (task_has_dl_policy(p)) { +		const struct cpumask *span = task_rq(p)->rd->span; + +		if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { +			retval = -EBUSY; +			goto out_unlock; +		} +	} +#endif  again:  	retval = set_cpus_allowed_ptr(p, new_mask); @@ -3661,7 +4008,6 @@ out_free_cpus_allowed:  	free_cpumask_var(cpus_allowed);  out_put_task:  	put_task_struct(p); -	put_online_cpus();  	return retval;  } @@ -3706,7 +4052,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)  	unsigned long flags;  	int retval; -	get_online_cpus();  	rcu_read_lock();  	retval = -ESRCH; @@ -3719,12 +4064,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)  		goto out_unlock;  	raw_spin_lock_irqsave(&p->pi_lock, flags); -	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); +	cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);  	raw_spin_unlock_irqrestore(&p->pi_lock, flags);  out_unlock:  	rcu_read_unlock(); -	put_online_cpus();  	return retval;  } @@ -3794,16 +4138,11 @@ SYSCALL_DEFINE0(sched_yield)  	return 0;  } -static inline int should_resched(void) -{ -	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); -} -  static void __cond_resched(void)  { -	add_preempt_count(PREEMPT_ACTIVE); +	__preempt_count_add(PREEMPT_ACTIVE);  	__schedule(); -	sub_preempt_count(PREEMPT_ACTIVE); +	__preempt_count_sub(PREEMPT_ACTIVE);  }  int __sched _cond_resched(void) @@ -3902,7 +4241,7 @@ EXPORT_SYMBOL(yield);   *	false (0) if we failed to boost the target.   *	-ESRCH if there's no task to yield to.   
*/ -bool __sched yield_to(struct task_struct *p, bool preempt) +int __sched yield_to(struct task_struct *p, bool preempt)  {  	struct task_struct *curr = current;  	struct rq *rq, *p_rq; @@ -3924,7 +4263,7 @@ again:  	}  	double_rq_lock(rq, p_rq); -	while (task_rq(p) != p_rq) { +	if (task_rq(p) != p_rq) {  		double_rq_unlock(rq, p_rq);  		goto again;  	} @@ -4013,6 +4352,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)  	case SCHED_RR:  		ret = MAX_USER_RT_PRIO-1;  		break; +	case SCHED_DEADLINE:  	case SCHED_NORMAL:  	case SCHED_BATCH:  	case SCHED_IDLE: @@ -4039,6 +4379,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)  	case SCHED_RR:  		ret = 1;  		break; +	case SCHED_DEADLINE:  	case SCHED_NORMAL:  	case SCHED_BATCH:  	case SCHED_IDLE: @@ -4082,7 +4423,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,  		goto out_unlock;  	rq = task_rq_lock(p, &flags); -	time_slice = p->sched_class->get_rr_interval(rq, p); +	time_slice = 0; +	if (p->sched_class->get_rr_interval) +		time_slice = p->sched_class->get_rr_interval(rq, p);  	task_rq_unlock(rq, p, &flags);  	rcu_read_unlock(); @@ -4186,7 +4529,7 @@ void init_idle(struct task_struct *idle, int cpu)  	raw_spin_lock_irqsave(&rq->lock, flags); -	__sched_fork(idle); +	__sched_fork(0, idle);  	idle->state = TASK_RUNNING;  	idle->se.exec_start = sched_clock(); @@ -4206,13 +4549,14 @@ void init_idle(struct task_struct *idle, int cpu)  	rcu_read_unlock();  	rq->curr = rq->idle = idle; +	idle->on_rq = 1;  #if defined(CONFIG_SMP)  	idle->on_cpu = 1;  #endif  	raw_spin_unlock_irqrestore(&rq->lock, flags);  	/* Set the preempt count _outside_ the spinlocks! */ -	task_thread_info(idle)->preempt_count = 0; +	init_idle_preempt_count(idle, cpu);  	/*  	 * The idle tasks have their own, simple scheduling class: @@ -4346,6 +4690,54 @@ fail:  	return ret;  } +#ifdef CONFIG_NUMA_BALANCING +/* Migrate current task p to target_cpu */ +int migrate_task_to(struct task_struct *p, int target_cpu) +{ +	struct migration_arg arg = { p, target_cpu }; +	int curr_cpu = task_cpu(p); + +	if (curr_cpu == target_cpu) +		return 0; + +	if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) +		return -EINVAL; + +	/* TODO: This is not properly updating schedstats */ + +	trace_sched_move_numa(p, curr_cpu, target_cpu); +	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); +} + +/* + * Requeue a task on a given node and accurately track the number of NUMA + * tasks on the runqueues + */ +void sched_setnuma(struct task_struct *p, int nid) +{ +	struct rq *rq; +	unsigned long flags; +	bool on_rq, running; + +	rq = task_rq_lock(p, &flags); +	on_rq = p->on_rq; +	running = task_current(rq, p); + +	if (on_rq) +		dequeue_task(rq, p, 0); +	if (running) +		p->sched_class->put_prev_task(rq, p); + +	p->numa_preferred_nid = nid; + +	if (running) +		p->sched_class->set_curr_task(rq); +	if (on_rq) +		enqueue_task(rq, p, 0); +	task_rq_unlock(rq, p, &flags); +} +#endif +  /*   * migration_cpu_stop - this will be executed by a highprio stopper thread   * and performs thread migration by bumping thread off CPU then @@ -4377,8 +4769,10 @@ void idle_task_exit(void)  	BUG_ON(cpu_online(smp_processor_id())); -	if (mm != &init_mm) +	if (mm != &init_mm) {  		switch_mm(mm, &init_mm, current); +		finish_arch_post_lock_switch(); +	}  	mmdrop(mm);  } @@ -4396,6 +4790,22 @@ static void calc_load_migrate(struct rq *rq)  		atomic_long_add(delta, &calc_load_tasks);  } +static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) +{ +} + +static const struct sched_class 
fake_sched_class = { +	.put_prev_task = put_prev_task_fake, +}; + +static struct task_struct fake_task = { +	/* +	 * Avoid pull_{rt,dl}_task() +	 */ +	.prio = MAX_PRIO + 1, +	.sched_class = &fake_sched_class, +}; +  /*   * Migrate all tasks from the rq, sleeping tasks will be migrated by   * try_to_wake_up()->select_task_rq(). @@ -4436,7 +4846,7 @@ static void migrate_tasks(unsigned int dead_cpu)  		if (rq->nr_running == 1)  			break; -		next = pick_next_task(rq); +		next = pick_next_task(rq, &fake_task);  		BUG_ON(!next);  		next->sched_class->put_prev_task(rq, next); @@ -4526,7 +4936,7 @@ set_table_entry(struct ctl_table *entry,  static struct ctl_table *  sd_alloc_ctl_domain_table(struct sched_domain *sd)  { -	struct ctl_table *table = sd_alloc_ctl_entry(13); +	struct ctl_table *table = sd_alloc_ctl_entry(14);  	if (table == NULL)  		return NULL; @@ -4554,9 +4964,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)  		sizeof(int), 0644, proc_dointvec_minmax, false);  	set_table_entry(&table[10], "flags", &sd->flags,  		sizeof(int), 0644, proc_dointvec_minmax, false); -	set_table_entry(&table[11], "name", sd->name, +	set_table_entry(&table[11], "max_newidle_lb_cost", +		&sd->max_newidle_lb_cost, +		sizeof(long), 0644, proc_doulongvec_minmax, false); +	set_table_entry(&table[12], "name", sd->name,  		CORENAME_MAX_SIZE, 0444, proc_dostring, false); -	/* &table[12] is terminator */ +	/* &table[13] is terminator */  	return table;  } @@ -4722,11 +5135,20 @@ static struct notifier_block migration_notifier = {  	.priority = CPU_PRI_MIGRATION,  }; +static void __cpuinit set_cpu_rq_start_time(void) +{ +	int cpu = smp_processor_id(); +	struct rq *rq = cpu_rq(cpu); +	rq->age_stamp = sched_clock_cpu(cpu); +} +  static int sched_cpu_active(struct notifier_block *nfb,  				      unsigned long action, void *hcpu)  {  	switch (action & ~CPU_TASKS_FROZEN) {  	case CPU_STARTING: +		set_cpu_rq_start_time(); +		return NOTIFY_OK;  	case CPU_DOWN_FAILED:  		set_cpu_active((long)hcpu, true);  		return NOTIFY_OK; @@ -4738,13 +5160,31 @@ static int sched_cpu_active(struct notifier_block *nfb,  static int sched_cpu_inactive(struct notifier_block *nfb,  					unsigned long action, void *hcpu)  { +	unsigned long flags; +	long cpu = (long)hcpu; +  	switch (action & ~CPU_TASKS_FROZEN) {  	case CPU_DOWN_PREPARE: -		set_cpu_active((long)hcpu, false); +		set_cpu_active(cpu, false); + +		/* explicitly allow suspend */ +		if (!(action & CPU_TASKS_FROZEN)) { +			struct dl_bw *dl_b = dl_bw_of(cpu); +			bool overflow; +			int cpus; + +			raw_spin_lock_irqsave(&dl_b->lock, flags); +			cpus = dl_bw_cpus(cpu); +			overflow = __dl_overflow(dl_b, cpus, 0, 0); +			raw_spin_unlock_irqrestore(&dl_b->lock, flags); + +			if (overflow) +				return notifier_from_errno(-EBUSY); +		}  		return NOTIFY_OK; -	default: -		return NOTIFY_DONE;  	} + +	return NOTIFY_DONE;  }  static int __init migration_init(void) @@ -4827,14 +5267,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,  		}  		/* -		 * Even though we initialize ->power to something semi-sane, -		 * we leave power_orig unset. This allows us to detect if +		 * Even though we initialize ->capacity to something semi-sane, +		 * we leave capacity_orig unset. This allows us to detect if  		 * domain iteration is still funny without causing /0 traps.  		 
*/ -		if (!group->sgp->power_orig) { +		if (!group->sgc->capacity_orig) {  			printk(KERN_CONT "\n"); -			printk(KERN_ERR "ERROR: domain->cpu_power not " -					"set\n"); +			printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");  			break;  		} @@ -4856,9 +5295,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,  		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));  		printk(KERN_CONT " %s", str); -		if (group->sgp->power != SCHED_POWER_SCALE) { -			printk(KERN_CONT " (cpu_power = %d)", -				group->sgp->power); +		if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { +			printk(KERN_CONT " (cpu_capacity = %d)", +				group->sgc->capacity);  		}  		group = group->next; @@ -4916,8 +5355,9 @@ static int sd_degenerate(struct sched_domain *sd)  			 SD_BALANCE_NEWIDLE |  			 SD_BALANCE_FORK |  			 SD_BALANCE_EXEC | -			 SD_SHARE_CPUPOWER | -			 SD_SHARE_PKG_RESOURCES)) { +			 SD_SHARE_CPUCAPACITY | +			 SD_SHARE_PKG_RESOURCES | +			 SD_SHARE_POWERDOMAIN)) {  		if (sd->groups != sd->groups->next)  			return 0;  	} @@ -4946,9 +5386,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)  				SD_BALANCE_NEWIDLE |  				SD_BALANCE_FORK |  				SD_BALANCE_EXEC | -				SD_SHARE_CPUPOWER | +				SD_SHARE_CPUCAPACITY |  				SD_SHARE_PKG_RESOURCES | -				SD_PREFER_SIBLING); +				SD_PREFER_SIBLING | +				SD_SHARE_POWERDOMAIN);  		if (nr_node_ids == 1)  			pflags &= ~SD_SERIALIZE;  	} @@ -4963,6 +5404,8 @@ static void free_rootdomain(struct rcu_head *rcu)  	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);  	cpupri_cleanup(&rd->cpupri); +	cpudl_cleanup(&rd->cpudl); +	free_cpumask_var(rd->dlo_mask);  	free_cpumask_var(rd->rto_mask);  	free_cpumask_var(rd->online);  	free_cpumask_var(rd->span); @@ -4985,7 +5428,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)  		cpumask_clear_cpu(rq->cpu, old_rd->span);  		/* -		 * If we dont want to free the old_rt yet then +		 * If we dont want to free the old_rd yet then  		 * set old_rd to NULL to skip the freeing later  		 * in this function:  		 */ @@ -5014,8 +5457,14 @@ static int init_rootdomain(struct root_domain *rd)  		goto out;  	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))  		goto free_span; -	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) +	if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))  		goto free_online; +	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) +		goto free_dlo_mask; + +	init_dl_bw(&rd->dl_bw); +	if (cpudl_init(&rd->cpudl) != 0) +		goto free_dlo_mask;  	if (cpupri_init(&rd->cpupri) != 0)  		goto free_rto_mask; @@ -5023,6 +5472,8 @@ static int init_rootdomain(struct root_domain *rd)  free_rto_mask:  	free_cpumask_var(rd->rto_mask); +free_dlo_mask: +	free_cpumask_var(rd->dlo_mask);  free_online:  	free_cpumask_var(rd->online);  free_span: @@ -5060,7 +5511,7 @@ static struct root_domain *alloc_rootdomain(void)  	return rd;  } -static void free_sched_groups(struct sched_group *sg, int free_sgp) +static void free_sched_groups(struct sched_group *sg, int free_sgc)  {  	struct sched_group *tmp, *first; @@ -5071,8 +5522,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp)  	do {  		tmp = sg->next; -		if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) -			kfree(sg->sgp); +		if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) +			kfree(sg->sgc);  		kfree(sg);  		sg = tmp; @@ -5090,7 +5541,7 @@ static void free_sched_domain(struct rcu_head *rcu)  	if (sd->flags & SD_OVERLAP) {  		free_sched_groups(sd->groups, 1);  	} else 
if (atomic_dec_and_test(&sd->groups->ref)) { -		kfree(sd->groups->sgp); +		kfree(sd->groups->sgc);  		kfree(sd->groups);  	}  	kfree(sd); @@ -5119,10 +5570,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)  DEFINE_PER_CPU(struct sched_domain *, sd_llc);  DEFINE_PER_CPU(int, sd_llc_size);  DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(struct sched_domain *, sd_numa); +DEFINE_PER_CPU(struct sched_domain *, sd_busy); +DEFINE_PER_CPU(struct sched_domain *, sd_asym);  static void update_top_cache_domain(int cpu)  {  	struct sched_domain *sd; +	struct sched_domain *busy_sd = NULL;  	int id = cpu;  	int size = 1; @@ -5130,11 +5585,19 @@ static void update_top_cache_domain(int cpu)  	if (sd) {  		id = cpumask_first(sched_domain_span(sd));  		size = cpumask_weight(sched_domain_span(sd)); +		busy_sd = sd->parent; /* sd_busy */  	} +	rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);  	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);  	per_cpu(sd_llc_size, cpu) = size;  	per_cpu(sd_llc_id, cpu) = id; + +	sd = lowest_flag_domain(cpu, SD_NUMA); +	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); + +	sd = highest_flag_domain(cpu, SD_ASYM_PACKING); +	rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);  }  /* @@ -5200,17 +5663,6 @@ static int __init isolated_cpu_setup(char *str)  __setup("isolcpus=", isolated_cpu_setup); -static const struct cpumask *cpu_cpu_mask(int cpu) -{ -	return cpumask_of_node(cpu_to_node(cpu)); -} - -struct sd_data { -	struct sched_domain **__percpu sd; -	struct sched_group **__percpu sg; -	struct sched_group_power **__percpu sgp; -}; -  struct s_data {  	struct sched_domain ** __percpu sd;  	struct root_domain	*rd; @@ -5223,21 +5675,6 @@ enum s_alloc {  	sa_none,  }; -struct sched_domain_topology_level; - -typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); - -#define SDTL_OVERLAP	0x01 - -struct sched_domain_topology_level { -	sched_domain_init_f init; -	sched_domain_mask_f mask; -	int		    flags; -	int		    numa_level; -	struct sd_data      data; -}; -  /*   * Build an iteration mask that can exclude certain CPUs from the upwards   * domain traversal. @@ -5315,16 +5752,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)  		cpumask_or(covered, covered, sg_span); -		sg->sgp = *per_cpu_ptr(sdd->sgp, i); -		if (atomic_inc_return(&sg->sgp->ref) == 1) +		sg->sgc = *per_cpu_ptr(sdd->sgc, i); +		if (atomic_inc_return(&sg->sgc->ref) == 1)  			build_group_mask(sd, sg);  		/* -		 * Initialize sgp->power such that even if we mess up the +		 * Initialize sgc->capacity such that even if we mess up the  		 * domains and no possible iteration will get us here, we won't  		 * die on a /0 trap.  		 
*/ -		sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); +		sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); +		sg->sgc->capacity_orig = sg->sgc->capacity;  		/*  		 * Make sure the first group of this domain contains the @@ -5362,8 +5800,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)  	if (sg) {  		*sg = *per_cpu_ptr(sdd->sg, cpu); -		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); -		atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ +		(*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); +		atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */  	}  	return cpu; @@ -5372,7 +5810,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)  /*   * build_sched_groups will build a circular linked list of the groups   * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. + * and ->cpu_capacity to 0.   *   * Assumes the sched_domain tree is fully constructed   */ @@ -5404,8 +5842,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)  			continue;  		group = get_group(i, sdd, &sg); -		cpumask_clear(sched_group_cpus(sg)); -		sg->sgp->power = 0;  		cpumask_setall(sched_group_mask(sg));  		for_each_cpu(j, span) { @@ -5428,16 +5864,16 @@ build_sched_groups(struct sched_domain *sd, int cpu)  }  /* - * Initialize sched groups cpu_power. + * Initialize sched groups cpu_capacity.   * - * cpu_power indicates the capacity of sched group, which is used while + * cpu_capacity indicates the capacity of sched group, which is used while   * distributing the load between different sched groups in a sched domain. - * Typically cpu_power for all the groups in a sched domain will be same unless - * there are asymmetries in the topology. If there are asymmetries, group - * having more cpu_power will pickup more load compared to the group having - * less cpu_power. + * Typically cpu_capacity for all the groups in a sched domain will be same + * unless there are asymmetries in the topology. If there are asymmetries, + * group having more cpu_capacity will pickup more load compared to the + * group having less cpu_capacity.   
*/ -static void init_sched_groups_power(int cpu, struct sched_domain *sd) +static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)  {  	struct sched_group *sg = sd->groups; @@ -5451,13 +5887,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)  	if (cpu != group_balance_cpu(sg))  		return; -	update_group_power(sd, cpu); -	atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); -} - -int __weak arch_sd_sibling_asym_packing(void) -{ -       return 0*SD_ASYM_PACKING; +	update_group_capacity(sd, cpu); +	atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);  }  /* @@ -5465,34 +5896,6 @@ int __weak arch_sd_sibling_asym_packing(void)   * Non-inlined to reduce accumulated stack pressure in build_sched_domains()   */ -#ifdef CONFIG_SCHED_DEBUG -# define SD_INIT_NAME(sd, type)		sd->name = #type -#else -# define SD_INIT_NAME(sd, type)		do { } while (0) -#endif - -#define SD_INIT_FUNC(type)						\ -static noinline struct sched_domain *					\ -sd_init_##type(struct sched_domain_topology_level *tl, int cpu) 	\ -{									\ -	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);	\ -	*sd = SD_##type##_INIT;						\ -	SD_INIT_NAME(sd, type);						\ -	sd->private = &tl->data;					\ -	return sd;							\ -} - -SD_INIT_FUNC(CPU) -#ifdef CONFIG_SCHED_SMT - SD_INIT_FUNC(SIBLING) -#endif -#ifdef CONFIG_SCHED_MC - SD_INIT_FUNC(MC) -#endif -#ifdef CONFIG_SCHED_BOOK - SD_INIT_FUNC(BOOK) -#endif -  static int default_relax_domain_level = -1;  int sched_domain_level_max; @@ -5576,100 +5979,158 @@ static void claim_allocations(int cpu, struct sched_domain *sd)  	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))  		*per_cpu_ptr(sdd->sg, cpu) = NULL; -	if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) -		*per_cpu_ptr(sdd->sgp, cpu) = NULL; -} - -#ifdef CONFIG_SCHED_SMT -static const struct cpumask *cpu_smt_mask(int cpu) -{ -	return topology_thread_cpumask(cpu); +	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) +		*per_cpu_ptr(sdd->sgc, cpu) = NULL;  } -#endif - -/* - * Topology list, bottom-up. - */ -static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_SCHED_SMT -	{ sd_init_SIBLING, cpu_smt_mask, }, -#endif -#ifdef CONFIG_SCHED_MC -	{ sd_init_MC, cpu_coregroup_mask, }, -#endif -#ifdef CONFIG_SCHED_BOOK -	{ sd_init_BOOK, cpu_book_mask, }, -#endif -	{ sd_init_CPU, cpu_cpu_mask, }, -	{ NULL, }, -}; - -static struct sched_domain_topology_level *sched_domain_topology = default_topology; - -#define for_each_sd_topology(tl)			\ -	for (tl = sched_domain_topology; tl->init; tl++)  #ifdef CONFIG_NUMA -  static int sched_domains_numa_levels;  static int *sched_domains_numa_distance;  static struct cpumask ***sched_domains_numa_masks;  static int sched_domains_curr_level; +#endif -static inline int sd_local_flags(int level) -{ -	if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) -		return 0; - -	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; -} +/* + * SD_flags allowed in topology descriptions. 
+ * + * SD_SHARE_CPUCAPACITY      - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA                - describes NUMA topologies + * SD_SHARE_POWERDOMAIN   - describes shared power domain + * + * Odd one out: + * SD_ASYM_PACKING        - describes SMT quirks + */ +#define TOPOLOGY_SD_FLAGS		\ +	(SD_SHARE_CPUCAPACITY |		\ +	 SD_SHARE_PKG_RESOURCES |	\ +	 SD_NUMA |			\ +	 SD_ASYM_PACKING |		\ +	 SD_SHARE_POWERDOMAIN)  static struct sched_domain * -sd_numa_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, int cpu)  {  	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); -	int level = tl->numa_level; -	int sd_weight = cpumask_weight( -			sched_domains_numa_masks[level][cpu_to_node(cpu)]); +	int sd_weight, sd_flags = 0; + +#ifdef CONFIG_NUMA +	/* +	 * Ugly hack to pass state to sd_numa_mask()... +	 */ +	sched_domains_curr_level = tl->numa_level; +#endif + +	sd_weight = cpumask_weight(tl->mask(cpu)); + +	if (tl->sd_flags) +		sd_flags = (*tl->sd_flags)(); +	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, +			"wrong sd_flags in topology description\n")) +		sd_flags &= ~TOPOLOGY_SD_FLAGS;  	*sd = (struct sched_domain){  		.min_interval		= sd_weight,  		.max_interval		= 2*sd_weight,  		.busy_factor		= 32,  		.imbalance_pct		= 125, -		.cache_nice_tries	= 2, -		.busy_idx		= 3, -		.idle_idx		= 2, + +		.cache_nice_tries	= 0, +		.busy_idx		= 0, +		.idle_idx		= 0,  		.newidle_idx		= 0,  		.wake_idx		= 0,  		.forkexec_idx		= 0,  		.flags			= 1*SD_LOAD_BALANCE  					| 1*SD_BALANCE_NEWIDLE -					| 0*SD_BALANCE_EXEC -					| 0*SD_BALANCE_FORK +					| 1*SD_BALANCE_EXEC +					| 1*SD_BALANCE_FORK  					| 0*SD_BALANCE_WAKE -					| 0*SD_WAKE_AFFINE -					| 0*SD_SHARE_CPUPOWER +					| 1*SD_WAKE_AFFINE +					| 0*SD_SHARE_CPUCAPACITY  					| 0*SD_SHARE_PKG_RESOURCES -					| 1*SD_SERIALIZE +					| 0*SD_SERIALIZE  					| 0*SD_PREFER_SIBLING -					| sd_local_flags(level) +					| 0*SD_NUMA +					| sd_flags  					, +  		.last_balance		= jiffies,  		.balance_interval	= sd_weight, +		.smt_gain		= 0, +		.max_newidle_lb_cost	= 0, +		.next_decay_max_lb_cost	= jiffies, +#ifdef CONFIG_SCHED_DEBUG +		.name			= tl->name, +#endif  	}; -	SD_INIT_NAME(sd, NUMA); -	sd->private = &tl->data;  	/* -	 * Ugly hack to pass state to sd_numa_mask()... +	 * Convert topological properties into behaviour.  	 */ -	sched_domains_curr_level = tl->numa_level; + +	if (sd->flags & SD_SHARE_CPUCAPACITY) { +		sd->imbalance_pct = 110; +		sd->smt_gain = 1178; /* ~15% */ + +	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) { +		sd->imbalance_pct = 117; +		sd->cache_nice_tries = 1; +		sd->busy_idx = 2; + +#ifdef CONFIG_NUMA +	} else if (sd->flags & SD_NUMA) { +		sd->cache_nice_tries = 2; +		sd->busy_idx = 3; +		sd->idle_idx = 2; + +		sd->flags |= SD_SERIALIZE; +		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { +			sd->flags &= ~(SD_BALANCE_EXEC | +				       SD_BALANCE_FORK | +				       SD_WAKE_AFFINE); +		} + +#endif +	} else { +		sd->flags |= SD_PREFER_SIBLING; +		sd->cache_nice_tries = 1; +		sd->busy_idx = 2; +		sd->idle_idx = 1; +	} + +	sd->private = &tl->data;  	return sd;  } +/* + * Topology list, bottom-up. 
+ */ +static struct sched_domain_topology_level default_topology[] = { +#ifdef CONFIG_SCHED_SMT +	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC +	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif +	{ cpu_cpu_mask, SD_INIT_NAME(DIE) }, +	{ NULL, }, +}; + +struct sched_domain_topology_level *sched_domain_topology = default_topology; + +#define for_each_sd_topology(tl)			\ +	for (tl = sched_domain_topology; tl->mask; tl++) + +void set_sched_topology(struct sched_domain_topology_level *tl) +{ +	sched_domain_topology = tl; +} + +#ifdef CONFIG_NUMA +  static const struct cpumask *sd_numa_mask(int cpu)  {  	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; @@ -5813,7 +6274,10 @@ static void sched_init_numa(void)  		}  	} -	tl = kzalloc((ARRAY_SIZE(default_topology) + level) * +	/* Compute default topology size */ +	for (i = 0; sched_domain_topology[i].mask; i++); + +	tl = kzalloc((i + level + 1) *  			sizeof(struct sched_domain_topology_level), GFP_KERNEL);  	if (!tl)  		return; @@ -5821,18 +6285,19 @@ static void sched_init_numa(void)  	/*  	 * Copy the default topology bits..  	 */ -	for (i = 0; default_topology[i].init; i++) -		tl[i] = default_topology[i]; +	for (i = 0; sched_domain_topology[i].mask; i++) +		tl[i] = sched_domain_topology[i];  	/*  	 * .. and append 'j' levels of NUMA goodness.  	 */  	for (j = 0; j < level; i++, j++) {  		tl[i] = (struct sched_domain_topology_level){ -			.init = sd_numa_init,  			.mask = sd_numa_mask, +			.sd_flags = cpu_numa_flags,  			.flags = SDTL_OVERLAP,  			.numa_level = j, +			SD_INIT_NAME(NUMA)  		};  	} @@ -5917,14 +6382,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)  		if (!sdd->sg)  			return -ENOMEM; -		sdd->sgp = alloc_percpu(struct sched_group_power *); -		if (!sdd->sgp) +		sdd->sgc = alloc_percpu(struct sched_group_capacity *); +		if (!sdd->sgc)  			return -ENOMEM;  		for_each_cpu(j, cpu_map) {  			struct sched_domain *sd;  			struct sched_group *sg; -			struct sched_group_power *sgp; +			struct sched_group_capacity *sgc;  		       	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),  					GFP_KERNEL, cpu_to_node(j)); @@ -5942,12 +6407,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map)  			*per_cpu_ptr(sdd->sg, j) = sg; -			sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), +			sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),  					GFP_KERNEL, cpu_to_node(j)); -			if (!sgp) +			if (!sgc)  				return -ENOMEM; -			*per_cpu_ptr(sdd->sgp, j) = sgp; +			*per_cpu_ptr(sdd->sgc, j) = sgc;  		}  	} @@ -5974,15 +6439,15 @@ static void __sdt_free(const struct cpumask *cpu_map)  			if (sdd->sg)  				kfree(*per_cpu_ptr(sdd->sg, j)); -			if (sdd->sgp) -				kfree(*per_cpu_ptr(sdd->sgp, j)); +			if (sdd->sgc) +				kfree(*per_cpu_ptr(sdd->sgc, j));  		}  		free_percpu(sdd->sd);  		sdd->sd = NULL;  		free_percpu(sdd->sg);  		sdd->sg = NULL; -		free_percpu(sdd->sgp); -		sdd->sgp = NULL; +		free_percpu(sdd->sgc); +		sdd->sgc = NULL;  	}  } @@ -5990,7 +6455,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,  		const struct cpumask *cpu_map, struct sched_domain_attr *attr,  		struct sched_domain *child, int cpu)  { -	struct sched_domain *sd = tl->init(tl, cpu); +	struct sched_domain *sd = sd_init(tl, cpu);  	if (!sd)  		return child; @@ -6052,14 +6517,14 @@ static int build_sched_domains(const struct cpumask *cpu_map,  		}  	} -	/* Calculate CPU power for physical packages and 
nodes */ +	/* Calculate CPU capacity for physical packages and nodes */  	for (i = nr_cpumask_bits-1; i >= 0; i--) {  		if (!cpumask_test_cpu(i, cpu_map))  			continue;  		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {  			claim_allocations(i, sd); -			init_sched_groups_power(i, sd); +			init_sched_groups_capacity(i, sd);  		}  	} @@ -6094,7 +6559,7 @@ static cpumask_var_t fallback_doms;   * cpu core maps. It is supposed to return 1 if the topology changed   * or 0 if it stayed the same.   */ -int __attribute__((weak)) arch_update_cpu_topology(void) +int __weak arch_update_cpu_topology(void)  {  	return 0;  } @@ -6335,14 +6800,17 @@ void __init sched_init_smp(void)  	sched_init_numa(); -	get_online_cpus(); +	/* +	 * There's no userspace yet to cause hotplug operations; hence all the +	 * cpu masks are stable and all blatant races in the below code cannot +	 * happen. +	 */  	mutex_lock(&sched_domains_mutex);  	init_sched_domains(cpu_active_mask);  	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);  	if (cpumask_empty(non_isolated_cpus))  		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);  	mutex_unlock(&sched_domains_mutex); -	put_online_cpus();  	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);  	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); @@ -6357,6 +6825,7 @@ void __init sched_init_smp(void)  	free_cpumask_var(non_isolated_cpus);  	init_sched_rt_class(); +	init_sched_dl_class();  }  #else  void __init sched_init_smp(void) @@ -6426,13 +6895,15 @@ void __init sched_init(void)  #endif /* CONFIG_CPUMASK_OFFSTACK */  	} +	init_rt_bandwidth(&def_rt_bandwidth, +			global_rt_period(), global_rt_runtime()); +	init_dl_bandwidth(&def_dl_bandwidth, +			global_rt_period(), global_rt_runtime()); +  #ifdef CONFIG_SMP  	init_defrootdomain();  #endif -	init_rt_bandwidth(&def_rt_bandwidth, -			global_rt_period(), global_rt_runtime()); -  #ifdef CONFIG_RT_GROUP_SCHED  	init_rt_bandwidth(&root_task_group.rt_bandwidth,  			global_rt_period(), global_rt_runtime()); @@ -6456,6 +6927,7 @@ void __init sched_init(void)  		rq->calc_load_update = jiffies + LOAD_FREQ;  		init_cfs_rq(&rq->cfs);  		init_rt_rq(&rq->rt, rq); +		init_dl_rq(&rq->dl, rq);  #ifdef CONFIG_FAIR_GROUP_SCHED  		root_task_group.shares = ROOT_TASK_GROUP_LOAD;  		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); @@ -6484,7 +6956,6 @@ void __init sched_init(void)  		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;  #ifdef CONFIG_RT_GROUP_SCHED -		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);  		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);  #endif @@ -6496,7 +6967,7 @@ void __init sched_init(void)  #ifdef CONFIG_SMP  		rq->sd = NULL;  		rq->rd = NULL; -		rq->cpu_power = SCHED_POWER_SCALE; +		rq->cpu_capacity = SCHED_CAPACITY_SCALE;  		rq->post_schedule = 0;  		rq->active_balance = 0;  		rq->next_balance = jiffies; @@ -6505,6 +6976,7 @@ void __init sched_init(void)  		rq->online = 0;  		rq->idle_stamp = 0;  		rq->avg_idle = 2*sysctl_sched_migration_cost; +		rq->max_idle_balance_cost = sysctl_sched_migration_cost;  		INIT_LIST_HEAD(&rq->cfs_tasks); @@ -6526,10 +6998,6 @@ void __init sched_init(void)  	INIT_HLIST_HEAD(&init_task.preempt_notifiers);  #endif -#ifdef CONFIG_RT_MUTEXES -	plist_head_init(&init_task.pi_waiters); -#endif -  	/*  	 * The boot idle thread does lazy MMU switching as well:  	 */ @@ -6557,6 +7025,7 @@ void __init sched_init(void)  	if (cpu_isolated_map == NULL)  		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);  	idle_thread_set_boot_cpu(); +	
set_cpu_rq_start_time();  #endif  	init_sched_fair_class(); @@ -6576,7 +7045,8 @@ void __might_sleep(const char *file, int line, int preempt_offset)  	static unsigned long prev_jiffy;	/* ratelimiting */  	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ -	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || +	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && +	     !is_idle_task(current)) ||  	    system_state != SYSTEM_RUNNING || oops_in_progress)  		return;  	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) @@ -6594,6 +7064,13 @@ void __might_sleep(const char *file, int line, int preempt_offset)  	debug_show_held_locks(current);  	if (irqs_disabled())  		print_irqtrace_events(current); +#ifdef CONFIG_DEBUG_PREEMPT +	if (!preempt_count_equals(preempt_offset)) { +		pr_err("Preemption disabled at:"); +		print_ip_sym(current->preempt_disable_ip); +		pr_cont("\n"); +	} +#endif  	dump_stack();  }  EXPORT_SYMBOL(__might_sleep); @@ -6603,13 +7080,16 @@ EXPORT_SYMBOL(__might_sleep);  static void normalize_task(struct rq *rq, struct task_struct *p)  {  	const struct sched_class *prev_class = p->sched_class; +	struct sched_attr attr = { +		.sched_policy = SCHED_NORMAL, +	};  	int old_prio = p->prio;  	int on_rq;  	on_rq = p->on_rq;  	if (on_rq)  		dequeue_task(rq, p, 0); -	__setscheduler(rq, p, SCHED_NORMAL, 0); +	__setscheduler(rq, p, &attr);  	if (on_rq) {  		enqueue_task(rq, p, 0);  		resched_task(rq->curr); @@ -6639,12 +7119,12 @@ void normalize_rt_tasks(void)  		p->se.statistics.block_start	= 0;  #endif -		if (!rt_task(p)) { +		if (!dl_task(p) && !rt_task(p)) {  			/*  			 * Renice negative nice level userspace  			 * tasks back to 0:  			 */ -			if (TASK_NICE(p) < 0 && p->mm) +			if (task_nice(p) < 0 && p->mm)  				set_user_nice(p, 0);  			continue;  		} @@ -6812,7 +7292,7 @@ void sched_move_task(struct task_struct *tsk)  	if (unlikely(running))  		tsk->sched_class->put_prev_task(rq, tsk); -	tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, +	tg = container_of(task_css_check(tsk, cpu_cgrp_id,  				lockdep_is_held(&tsk->sighand->siglock)),  			  struct task_group, css);  	tg = autogroup_task_group(tsk, tg); @@ -6834,16 +7314,6 @@ void sched_move_task(struct task_struct *tsk)  }  #endif /* CONFIG_CGROUP_SCHED */ -#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) -static unsigned long to_ratio(u64 period, u64 runtime) -{ -	if (runtime == RUNTIME_INF) -		return 1ULL << 20; - -	return div64_u64(runtime << 20, period); -} -#endif -  #ifdef CONFIG_RT_GROUP_SCHED  /*   * Ensure that the real time constraints are schedulable. @@ -7017,24 +7487,13 @@ static long sched_group_rt_period(struct task_group *tg)  	do_div(rt_period_us, NSEC_PER_USEC);  	return rt_period_us;  } +#endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_RT_GROUP_SCHED  static int sched_rt_global_constraints(void)  { -	u64 runtime, period;  	int ret = 0; -	if (sysctl_sched_rt_period <= 0) -		return -EINVAL; - -	runtime = global_rt_runtime(); -	period = global_rt_period(); - -	/* -	 * Sanity check on the sysctl variables. 
-	 */ -	if (runtime > period && runtime != RUNTIME_INF) -		return -EINVAL; -  	mutex_lock(&rt_constraints_mutex);  	read_lock(&tasklist_lock);  	ret = __rt_schedulable(NULL, 0, 0); @@ -7057,17 +7516,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)  static int sched_rt_global_constraints(void)  {  	unsigned long flags; -	int i; - -	if (sysctl_sched_rt_period <= 0) -		return -EINVAL; - -	/* -	 * There's always some RT tasks in the root group -	 * -- migration, kstopmachine etc.. -	 */ -	if (sysctl_sched_rt_runtime == 0) -		return -EBUSY; +	int i, ret = 0;  	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);  	for_each_possible_cpu(i) { @@ -7079,36 +7528,91 @@ static int sched_rt_global_constraints(void)  	}  	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); -	return 0; +	return ret;  }  #endif /* CONFIG_RT_GROUP_SCHED */ -int sched_rr_handler(struct ctl_table *table, int write, -		void __user *buffer, size_t *lenp, -		loff_t *ppos) +static int sched_dl_global_constraints(void)  { -	int ret; -	static DEFINE_MUTEX(mutex); +	u64 runtime = global_rt_runtime(); +	u64 period = global_rt_period(); +	u64 new_bw = to_ratio(period, runtime); +	int cpu, ret = 0; +	unsigned long flags; -	mutex_lock(&mutex); -	ret = proc_dointvec(table, write, buffer, lenp, ppos); -	/* make sure that internally we keep jiffies */ -	/* also, writing zero resets timeslice to default */ -	if (!ret && write) { -		sched_rr_timeslice = sched_rr_timeslice <= 0 ? -			RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); +	/* +	 * Here we want to check the bandwidth not being set to some +	 * value smaller than the currently allocated bandwidth in +	 * any of the root_domains. +	 * +	 * FIXME: Cycling on all the CPUs is overdoing, but simpler than +	 * cycling on root_domains... Discussion on different/better +	 * solutions is welcome! +	 */ +	for_each_possible_cpu(cpu) { +		struct dl_bw *dl_b = dl_bw_of(cpu); + +		raw_spin_lock_irqsave(&dl_b->lock, flags); +		if (new_bw < dl_b->total_bw) +			ret = -EBUSY; +		raw_spin_unlock_irqrestore(&dl_b->lock, flags); + +		if (ret) +			break;  	} -	mutex_unlock(&mutex); +  	return ret;  } +static void sched_dl_do_global(void) +{ +	u64 new_bw = -1; +	int cpu; +	unsigned long flags; + +	def_dl_bandwidth.dl_period = global_rt_period(); +	def_dl_bandwidth.dl_runtime = global_rt_runtime(); + +	if (global_rt_runtime() != RUNTIME_INF) +		new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + +	/* +	 * FIXME: As above... 
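sched_dl_global_constraints() above refuses a new global limit whose bandwidth is smaller than what has already been admitted in any root domain's dl_bw. Both sides of that comparison are 20-bit fixed-point utilizations produced by to_ratio() (runtime << 20 / period, with RUNTIME_INF mapping to a full 1.0). A stand-alone sketch of the arithmetic, with invented numbers, a single pretend root domain and no locking (not kernel code):

/* dl_bw_demo.c -- illustrative sketch of the fixed-point bandwidth check.
 * Build: gcc -o dl_bw_demo dl_bw_demo.c
 */
#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT	20			/* matches the kernel's <<20 scaling */
#define RUNTIME_INF	(~0ULL)

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	if (runtime == RUNTIME_INF)
		return 1ULL << BW_SHIFT;	/* utilization of 1.0 */
	return (runtime << BW_SHIFT) / period;
}

int main(void)
{
	/* Pretend three -deadline tasks were already admitted (time in us). */
	uint64_t total_bw = to_ratio(100000, 10000) +	/* 10ms / 100ms = 0.10 */
			    to_ratio( 50000, 10000) +	/* 10ms /  50ms = 0.20 */
			    to_ratio(200000, 30000);	/* 30ms / 200ms = 0.15 */

	/* Proposed new global limit: 400ms runtime every 1s -> 0.40 */
	uint64_t new_bw = to_ratio(1000000, 400000);

	printf("total_bw = %llu/%d, new_bw = %llu/%d\n",
	       (unsigned long long)total_bw, 1 << BW_SHIFT,
	       (unsigned long long)new_bw, 1 << BW_SHIFT);

	if (new_bw < total_bw)
		printf("-EBUSY: new limit below already-admitted bandwidth\n");
	else
		printf("new limit accepted\n");
	return 0;
}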
+	 */ +	for_each_possible_cpu(cpu) { +		struct dl_bw *dl_b = dl_bw_of(cpu); + +		raw_spin_lock_irqsave(&dl_b->lock, flags); +		dl_b->bw = new_bw; +		raw_spin_unlock_irqrestore(&dl_b->lock, flags); +	} +} + +static int sched_rt_global_validate(void) +{ +	if (sysctl_sched_rt_period <= 0) +		return -EINVAL; + +	if ((sysctl_sched_rt_runtime != RUNTIME_INF) && +		(sysctl_sched_rt_runtime > sysctl_sched_rt_period)) +		return -EINVAL; + +	return 0; +} + +static void sched_rt_do_global(void) +{ +	def_rt_bandwidth.rt_runtime = global_rt_runtime(); +	def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); +} +  int sched_rt_handler(struct ctl_table *table, int write,  		void __user *buffer, size_t *lenp,  		loff_t *ppos)  { -	int ret;  	int old_period, old_runtime;  	static DEFINE_MUTEX(mutex); +	int ret;  	mutex_lock(&mutex);  	old_period = sysctl_sched_rt_period; @@ -7117,21 +7621,50 @@ int sched_rt_handler(struct ctl_table *table, int write,  	ret = proc_dointvec(table, write, buffer, lenp, ppos);  	if (!ret && write) { +		ret = sched_rt_global_validate(); +		if (ret) +			goto undo; +  		ret = sched_rt_global_constraints(); -		if (ret) { -			sysctl_sched_rt_period = old_period; -			sysctl_sched_rt_runtime = old_runtime; -		} else { -			def_rt_bandwidth.rt_runtime = global_rt_runtime(); -			def_rt_bandwidth.rt_period = -				ns_to_ktime(global_rt_period()); -		} +		if (ret) +			goto undo; + +		ret = sched_dl_global_constraints(); +		if (ret) +			goto undo; + +		sched_rt_do_global(); +		sched_dl_do_global(); +	} +	if (0) { +undo: +		sysctl_sched_rt_period = old_period; +		sysctl_sched_rt_runtime = old_runtime;  	}  	mutex_unlock(&mutex);  	return ret;  } +int sched_rr_handler(struct ctl_table *table, int write, +		void __user *buffer, size_t *lenp, +		loff_t *ppos) +{ +	int ret; +	static DEFINE_MUTEX(mutex); + +	mutex_lock(&mutex); +	ret = proc_dointvec(table, write, buffer, lenp, ppos); +	/* make sure that internally we keep jiffies */ +	/* also, writing zero resets timeslice to default */ +	if (!ret && write) { +		sched_rr_timeslice = sched_rr_timeslice <= 0 ? 
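sched_rt_handler() above follows a write-first pattern: proc_dointvec() stores the new period/runtime, then sched_rt_global_validate(), sched_rt_global_constraints() and sched_dl_global_constraints() are run, and on any failure the old values are restored through the "if (0) { undo: ... }" exit path. A reduced sketch of that control flow, with made-up knob names and a trivial validity check (not the kernel code):

/* sysctl_undo_demo.c -- control-flow sketch only. */
#include <stdio.h>

static int sysctl_period = 1000000;	/* pretend sysctls */
static int sysctl_runtime = 950000;

static int validate(void)
{
	if (sysctl_period <= 0)
		return -1;
	if (sysctl_runtime > sysctl_period)
		return -1;
	return 0;
}

static int handler(int new_period, int new_runtime)
{
	int old_period = sysctl_period;
	int old_runtime = sysctl_runtime;
	int ret;

	/* proc_dointvec() equivalent: values are written before validation */
	sysctl_period = new_period;
	sysctl_runtime = new_runtime;

	ret = validate();
	if (ret)
		goto undo;

	printf("applied period=%d runtime=%d\n", sysctl_period, sysctl_runtime);
	return 0;

undo:
	sysctl_period = old_period;
	sysctl_runtime = old_runtime;
	printf("rejected, restored period=%d runtime=%d\n",
	       sysctl_period, sysctl_runtime);
	return ret;
}

int main(void)
{
	handler(1000000, 900000);	/* accepted */
	handler(1000000, 2000000);	/* runtime > period: rolled back */
	return 0;
}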
+			RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); +	} +	mutex_unlock(&mutex); +	return ret; +} +  #ifdef CONFIG_CGROUP_SCHED  static inline struct task_group *css_tg(struct cgroup_subsys_state *css) @@ -7160,7 +7693,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)  static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)  {  	struct task_group *tg = css_tg(css); -	struct task_group *parent = css_tg(css_parent(css)); +	struct task_group *parent = css_tg(css->parent);  	if (parent)  		sched_online_group(tg, parent); @@ -7186,7 +7719,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,  {  	struct task_struct *task; -	cgroup_taskset_for_each(task, css, tset) { +	cgroup_taskset_for_each(task, tset) {  #ifdef CONFIG_RT_GROUP_SCHED  		if (!sched_rt_can_attach(css_tg(css), task))  			return -EINVAL; @@ -7204,7 +7737,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,  {  	struct task_struct *task; -	cgroup_taskset_for_each(task, css, tset) +	cgroup_taskset_for_each(task, tset)  		sched_move_task(task);  } @@ -7277,7 +7810,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)  	runtime_enabled = quota != RUNTIME_INF;  	runtime_was_enabled = cfs_b->quota != RUNTIME_INF; -	account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); +	/* +	 * If we need to toggle cfs_bandwidth_used, off->on must occur +	 * before making related changes, and on->off must occur afterwards +	 */ +	if (runtime_enabled && !runtime_was_enabled) +		cfs_bandwidth_usage_inc();  	raw_spin_lock_irq(&cfs_b->lock);  	cfs_b->period = ns_to_ktime(period);  	cfs_b->quota = quota; @@ -7286,8 +7824,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)  	/* restart the period timer (if active) to handle new period expiry */  	if (runtime_enabled && cfs_b->timer_active) {  		/* force a reprogram */ -		cfs_b->timer_active = 0; -		__start_cfs_bandwidth(cfs_b); +		__start_cfs_bandwidth(cfs_b, true);  	}  	raw_spin_unlock_irq(&cfs_b->lock); @@ -7303,6 +7840,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)  			unthrottle_cfs_rq(cfs_rq);  		raw_spin_unlock_irq(&rq->lock);  	} +	if (runtime_was_enabled && !runtime_enabled) +		cfs_bandwidth_usage_dec();  out_unlock:  	mutex_unlock(&cfs_constraints_mutex); @@ -7457,15 +7996,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)  	return ret;  } -static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, -		struct cgroup_map_cb *cb) +static int cpu_stats_show(struct seq_file *sf, void *v)  { -	struct task_group *tg = css_tg(css); +	struct task_group *tg = css_tg(seq_css(sf));  	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; -	cb->fill(cb, "nr_periods", cfs_b->nr_periods); -	cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); -	cb->fill(cb, "throttled_time", cfs_b->throttled_time); +	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); +	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); +	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);  	return 0;  } @@ -7519,7 +8057,7 @@ static struct cftype cpu_files[] = {  	},  	{  		.name = "stat", -		.read_map = cpu_stats_show, +		.seq_show = cpu_stats_show,  	},  #endif  #ifdef CONFIG_RT_GROUP_SCHED @@ -7537,8 +8075,7 @@ static struct cftype cpu_files[] = {  	{ }	/* terminate */  }; -struct cgroup_subsys cpu_cgroup_subsys = { -	.name		= "cpu", +struct cgroup_subsys cpu_cgrp_subsys = {  	.css_alloc	= cpu_cgroup_css_alloc,  	
.css_free	= cpu_cgroup_css_free,  	.css_online	= cpu_cgroup_css_online, @@ -7546,7 +8083,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {  	.can_attach	= cpu_cgroup_can_attach,  	.attach		= cpu_cgroup_attach,  	.exit		= cpu_cgroup_exit, -	.subsys_id	= cpu_cgroup_subsys_id,  	.base_cftypes	= cpu_files,  	.early_init	= 1,  }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index f64722ff029..9cf350c94ec 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -41,12 +41,12 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)  /* return cpu accounting group to which this task belongs */  static inline struct cpuacct *task_ca(struct task_struct *tsk)  { -	return css_ca(task_css(tsk, cpuacct_subsys_id)); +	return css_ca(task_css(tsk, cpuacct_cgrp_id));  }  static inline struct cpuacct *parent_ca(struct cpuacct *ca)  { -	return css_ca(css_parent(&ca->css)); +	return css_ca(ca->css.parent);  }  static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); @@ -163,10 +163,9 @@ out:  	return err;  } -static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, -				   struct cftype *cft, struct seq_file *m) +static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)  { -	struct cpuacct *ca = css_ca(css); +	struct cpuacct *ca = css_ca(seq_css(m));  	u64 percpu;  	int i; @@ -183,10 +182,9 @@ static const char * const cpuacct_stat_desc[] = {  	[CPUACCT_STAT_SYSTEM] = "system",  }; -static int cpuacct_stats_show(struct cgroup_subsys_state *css, -			      struct cftype *cft, struct cgroup_map_cb *cb) +static int cpuacct_stats_show(struct seq_file *sf, void *v)  { -	struct cpuacct *ca = css_ca(css); +	struct cpuacct *ca = css_ca(seq_css(sf));  	int cpu;  	s64 val = 0; @@ -196,7 +194,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,  		val += kcpustat->cpustat[CPUTIME_NICE];  	}  	val = cputime64_to_clock_t(val); -	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); +	seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);  	val = 0;  	for_each_online_cpu(cpu) { @@ -207,7 +205,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,  	}  	val = cputime64_to_clock_t(val); -	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); +	seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);  	return 0;  } @@ -220,11 +218,11 @@ static struct cftype files[] = {  	},  	{  		.name = "usage_percpu", -		.read_seq_string = cpuacct_percpu_seq_read, +		.seq_show = cpuacct_percpu_seq_show,  	},  	{  		.name = "stat", -		.read_map = cpuacct_stats_show, +		.seq_show = cpuacct_stats_show,  	},  	{ }	/* terminate */  }; @@ -277,11 +275,9 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)  	rcu_read_unlock();  } -struct cgroup_subsys cpuacct_subsys = { -	.name		= "cpuacct", +struct cgroup_subsys cpuacct_cgrp_subsys = {  	.css_alloc	= cpuacct_css_alloc,  	.css_free	= cpuacct_css_free, -	.subsys_id	= cpuacct_subsys_id,  	.base_cftypes	= files,  	.early_init	= 1,  }; diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c new file mode 100644 index 00000000000..bd95963dae8 --- /dev/null +++ b/kernel/sched/cpudeadline.c @@ -0,0 +1,229 @@ +/* + *  kernel/sched/cpudl.c + * + *  Global CPU deadline management + * + *  Author: Juri Lelli <j.lelli@sssup.it> + * + *  This program is free software; you can redistribute it and/or + *  modify it under the terms of the GNU General Public License + *  as published by the Free Software Foundation; version 2 + *  of the License. 
+ */ + +#include <linux/gfp.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include "cpudeadline.h" + +static inline int parent(int i) +{ +	return (i - 1) >> 1; +} + +static inline int left_child(int i) +{ +	return (i << 1) + 1; +} + +static inline int right_child(int i) +{ +	return (i << 1) + 2; +} + +static inline int dl_time_before(u64 a, u64 b) +{ +	return (s64)(a - b) < 0; +} + +static void cpudl_exchange(struct cpudl *cp, int a, int b) +{ +	int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; + +	swap(cp->elements[a].cpu, cp->elements[b].cpu); +	swap(cp->elements[a].dl , cp->elements[b].dl ); + +	swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); +} + +static void cpudl_heapify(struct cpudl *cp, int idx) +{ +	int l, r, largest; + +	/* adapted from lib/prio_heap.c */ +	while(1) { +		l = left_child(idx); +		r = right_child(idx); +		largest = idx; + +		if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, +							cp->elements[l].dl)) +			largest = l; +		if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, +							cp->elements[r].dl)) +			largest = r; +		if (largest == idx) +			break; + +		/* Push idx down the heap one level and bump one up */ +		cpudl_exchange(cp, largest, idx); +		idx = largest; +	} +} + +static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) +{ +	WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); + +	if (dl_time_before(new_dl, cp->elements[idx].dl)) { +		cp->elements[idx].dl = new_dl; +		cpudl_heapify(cp, idx); +	} else { +		cp->elements[idx].dl = new_dl; +		while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, +					cp->elements[idx].dl)) { +			cpudl_exchange(cp, idx, parent(idx)); +			idx = parent(idx); +		} +	} +} + +static inline int cpudl_maximum(struct cpudl *cp) +{ +	return cp->elements[0].cpu; +} + +/* + * cpudl_find - find the best (later-dl) CPU in the system + * @cp: the cpudl max-heap context + * @p: the task + * @later_mask: a mask to fill in with the selected CPUs (or NULL) + * + * Returns: int - best CPU (heap maximum if suitable) + */ +int cpudl_find(struct cpudl *cp, struct task_struct *p, +	       struct cpumask *later_mask) +{ +	int best_cpu = -1; +	const struct sched_dl_entity *dl_se = &p->dl; + +	if (later_mask && cpumask_and(later_mask, cp->free_cpus, +			&p->cpus_allowed) && cpumask_and(later_mask, +			later_mask, cpu_active_mask)) { +		best_cpu = cpumask_any(later_mask); +		goto out; +	} else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && +			dl_time_before(dl_se->deadline, cp->elements[0].dl)) { +		best_cpu = cpudl_maximum(cp); +		if (later_mask) +			cpumask_set_cpu(best_cpu, later_mask); +	} + +out: +	WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); + +	return best_cpu; +} + +/* + * cpudl_set - update the cpudl max-heap + * @cp: the cpudl max-heap context + * @cpu: the target cpu + * @dl: the new earliest deadline for this cpu + * + * Notes: assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) +{ +	int old_idx, new_cpu; +	unsigned long flags; + +	WARN_ON(!cpu_present(cpu)); + +	raw_spin_lock_irqsave(&cp->lock, flags); +	old_idx = cp->elements[cpu].idx; +	if (!is_valid) { +		/* remove item */ +		if (old_idx == IDX_INVALID) { +			/* +			 * Nothing to remove if old_idx was invalid. +			 * This could happen if a rq_offline_dl is +			 * called for a CPU without -dl tasks running. 
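dl_time_before() above orders two u64 deadlines through a signed difference, the same trick used by time_before(), so the comparison stays correct across clock wraparound as long as the two values are less than 2^63 apart. A small stand-alone demonstration with invented values:

/* dl_time_before_demo.c -- why (s64)(a - b) < 0 survives u64 wraparound. */
#include <stdio.h>
#include <stdint.h>

static int dl_time_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;	/* valid while |a - b| < 2^63 */
}

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 1000;	/* deadline just before wrap */
	uint64_t wrapped   = 5000;		/* deadline just after wrap */

	/* The naive comparison gets the order wrong ... */
	printf("naive:  near_wrap < wrapped ? %d\n", near_wrap < wrapped);
	/* ... the signed-difference form still says near_wrap is earlier. */
	printf("signed: dl_time_before(near_wrap, wrapped) = %d\n",
	       dl_time_before(near_wrap, wrapped));
	return 0;
}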
+			 */ +			goto out; +		} +		new_cpu = cp->elements[cp->size - 1].cpu; +		cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; +		cp->elements[old_idx].cpu = new_cpu; +		cp->size--; +		cp->elements[new_cpu].idx = old_idx; +		cp->elements[cpu].idx = IDX_INVALID; +		while (old_idx > 0 && dl_time_before( +				cp->elements[parent(old_idx)].dl, +				cp->elements[old_idx].dl)) { +			cpudl_exchange(cp, old_idx, parent(old_idx)); +			old_idx = parent(old_idx); +		} +		cpumask_set_cpu(cpu, cp->free_cpus); +                cpudl_heapify(cp, old_idx); + +		goto out; +	} + +	if (old_idx == IDX_INVALID) { +		cp->size++; +		cp->elements[cp->size - 1].dl = 0; +		cp->elements[cp->size - 1].cpu = cpu; +		cp->elements[cpu].idx = cp->size - 1; +		cpudl_change_key(cp, cp->size - 1, dl); +		cpumask_clear_cpu(cpu, cp->free_cpus); +	} else { +		cpudl_change_key(cp, old_idx, dl); +	} + +out: +	raw_spin_unlock_irqrestore(&cp->lock, flags); +} + +/* + * cpudl_init - initialize the cpudl structure + * @cp: the cpudl max-heap context + */ +int cpudl_init(struct cpudl *cp) +{ +	int i; + +	memset(cp, 0, sizeof(*cp)); +	raw_spin_lock_init(&cp->lock); +	cp->size = 0; + +	cp->elements = kcalloc(nr_cpu_ids, +			       sizeof(struct cpudl_item), +			       GFP_KERNEL); +	if (!cp->elements) +		return -ENOMEM; + +	if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { +		kfree(cp->elements); +		return -ENOMEM; +	} + +	for_each_possible_cpu(i) +		cp->elements[i].idx = IDX_INVALID; + +	cpumask_setall(cp->free_cpus); + +	return 0; +} + +/* + * cpudl_cleanup - clean up the cpudl structure + * @cp: the cpudl max-heap context + */ +void cpudl_cleanup(struct cpudl *cp) +{ +	free_cpumask_var(cp->free_cpus); +	kfree(cp->elements); +} diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h new file mode 100644 index 00000000000..538c9796ad4 --- /dev/null +++ b/kernel/sched/cpudeadline.h @@ -0,0 +1,33 @@ +#ifndef _LINUX_CPUDL_H +#define _LINUX_CPUDL_H + +#include <linux/sched.h> + +#define IDX_INVALID     -1 + +struct cpudl_item { +	u64 dl; +	int cpu; +	int idx; +}; + +struct cpudl { +	raw_spinlock_t lock; +	int size; +	cpumask_var_t free_cpus; +	struct cpudl_item *elements; +}; + + +#ifdef CONFIG_SMP +int cpudl_find(struct cpudl *cp, struct task_struct *p, +	       struct cpumask *later_mask); +void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); +int cpudl_init(struct cpudl *cp); +void cpudl_cleanup(struct cpudl *cp); +#else +#define cpudl_set(cp, cpu, dl) do { } while (0) +#define cpudl_init() do { } while (0) +#endif /* CONFIG_SMP */ + +#endif /* _LINUX_CPUDL_H */ diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 8b836b376d9..981fcd7dc39 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -30,6 +30,7 @@  #include <linux/gfp.h>  #include <linux/sched.h>  #include <linux/sched/rt.h> +#include <linux/slab.h>  #include "cpupri.h"  /* Convert between a 140 based task->prio, and our 102 based cpupri */ @@ -70,8 +71,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,  	int idx = 0;  	int task_pri = convert_prio(p->prio); -	if (task_pri >= MAX_RT_PRIO) -		return 0; +	BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);  	for (idx = 0; idx < task_pri; idx++) {  		struct cpupri_vec *vec  = &cp->pri_to_cpu[idx]; @@ -165,7 +165,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)  		 * do a write memory barrier, and then update the count, to  		 * make sure the vector is visible when count is set.  		 
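The cpudl code above keeps one element per CPU in an array-backed max-heap ordered by each CPU's earliest deadline: the root (index 0) is the CPU whose earliest deadline is latest, and positions follow the usual (i - 1)/2 parent and 2i + 1 / 2i + 2 child arithmetic. Below is a reduced, deadlines-only sketch of the sift-down step performed by cpudl_heapify(); the real code additionally keeps the cpu numbers and the cpu -> index map in sync and compares with dl_time_before() rather than a plain ">".

/* heapify_demo.c -- illustrative max-heap sift-down over deadlines only. */
#include <stdio.h>
#include <stdint.h>

static int left_child(int i)  { return (i << 1) + 1; }
static int right_child(int i) { return (i << 1) + 2; }

static void sift_down(uint64_t *dl, int size, int idx)
{
	for (;;) {
		int l = left_child(idx), r = right_child(idx), largest = idx;

		if (l < size && dl[l] > dl[largest])
			largest = l;
		if (r < size && dl[r] > dl[largest])
			largest = r;
		if (largest == idx)
			break;

		uint64_t tmp = dl[idx];
		dl[idx] = dl[largest];
		dl[largest] = tmp;
		idx = largest;
	}
}

int main(void)
{
	/* The root got a smaller key (its CPU now has an earlier deadline):
	 * sift it down to restore the max-heap property. */
	uint64_t dl[] = { 100, 900, 800, 400, 300 };
	sift_down(dl, 5, 0);

	for (int i = 0; i < 5; i++)
		printf("%llu ", (unsigned long long)dl[i]);
	printf("\n");	/* prints: 900 400 800 100 300 */
	return 0;
}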
*/ -		smp_mb__before_atomic_inc(); +		smp_mb__before_atomic();  		atomic_inc(&(vec)->count);  		do_mb = 1;  	} @@ -185,14 +185,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)  		 * the new priority vec.  		 */  		if (do_mb) -			smp_mb__after_atomic_inc(); +			smp_mb__after_atomic();  		/*  		 * When removing from the vector, we decrement the counter first  		 * do a memory barrier and then clear the mask.  		 */  		atomic_dec(&(vec)->count); -		smp_mb__after_atomic_inc(); +		smp_mb__after_atomic();  		cpumask_clear_cpu(cpu, vec->mask);  	} @@ -219,8 +219,13 @@ int cpupri_init(struct cpupri *cp)  			goto cleanup;  	} +	cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL); +	if (!cp->cpu_to_pri) +		goto cleanup; +  	for_each_possible_cpu(i)  		cp->cpu_to_pri[i] = CPUPRI_INVALID; +  	return 0;  cleanup: @@ -237,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp)  {  	int i; +	kfree(cp->cpu_to_pri);  	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)  		free_cpumask_var(cp->pri_to_cpu[i].mask);  } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index f6d75617349..6b033347fdf 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -17,7 +17,7 @@ struct cpupri_vec {  struct cpupri {  	struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; -	int               cpu_to_pri[NR_CPUS]; +	int *cpu_to_pri;  };  #ifdef CONFIG_SMP diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 99947919e30..72fdf06ef86 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,  	p->utimescaled += cputime_scaled;  	account_group_user_time(p, cputime); -	index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; +	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;  	/* Add user time to cpustat. */  	task_group_account_field(p, index, (__force u64) cputime); @@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,  	p->gtime += cputime;  	/* Add guest time to cpustat. */ -	if (TASK_NICE(p) > 0) { +	if (task_nice(p) > 0) {  		cpustat[CPUTIME_NICE] += (__force u64) cputime;  		cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;  	} else { @@ -258,16 +258,22 @@ static __always_inline bool steal_account_process_tick(void)  {  #ifdef CONFIG_PARAVIRT  	if (static_key_false(¶virt_steal_enabled)) { -		u64 steal, st = 0; +		u64 steal; +		cputime_t steal_ct;  		steal = paravirt_steal_clock(smp_processor_id());  		steal -= this_rq()->prev_steal_time; -		st = steal_ticks(steal); -		this_rq()->prev_steal_time += st * TICK_NSEC; +		/* +		 * cputime_t may be less precise than nsecs (eg: if it's +		 * based on jiffies). Lets cast the result to cputime +		 * granularity and account the rest on the next rounds. +		 */ +		steal_ct = nsecs_to_cputime(steal); +		this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); -		account_steal_time(st); -		return st; +		account_steal_time(steal_ct); +		return steal_ct;  	}  #endif  	return false; @@ -326,50 +332,50 @@ out:   * softirq as those do not count in task exec_runtime any more.   
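In the steal-time change above, prev_steal_time is advanced only by the amount actually accounted at cputime_t granularity, so stolen nanoseconds that do not yet add up to a whole tick are carried into the next round instead of being dropped. A toy model of that carry, with invented numbers and a 4 ms tick (not the kernel code):

/* steal_carry_demo.c -- toy model of carrying sub-tick steal time. */
#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC	4000000ULL		/* 4 ms per tick (HZ=250) */

static uint64_t paravirt_steal_clock;		/* total steal reported by the host */
static uint64_t prev_steal_time;		/* steal already accounted, in ns */

static uint64_t account_steal(void)
{
	uint64_t steal = paravirt_steal_clock - prev_steal_time;
	uint64_t ticks = steal / TICK_NSEC;	/* nsecs_to_cputime() stand-in */

	/* Advance only by what was accounted; the remainder stays pending. */
	prev_steal_time += ticks * TICK_NSEC;	/* cputime_to_nsecs() stand-in */
	return ticks;
}

int main(void)
{
	paravirt_steal_clock = 6500000;		/* 6.5 ms stolen so far */
	printf("round 1: %llu tick(s) accounted\n",
	       (unsigned long long)account_steal());	/* 1 tick, 2.5 ms pending */

	paravirt_steal_clock += 2000000;	/* 2 ms more stolen */
	printf("round 2: %llu tick(s) accounted\n",
	       (unsigned long long)account_steal());	/* 4.5 ms pending -> 1 tick */
	return 0;
}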
*/  static void irqtime_account_process_tick(struct task_struct *p, int user_tick, -						struct rq *rq) +					 struct rq *rq, int ticks)  { -	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); +	cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); +	u64 cputime = (__force u64) cputime_one_jiffy;  	u64 *cpustat = kcpustat_this_cpu->cpustat;  	if (steal_account_process_tick())  		return; +	cputime *= ticks; +	scaled *= ticks; +  	if (irqtime_account_hi_update()) { -		cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; +		cpustat[CPUTIME_IRQ] += cputime;  	} else if (irqtime_account_si_update()) { -		cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; +		cpustat[CPUTIME_SOFTIRQ] += cputime;  	} else if (this_cpu_ksoftirqd() == p) {  		/*  		 * ksoftirqd time do not get accounted in cpu_softirq_time.  		 * So, we have to handle it separately here.  		 * Also, p->stime needs to be updated for ksoftirqd.  		 */ -		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, -					CPUTIME_SOFTIRQ); +		__account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);  	} else if (user_tick) { -		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); +		account_user_time(p, cputime, scaled);  	} else if (p == rq->idle) { -		account_idle_time(cputime_one_jiffy); +		account_idle_time(cputime);  	} else if (p->flags & PF_VCPU) { /* System time or guest time */ -		account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); +		account_guest_time(p, cputime, scaled);  	} else { -		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, -					CPUTIME_SYSTEM); +		__account_system_time(p, cputime, scaled,	CPUTIME_SYSTEM);  	}  }  static void irqtime_account_idle_ticks(int ticks)  { -	int i;  	struct rq *rq = this_rq(); -	for (i = 0; i < ticks; i++) -		irqtime_account_process_tick(current, 0, rq); +	irqtime_account_process_tick(current, 0, rq, ticks);  }  #else /* CONFIG_IRQ_TIME_ACCOUNTING */  static inline void irqtime_account_idle_ticks(int ticks) {}  static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, -						struct rq *rq) {} +						struct rq *rq, int nr_ticks) {}  #endif /* CONFIG_IRQ_TIME_ACCOUNTING */  /* @@ -458,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick)  		return;  	if (sched_clock_irqtime) { -		irqtime_account_process_tick(p, user_tick, rq); +		irqtime_account_process_tick(p, user_tick, rq, 1);  		return;  	} diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c new file mode 100644 index 00000000000..fc4f98b1258 --- /dev/null +++ b/kernel/sched/deadline.c @@ -0,0 +1,1676 @@ +/* + * Deadline Scheduling Class (SCHED_DEADLINE) + * + * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS). + * + * Tasks that periodically executes their instances for less than their + * runtime won't miss any of their deadlines. + * Tasks that are not periodic or sporadic or that tries to execute more + * than their reserved bandwidth will be slowed down (and may potentially + * miss some of their deadlines), and won't affect any other task. 
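The header comment above spells out the contract SCHED_DEADLINE offers. From userspace a reservation is requested with sched_setattr(); the sketch below is in the spirit of the example in Documentation/scheduler/sched-deadline.txt, with the attribute structure laid out by hand because glibc provides no wrapper. It assumes <sys/syscall.h> defines SYS_sched_setattr (otherwise the architecture's __NR_sched_setattr must be supplied manually), and switching to a deadline policy typically requires root.

/* dl_task_demo.c -- minimal SCHED_DEADLINE user, illustrative only. */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	/* SCHED_DEADLINE parameters, in nanoseconds */
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

#define SCHED_DEADLINE	6

int main(void)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_runtime	= 10 * 1000 * 1000,	/* 10 ms every ...   */
		.sched_period	= 100 * 1000 * 1000,	/* ... 100 ms,       */
		.sched_deadline	= 100 * 1000 * 1000,	/* deadline == period */
	};

	if (syscall(SYS_sched_setattr, 0, &attr, 0) < 0) {
		perror("sched_setattr");
		return 1;
	}

	printf("running with a 10ms/100ms CBS reservation\n");
	/* A real periodic task would do its job here and then sleep until
	 * its next activation; overrunning is throttled by the CBS. */
	return 0;
}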
+ * + * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>, + *                    Juri Lelli <juri.lelli@gmail.com>, + *                    Michael Trimarchi <michael@amarulasolutions.com>, + *                    Fabio Checconi <fchecconi@gmail.com> + */ +#include "sched.h" + +#include <linux/slab.h> + +struct dl_bandwidth def_dl_bandwidth; + +static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) +{ +	return container_of(dl_se, struct task_struct, dl); +} + +static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) +{ +	return container_of(dl_rq, struct rq, dl); +} + +static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) +{ +	struct task_struct *p = dl_task_of(dl_se); +	struct rq *rq = task_rq(p); + +	return &rq->dl; +} + +static inline int on_dl_rq(struct sched_dl_entity *dl_se) +{ +	return !RB_EMPTY_NODE(&dl_se->rb_node); +} + +static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) +{ +	struct sched_dl_entity *dl_se = &p->dl; + +	return dl_rq->rb_leftmost == &dl_se->rb_node; +} + +void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) +{ +	raw_spin_lock_init(&dl_b->dl_runtime_lock); +	dl_b->dl_period = period; +	dl_b->dl_runtime = runtime; +} + +void init_dl_bw(struct dl_bw *dl_b) +{ +	raw_spin_lock_init(&dl_b->lock); +	raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock); +	if (global_rt_runtime() == RUNTIME_INF) +		dl_b->bw = -1; +	else +		dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime()); +	raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock); +	dl_b->total_bw = 0; +} + +void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) +{ +	dl_rq->rb_root = RB_ROOT; + +#ifdef CONFIG_SMP +	/* zero means no -deadline tasks */ +	dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; + +	dl_rq->dl_nr_migratory = 0; +	dl_rq->overloaded = 0; +	dl_rq->pushable_dl_tasks_root = RB_ROOT; +#else +	init_dl_bw(&dl_rq->dl_bw); +#endif +} + +#ifdef CONFIG_SMP + +static inline int dl_overloaded(struct rq *rq) +{ +	return atomic_read(&rq->rd->dlo_count); +} + +static inline void dl_set_overload(struct rq *rq) +{ +	if (!rq->online) +		return; + +	cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask); +	/* +	 * Must be visible before the overload count is +	 * set (as in sched_rt.c). +	 * +	 * Matched by the barrier in pull_dl_task(). +	 */ +	smp_wmb(); +	atomic_inc(&rq->rd->dlo_count); +} + +static inline void dl_clear_overload(struct rq *rq) +{ +	if (!rq->online) +		return; + +	atomic_dec(&rq->rd->dlo_count); +	cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask); +} + +static void update_dl_migration(struct dl_rq *dl_rq) +{ +	if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) { +		if (!dl_rq->overloaded) { +			dl_set_overload(rq_of_dl_rq(dl_rq)); +			dl_rq->overloaded = 1; +		} +	} else if (dl_rq->overloaded) { +		dl_clear_overload(rq_of_dl_rq(dl_rq)); +		dl_rq->overloaded = 0; +	} +} + +static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +	struct task_struct *p = dl_task_of(dl_se); + +	if (p->nr_cpus_allowed > 1) +		dl_rq->dl_nr_migratory++; + +	update_dl_migration(dl_rq); +} + +static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +	struct task_struct *p = dl_task_of(dl_se); + +	if (p->nr_cpus_allowed > 1) +		dl_rq->dl_nr_migratory--; + +	update_dl_migration(dl_rq); +} + +/* + * The list of pushable -deadline task is not a plist, like in + * sched_rt.c, it is an rb-tree with tasks ordered by deadline. 
+ */ +static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ +	struct dl_rq *dl_rq = &rq->dl; +	struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node; +	struct rb_node *parent = NULL; +	struct task_struct *entry; +	int leftmost = 1; + +	BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); + +	while (*link) { +		parent = *link; +		entry = rb_entry(parent, struct task_struct, +				 pushable_dl_tasks); +		if (dl_entity_preempt(&p->dl, &entry->dl)) +			link = &parent->rb_left; +		else { +			link = &parent->rb_right; +			leftmost = 0; +		} +	} + +	if (leftmost) +		dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; + +	rb_link_node(&p->pushable_dl_tasks, parent, link); +	rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); +} + +static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ +	struct dl_rq *dl_rq = &rq->dl; + +	if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) +		return; + +	if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) { +		struct rb_node *next_node; + +		next_node = rb_next(&p->pushable_dl_tasks); +		dl_rq->pushable_dl_tasks_leftmost = next_node; +	} + +	rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); +	RB_CLEAR_NODE(&p->pushable_dl_tasks); +} + +static inline int has_pushable_dl_tasks(struct rq *rq) +{ +	return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root); +} + +static int push_dl_task(struct rq *rq); + +static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) +{ +	return dl_task(prev); +} + +static inline void set_post_schedule(struct rq *rq) +{ +	rq->post_schedule = has_pushable_dl_tasks(rq); +} + +#else + +static inline +void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline +void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline +void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +} + +static inline +void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +} + +static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) +{ +	return false; +} + +static inline int pull_dl_task(struct rq *rq) +{ +	return 0; +} + +static inline void set_post_schedule(struct rq *rq) +{ +} +#endif /* CONFIG_SMP */ + +static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); +static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); +static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, +				  int flags); + +/* + * We are being explicitly informed that a new instance is starting, + * and this means that: + *  - the absolute deadline of the entity has to be placed at + *    current time + relative deadline; + *  - the runtime of the entity has to be set to the maximum value. + * + * The capability of specifying such event is useful whenever a -deadline + * entity wants to (try to!) synchronize its behaviour with the scheduler's + * one, and to (try to!) reconcile itself with its own scheduling + * parameters. + */ +static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, +				       struct sched_dl_entity *pi_se) +{ +	struct dl_rq *dl_rq = dl_rq_of_se(dl_se); +	struct rq *rq = rq_of_dl_rq(dl_rq); + +	WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); + +	/* +	 * We use the regular wall clock time to set deadlines in the +	 * future; in fact, we must consider execution overheads (time +	 * spent on hardirq context, etc.). 
+	 */ +	dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; +	dl_se->runtime = pi_se->dl_runtime; +	dl_se->dl_new = 0; +} + +/* + * Pure Earliest Deadline First (EDF) scheduling does not deal with the + * possibility of a entity lasting more than what it declared, and thus + * exhausting its runtime. + * + * Here we are interested in making runtime overrun possible, but we do + * not want a entity which is misbehaving to affect the scheduling of all + * other entities. + * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS) + * is used, in order to confine each entity within its own bandwidth. + * + * This function deals exactly with that, and ensures that when the runtime + * of a entity is replenished, its deadline is also postponed. That ensures + * the overrunning entity can't interfere with other entity in the system and + * can't make them miss their deadlines. Reasons why this kind of overruns + * could happen are, typically, a entity voluntarily trying to overcome its + * runtime, or it just underestimated it during sched_setscheduler_ex(). + */ +static void replenish_dl_entity(struct sched_dl_entity *dl_se, +				struct sched_dl_entity *pi_se) +{ +	struct dl_rq *dl_rq = dl_rq_of_se(dl_se); +	struct rq *rq = rq_of_dl_rq(dl_rq); + +	BUG_ON(pi_se->dl_runtime <= 0); + +	/* +	 * This could be the case for a !-dl task that is boosted. +	 * Just go with full inherited parameters. +	 */ +	if (dl_se->dl_deadline == 0) { +		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; +		dl_se->runtime = pi_se->dl_runtime; +	} + +	/* +	 * We keep moving the deadline away until we get some +	 * available runtime for the entity. This ensures correct +	 * handling of situations where the runtime overrun is +	 * arbitrary large. +	 */ +	while (dl_se->runtime <= 0) { +		dl_se->deadline += pi_se->dl_period; +		dl_se->runtime += pi_se->dl_runtime; +	} + +	/* +	 * At this point, the deadline really should be "in +	 * the future" with respect to rq->clock. If it's +	 * not, we are, for some reason, lagging too much! +	 * Anyway, after having warn userspace abut that, +	 * we still try to keep the things running by +	 * resetting the deadline and the budget of the +	 * entity. +	 */ +	if (dl_time_before(dl_se->deadline, rq_clock(rq))) { +		printk_deferred_once("sched: DL replenish lagged to much\n"); +		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; +		dl_se->runtime = pi_se->dl_runtime; +	} +} + +/* + * Here we check if --at time t-- an entity (which is probably being + * [re]activated or, in general, enqueued) can use its remaining runtime + * and its current deadline _without_ exceeding the bandwidth it is + * assigned (function returns true if it can't). We are in fact applying + * one of the CBS rules: when a task wakes up, if the residual runtime + * over residual deadline fits within the allocated bandwidth, then we + * can keep the current (absolute) deadline and residual budget without + * disrupting the schedulability of the system. Otherwise, we should + * refill the runtime and set the deadline a period in the future, + * because keeping the current (absolute) deadline of the task would + * result in breaking guarantees promised to other tasks (refer to + * Documentation/scheduler/sched-deadline.txt for more informations). + * + * This function returns true if: + * + *   runtime / (deadline - t) > dl_runtime / dl_period , + * + * IOW we can't recycle current parameters. + * + * Notice that the bandwidth check is done against the period. 
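replenish_dl_entity() above pays an overrun back by pushing the deadline away one period at a time, adding one period's worth of runtime per step, until the runtime is positive again. A toy walk-through with invented numbers (a 10 ms / 100 ms reservation that is 25 ms in debt):

/* cbs_replenish_demo.c -- toy walk-through of the CBS replenishment loop. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const int64_t dl_runtime = 10;		/* ms per period */
	const int64_t dl_period  = 100;		/* ms */

	int64_t runtime  = -25;			/* overran by 25 ms */
	int64_t deadline = 1000;		/* absolute time, ms */

	while (runtime <= 0) {
		deadline += dl_period;
		runtime  += dl_runtime;
		printf("postpone: deadline=%lld runtime=%lld\n",
		       (long long)deadline, (long long)runtime);
	}
	/* Ends with deadline=1300, runtime=5: the overrun is paid back by
	 * pushing the deadline three periods into the future. */
	return 0;
}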
For + * task with deadline equal to period this is the same of using + * dl_deadline instead of dl_period in the equation above. + */ +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, +			       struct sched_dl_entity *pi_se, u64 t) +{ +	u64 left, right; + +	/* +	 * left and right are the two sides of the equation above, +	 * after a bit of shuffling to use multiplications instead +	 * of divisions. +	 * +	 * Note that none of the time values involved in the two +	 * multiplications are absolute: dl_deadline and dl_runtime +	 * are the relative deadline and the maximum runtime of each +	 * instance, runtime is the runtime left for the last instance +	 * and (deadline - t), since t is rq->clock, is the time left +	 * to the (absolute) deadline. Even if overflowing the u64 type +	 * is very unlikely to occur in both cases, here we scale down +	 * as we want to avoid that risk at all. Scaling down by 10 +	 * means that we reduce granularity to 1us. We are fine with it, +	 * since this is only a true/false check and, anyway, thinking +	 * of anything below microseconds resolution is actually fiction +	 * (but still we want to give the user that illusion >;). +	 */ +	left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); +	right = ((dl_se->deadline - t) >> DL_SCALE) * +		(pi_se->dl_runtime >> DL_SCALE); + +	return dl_time_before(right, left); +} + +/* + * When a -deadline entity is queued back on the runqueue, its runtime and + * deadline might need updating. + * + * The policy here is that we update the deadline of the entity only if: + *  - the current deadline is in the past, + *  - using the remaining runtime with the current deadline would make + *    the entity exceed its bandwidth. + */ +static void update_dl_entity(struct sched_dl_entity *dl_se, +			     struct sched_dl_entity *pi_se) +{ +	struct dl_rq *dl_rq = dl_rq_of_se(dl_se); +	struct rq *rq = rq_of_dl_rq(dl_rq); + +	/* +	 * The arrival of a new instance needs special treatment, i.e., +	 * the actual scheduling parameters have to be "renewed". +	 */ +	if (dl_se->dl_new) { +		setup_new_dl_entity(dl_se, pi_se); +		return; +	} + +	if (dl_time_before(dl_se->deadline, rq_clock(rq)) || +	    dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { +		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; +		dl_se->runtime = pi_se->dl_runtime; +	} +} + +/* + * If the entity depleted all its runtime, and if we want it to sleep + * while waiting for some new execution time to become available, we + * set the bandwidth enforcement timer to the replenishment instant + * and try to activate it. + * + * Notice that it is important for the caller to know if the timer + * actually started or not (i.e., the replenishment instant is in + * the future or in the past). + */ +static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) +{ +	struct dl_rq *dl_rq = dl_rq_of_se(dl_se); +	struct rq *rq = rq_of_dl_rq(dl_rq); +	ktime_t now, act; +	ktime_t soft, hard; +	unsigned long range; +	s64 delta; + +	if (boosted) +		return 0; +	/* +	 * We want the timer to fire at the deadline, but considering +	 * that it is actually coming from rq->clock and not from +	 * hrtimer's time base reading. +	 */ +	act = ns_to_ktime(dl_se->deadline); +	now = hrtimer_cb_get_time(&dl_se->dl_timer); +	delta = ktime_to_ns(now) - rq_clock(rq); +	act = ktime_add_ns(act, delta); + +	/* +	 * If the expiry time already passed, e.g., because the value +	 * chosen as the deadline is too small, don't even try to +	 * start the timer in the past! 
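dl_entity_overflow() above evaluates runtime / (deadline - t) > dl_runtime / dl_period by cross-multiplying, shifting everything right by DL_SCALE first so the products stay well inside 64 bits. A stand-alone version of the same test, exercised on two invented wakeup scenarios:

/* dl_overflow_demo.c -- toy version of the CBS wakeup test: may the task
 * keep its current deadline and leftover runtime without exceeding its
 * reserved bandwidth?
 */
#include <stdio.h>
#include <stdint.h>

#define DL_SCALE 10	/* mirror the kernel's ns -> ~us scaling */

static int dl_entity_overflow(uint64_t runtime, uint64_t deadline, uint64_t t,
			      uint64_t dl_runtime, uint64_t dl_period)
{
	/* runtime / (deadline - t) > dl_runtime / dl_period, no divisions */
	uint64_t left  = (dl_period >> DL_SCALE) * (runtime >> DL_SCALE);
	uint64_t right = ((deadline - t) >> DL_SCALE) * (dl_runtime >> DL_SCALE);

	return (int64_t)(right - left) < 0;
}

int main(void)
{
	/* 10 ms runtime every 100 ms; all values in nanoseconds. */
	uint64_t dl_runtime = 10000000, dl_period = 100000000;

	/* Wakes up with 4 ms left and 50 ms to its old deadline:
	 * 4/50 = 0.08 <= 0.10 -> keep current deadline and runtime (0). */
	printf("case 1 overflow? %d\n",
	       dl_entity_overflow(4000000, 150000000, 100000000,
				  dl_runtime, dl_period));

	/* Wakes up with 8 ms left but only 20 ms to the deadline:
	 * 8/20 = 0.40 > 0.10 -> refill and push the deadline instead (1). */
	printf("case 2 overflow? %d\n",
	       dl_entity_overflow(8000000, 120000000, 100000000,
				  dl_runtime, dl_period));
	return 0;
}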
+	 */ +	if (ktime_us_delta(act, now) < 0) +		return 0; + +	hrtimer_set_expires(&dl_se->dl_timer, act); + +	soft = hrtimer_get_softexpires(&dl_se->dl_timer); +	hard = hrtimer_get_expires(&dl_se->dl_timer); +	range = ktime_to_ns(ktime_sub(hard, soft)); +	__hrtimer_start_range_ns(&dl_se->dl_timer, soft, +				 range, HRTIMER_MODE_ABS, 0); + +	return hrtimer_active(&dl_se->dl_timer); +} + +/* + * This is the bandwidth enforcement timer callback. If here, we know + * a task is not on its dl_rq, since the fact that the timer was running + * means the task is throttled and needs a runtime replenishment. + * + * However, what we actually do depends on the fact the task is active, + * (it is on its rq) or has been removed from there by a call to + * dequeue_task_dl(). In the former case we must issue the runtime + * replenishment and add the task back to the dl_rq; in the latter, we just + * do nothing but clearing dl_throttled, so that runtime and deadline + * updating (and the queueing back to dl_rq) will be done by the + * next call to enqueue_task_dl(). + */ +static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) +{ +	struct sched_dl_entity *dl_se = container_of(timer, +						     struct sched_dl_entity, +						     dl_timer); +	struct task_struct *p = dl_task_of(dl_se); +	struct rq *rq; +again: +	rq = task_rq(p); +	raw_spin_lock(&rq->lock); + +	if (rq != task_rq(p)) { +		/* Task was moved, retrying. */ +		raw_spin_unlock(&rq->lock); +		goto again; +	} + +	/* +	 * We need to take care of a possible races here. In fact, the +	 * task might have changed its scheduling policy to something +	 * different from SCHED_DEADLINE or changed its reservation +	 * parameters (through sched_setattr()). +	 */ +	if (!dl_task(p) || dl_se->dl_new) +		goto unlock; + +	sched_clock_tick(); +	update_rq_clock(rq); +	dl_se->dl_throttled = 0; +	dl_se->dl_yielded = 0; +	if (p->on_rq) { +		enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); +		if (task_has_dl_policy(rq->curr)) +			check_preempt_curr_dl(rq, p, 0); +		else +			resched_task(rq->curr); +#ifdef CONFIG_SMP +		/* +		 * Queueing this task back might have overloaded rq, +		 * check if we need to kick someone away. +		 */ +		if (has_pushable_dl_tasks(rq)) +			push_dl_task(rq); +#endif +	} +unlock: +	raw_spin_unlock(&rq->lock); + +	return HRTIMER_NORESTART; +} + +void init_dl_task_timer(struct sched_dl_entity *dl_se) +{ +	struct hrtimer *timer = &dl_se->dl_timer; + +	if (hrtimer_active(timer)) { +		hrtimer_try_to_cancel(timer); +		return; +	} + +	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	timer->function = dl_task_timer; +} + +static +int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) +{ +	int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); +	int rorun = dl_se->runtime <= 0; + +	if (!rorun && !dmiss) +		return 0; + +	/* +	 * If we are beyond our current deadline and we are still +	 * executing, then we have already used some of the runtime of +	 * the next instance. Thus, if we do not account that, we are +	 * stealing bandwidth from the system at each deadline miss! +	 */ +	if (dmiss) { +		dl_se->runtime = rorun ? dl_se->runtime : 0; +		dl_se->runtime -= rq_clock(rq) - dl_se->deadline; +	} + +	return 1; +} + +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); + +/* + * Update the current task's runtime statistics (provided it is still + * a -deadline task and has not been removed from the dl_rq). 
+ */ +static void update_curr_dl(struct rq *rq) +{ +	struct task_struct *curr = rq->curr; +	struct sched_dl_entity *dl_se = &curr->dl; +	u64 delta_exec; + +	if (!dl_task(curr) || !on_dl_rq(dl_se)) +		return; + +	/* +	 * Consumed budget is computed considering the time as +	 * observed by schedulable tasks (excluding time spent +	 * in hardirq context, etc.). Deadlines are instead +	 * computed using hard walltime. This seems to be the more +	 * natural solution, but the full ramifications of this +	 * approach need further study. +	 */ +	delta_exec = rq_clock_task(rq) - curr->se.exec_start; +	if (unlikely((s64)delta_exec <= 0)) +		return; + +	schedstat_set(curr->se.statistics.exec_max, +		      max(curr->se.statistics.exec_max, delta_exec)); + +	curr->se.sum_exec_runtime += delta_exec; +	account_group_exec_runtime(curr, delta_exec); + +	curr->se.exec_start = rq_clock_task(rq); +	cpuacct_charge(curr, delta_exec); + +	sched_rt_avg_update(rq, delta_exec); + +	dl_se->runtime -= delta_exec; +	if (dl_runtime_exceeded(rq, dl_se)) { +		__dequeue_task_dl(rq, curr, 0); +		if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) +			dl_se->dl_throttled = 1; +		else +			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); + +		if (!is_leftmost(curr, &rq->dl)) +			resched_task(curr); +	} + +	/* +	 * Because -- for now -- we share the rt bandwidth, we need to +	 * account our runtime there too, otherwise actual rt tasks +	 * would be able to exceed the shared quota. +	 * +	 * Account to the root rt group for now. +	 * +	 * The solution we're working towards is having the RT groups scheduled +	 * using deadline servers -- however there's a few nasties to figure +	 * out before that can happen. +	 */ +	if (rt_bandwidth_enabled()) { +		struct rt_rq *rt_rq = &rq->rt; + +		raw_spin_lock(&rt_rq->rt_runtime_lock); +		/* +		 * We'll let actual RT tasks worry about the overflow here, we +		 * have our own CBS to keep us inline; only account when RT +		 * bandwidth is relevant. +		 */ +		if (sched_rt_bandwidth_account(rt_rq)) +			rt_rq->rt_time += delta_exec; +		raw_spin_unlock(&rt_rq->rt_runtime_lock); +	} +} + +#ifdef CONFIG_SMP + +static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu); + +static inline u64 next_deadline(struct rq *rq) +{ +	struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu); + +	if (next && dl_prio(next->prio)) +		return next->dl.deadline; +	else +		return 0; +} + +static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) +{ +	struct rq *rq = rq_of_dl_rq(dl_rq); + +	if (dl_rq->earliest_dl.curr == 0 || +	    dl_time_before(deadline, dl_rq->earliest_dl.curr)) { +		/* +		 * If the dl_rq had no -deadline tasks, or if the new task +		 * has shorter deadline than the current one on dl_rq, we +		 * know that the previous earliest becomes our next earliest, +		 * as the new task becomes the earliest itself. +		 */ +		dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; +		dl_rq->earliest_dl.curr = deadline; +		cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); +	} else if (dl_rq->earliest_dl.next == 0 || +		   dl_time_before(deadline, dl_rq->earliest_dl.next)) { +		/* +		 * On the other hand, if the new -deadline task has a +		 * a later deadline than the earliest one on dl_rq, but +		 * it is earlier than the next (if any), we must +		 * recompute the next-earliest. 
+		 */ +		dl_rq->earliest_dl.next = next_deadline(rq); +	} +} + +static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) +{ +	struct rq *rq = rq_of_dl_rq(dl_rq); + +	/* +	 * Since we may have removed our earliest (and/or next earliest) +	 * task we must recompute them. +	 */ +	if (!dl_rq->dl_nr_running) { +		dl_rq->earliest_dl.curr = 0; +		dl_rq->earliest_dl.next = 0; +		cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); +	} else { +		struct rb_node *leftmost = dl_rq->rb_leftmost; +		struct sched_dl_entity *entry; + +		entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); +		dl_rq->earliest_dl.curr = entry->deadline; +		dl_rq->earliest_dl.next = next_deadline(rq); +		cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); +	} +} + +#else + +static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} +static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} + +#endif /* CONFIG_SMP */ + +static inline +void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +	int prio = dl_task_of(dl_se)->prio; +	u64 deadline = dl_se->deadline; + +	WARN_ON(!dl_prio(prio)); +	dl_rq->dl_nr_running++; +	add_nr_running(rq_of_dl_rq(dl_rq), 1); + +	inc_dl_deadline(dl_rq, deadline); +	inc_dl_migration(dl_se, dl_rq); +} + +static inline +void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +	int prio = dl_task_of(dl_se)->prio; + +	WARN_ON(!dl_prio(prio)); +	WARN_ON(!dl_rq->dl_nr_running); +	dl_rq->dl_nr_running--; +	sub_nr_running(rq_of_dl_rq(dl_rq), 1); + +	dec_dl_deadline(dl_rq, dl_se->deadline); +	dec_dl_migration(dl_se, dl_rq); +} + +static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) +{ +	struct dl_rq *dl_rq = dl_rq_of_se(dl_se); +	struct rb_node **link = &dl_rq->rb_root.rb_node; +	struct rb_node *parent = NULL; +	struct sched_dl_entity *entry; +	int leftmost = 1; + +	BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); + +	while (*link) { +		parent = *link; +		entry = rb_entry(parent, struct sched_dl_entity, rb_node); +		if (dl_time_before(dl_se->deadline, entry->deadline)) +			link = &parent->rb_left; +		else { +			link = &parent->rb_right; +			leftmost = 0; +		} +	} + +	if (leftmost) +		dl_rq->rb_leftmost = &dl_se->rb_node; + +	rb_link_node(&dl_se->rb_node, parent, link); +	rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root); + +	inc_dl_tasks(dl_se, dl_rq); +} + +static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) +{ +	struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + +	if (RB_EMPTY_NODE(&dl_se->rb_node)) +		return; + +	if (dl_rq->rb_leftmost == &dl_se->rb_node) { +		struct rb_node *next_node; + +		next_node = rb_next(&dl_se->rb_node); +		dl_rq->rb_leftmost = next_node; +	} + +	rb_erase(&dl_se->rb_node, &dl_rq->rb_root); +	RB_CLEAR_NODE(&dl_se->rb_node); + +	dec_dl_tasks(dl_se, dl_rq); +} + +static void +enqueue_dl_entity(struct sched_dl_entity *dl_se, +		  struct sched_dl_entity *pi_se, int flags) +{ +	BUG_ON(on_dl_rq(dl_se)); + +	/* +	 * If this is a wakeup or a new instance, the scheduling +	 * parameters of the task might need updating. Otherwise, +	 * we want a replenishment of its runtime. 
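inc_dl_deadline() and dec_dl_deadline() above maintain a small per-runqueue cache: earliest_dl.curr is the earliest queued deadline and earliest_dl.next the second earliest, with only the cheap cases updated in place and the rest falling back to a recomputation. A toy model of the insertion side of that bookkeeping; the recomputation is a plain scan standing in for next_deadline(), and all names and values are invented:

/* earliest_cache_demo.c -- toy earliest / next-earliest deadline cache. */
#include <stdio.h>
#include <stdint.h>

static uint64_t dls[16];
static int nr;
static uint64_t curr_dl, next_dl;	/* cached values, 0 == "none" */

static int before(uint64_t a, uint64_t b) { return (int64_t)(a - b) < 0; }

/* Scan standing in for next_deadline(): the second-earliest deadline. */
static uint64_t second_earliest(void)
{
	uint64_t best = 0, second = 0;
	for (int i = 0; i < nr; i++) {
		if (!best || before(dls[i], best)) {
			second = best;
			best = dls[i];
		} else if (!second || before(dls[i], second)) {
			second = dls[i];
		}
	}
	return second;
}

static void add_deadline(uint64_t dl)
{
	dls[nr++] = dl;

	if (curr_dl == 0 || before(dl, curr_dl)) {
		/* New earliest: the old earliest becomes the next one. */
		next_dl = curr_dl;
		curr_dl = dl;
	} else if (next_dl == 0 || before(dl, next_dl)) {
		/* Beaten by curr but earlier than next: recompute next. */
		next_dl = second_earliest();
	}
	printf("add %llu -> curr=%llu next=%llu\n",
	       (unsigned long long)dl, (unsigned long long)curr_dl,
	       (unsigned long long)next_dl);
}

int main(void)
{
	add_deadline(500);	/* curr=500 next=0   */
	add_deadline(300);	/* curr=300 next=500 */
	add_deadline(400);	/* curr=300 next=400 */
	add_deadline(900);	/* later than both: cache unchanged */
	return 0;
}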
+	 */ +	if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) +		replenish_dl_entity(dl_se, pi_se); +	else +		update_dl_entity(dl_se, pi_se); + +	__enqueue_dl_entity(dl_se); +} + +static void dequeue_dl_entity(struct sched_dl_entity *dl_se) +{ +	__dequeue_dl_entity(dl_se); +} + +static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ +	struct task_struct *pi_task = rt_mutex_get_top_task(p); +	struct sched_dl_entity *pi_se = &p->dl; + +	/* +	 * Use the scheduling parameters of the top pi-waiter +	 * task if we have one and its (relative) deadline is +	 * smaller than our one... OTW we keep our runtime and +	 * deadline. +	 */ +	if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) +		pi_se = &pi_task->dl; + +	/* +	 * If p is throttled, we do nothing. In fact, if it exhausted +	 * its budget it needs a replenishment and, since it now is on +	 * its rq, the bandwidth timer callback (which clearly has not +	 * run yet) will take care of this. +	 */ +	if (p->dl.dl_throttled) +		return; + +	enqueue_dl_entity(&p->dl, pi_se, flags); + +	if (!task_current(rq, p) && p->nr_cpus_allowed > 1) +		enqueue_pushable_dl_task(rq, p); +} + +static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ +	dequeue_dl_entity(&p->dl); +	dequeue_pushable_dl_task(rq, p); +} + +static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ +	update_curr_dl(rq); +	__dequeue_task_dl(rq, p, flags); +} + +/* + * Yield task semantic for -deadline tasks is: + * + *   get off from the CPU until our next instance, with + *   a new runtime. This is of little use now, since we + *   don't have a bandwidth reclaiming mechanism. Anyway, + *   bandwidth reclaiming is planned for the future, and + *   yield_task_dl will indicate that some spare budget + *   is available for other task instances to use it. + */ +static void yield_task_dl(struct rq *rq) +{ +	struct task_struct *p = rq->curr; + +	/* +	 * We make the task go to sleep until its current deadline by +	 * forcing its runtime to zero. This way, update_curr_dl() stops +	 * it and the bandwidth timer will wake it up and will give it +	 * new scheduling parameters (thanks to dl_yielded=1). +	 */ +	if (p->dl.runtime > 0) { +		rq->curr->dl.dl_yielded = 1; +		p->dl.runtime = 0; +	} +	update_curr_dl(rq); +} + +#ifdef CONFIG_SMP + +static int find_later_rq(struct task_struct *task); + +static int +select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) +{ +	struct task_struct *curr; +	struct rq *rq; + +	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) +		goto out; + +	rq = cpu_rq(cpu); + +	rcu_read_lock(); +	curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + +	/* +	 * If we are dealing with a -deadline task, we must +	 * decide where to wake it up. +	 * If it has a later deadline and the current task +	 * on this rq can't move (provided the waking task +	 * can!) we prefer to send it somewhere else. On the +	 * other hand, if it has a shorter deadline, we +	 * try to make it stay here, it might be important. +	 */ +	if (unlikely(dl_task(curr)) && +	    (curr->nr_cpus_allowed < 2 || +	     !dl_entity_preempt(&p->dl, &curr->dl)) && +	    (p->nr_cpus_allowed > 1)) { +		int target = find_later_rq(p); + +		if (target != -1) +			cpu = target; +	} +	rcu_read_unlock(); + +out: +	return cpu; +} + +static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) +{ +	/* +	 * Current can't be migrated, useless to reschedule, +	 * let's hope p can move out. 
+	 */ +	if (rq->curr->nr_cpus_allowed == 1 || +	    cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) +		return; + +	/* +	 * p is migratable, so let's not schedule it and +	 * see if it is pushed or pulled somewhere else. +	 */ +	if (p->nr_cpus_allowed != 1 && +	    cpudl_find(&rq->rd->cpudl, p, NULL) != -1) +		return; + +	resched_task(rq->curr); +} + +static int pull_dl_task(struct rq *this_rq); + +#endif /* CONFIG_SMP */ + +/* + * Only called when both the current and waking task are -deadline + * tasks. + */ +static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, +				  int flags) +{ +	if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { +		resched_task(rq->curr); +		return; +	} + +#ifdef CONFIG_SMP +	/* +	 * In the unlikely case current and p have the same deadline +	 * let us try to decide what's the best thing to do... +	 */ +	if ((p->dl.deadline == rq->curr->dl.deadline) && +	    !test_tsk_need_resched(rq->curr)) +		check_preempt_equal_dl(rq, p); +#endif /* CONFIG_SMP */ +} + +#ifdef CONFIG_SCHED_HRTICK +static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +{ +	s64 delta = p->dl.dl_runtime - p->dl.runtime; + +	if (delta > 10000) +		hrtick_start(rq, p->dl.runtime); +} +#endif + +static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, +						   struct dl_rq *dl_rq) +{ +	struct rb_node *left = dl_rq->rb_leftmost; + +	if (!left) +		return NULL; + +	return rb_entry(left, struct sched_dl_entity, rb_node); +} + +struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) +{ +	struct sched_dl_entity *dl_se; +	struct task_struct *p; +	struct dl_rq *dl_rq; + +	dl_rq = &rq->dl; + +	if (need_pull_dl_task(rq, prev)) { +		pull_dl_task(rq); +		/* +		 * pull_rt_task() can drop (and re-acquire) rq->lock; this +		 * means a stop task can slip in, in which case we need to +		 * re-start task selection. +		 */ +		if (rq->stop && rq->stop->on_rq) +			return RETRY_TASK; +	} + +	/* +	 * When prev is DL, we may throttle it in put_prev_task(). +	 * So, we update time before we check for dl_nr_running. +	 */ +	if (prev->sched_class == &dl_sched_class) +		update_curr_dl(rq); + +	if (unlikely(!dl_rq->dl_nr_running)) +		return NULL; + +	put_prev_task(rq, prev); + +	dl_se = pick_next_dl_entity(rq, dl_rq); +	BUG_ON(!dl_se); + +	p = dl_task_of(dl_se); +	p->se.exec_start = rq_clock_task(rq); + +	/* Running task will never be pushed. */ +       dequeue_pushable_dl_task(rq, p); + +#ifdef CONFIG_SCHED_HRTICK +	if (hrtick_enabled(rq)) +		start_hrtick_dl(rq, p); +#endif + +	set_post_schedule(rq); + +	return p; +} + +static void put_prev_task_dl(struct rq *rq, struct task_struct *p) +{ +	update_curr_dl(rq); + +	if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) +		enqueue_pushable_dl_task(rq, p); +} + +static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) +{ +	update_curr_dl(rq); + +#ifdef CONFIG_SCHED_HRTICK +	if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) +		start_hrtick_dl(rq, p); +#endif +} + +static void task_fork_dl(struct task_struct *p) +{ +	/* +	 * SCHED_DEADLINE tasks cannot fork and this is achieved through +	 * sched_fork() +	 */ +} + +static void task_dead_dl(struct task_struct *p) +{ +	struct hrtimer *timer = &p->dl.dl_timer; +	struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + +	/* +	 * Since we are TASK_DEAD we won't slip out of the domain! 
+	 */ +	raw_spin_lock_irq(&dl_b->lock); +	dl_b->total_bw -= p->dl.dl_bw; +	raw_spin_unlock_irq(&dl_b->lock); + +	hrtimer_cancel(timer); +} + +static void set_curr_task_dl(struct rq *rq) +{ +	struct task_struct *p = rq->curr; + +	p->se.exec_start = rq_clock_task(rq); + +	/* You can't push away the running task */ +	dequeue_pushable_dl_task(rq, p); +} + +#ifdef CONFIG_SMP + +/* Only try algorithms three times */ +#define DL_MAX_TRIES 3 + +static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) +{ +	if (!task_running(rq, p) && +	    (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && +	    (p->nr_cpus_allowed > 1)) +		return 1; + +	return 0; +} + +/* Returns the second earliest -deadline task, NULL otherwise */ +static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu) +{ +	struct rb_node *next_node = rq->dl.rb_leftmost; +	struct sched_dl_entity *dl_se; +	struct task_struct *p = NULL; + +next_node: +	next_node = rb_next(next_node); +	if (next_node) { +		dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node); +		p = dl_task_of(dl_se); + +		if (pick_dl_task(rq, p, cpu)) +			return p; + +		goto next_node; +	} + +	return NULL; +} + +static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); + +static int find_later_rq(struct task_struct *task) +{ +	struct sched_domain *sd; +	struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); +	int this_cpu = smp_processor_id(); +	int best_cpu, cpu = task_cpu(task); + +	/* Make sure the mask is initialized first */ +	if (unlikely(!later_mask)) +		return -1; + +	if (task->nr_cpus_allowed == 1) +		return -1; + +	best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, +			task, later_mask); +	if (best_cpu == -1) +		return -1; + +	/* +	 * If we are here, some target has been found, +	 * the most suitable of which is cached in best_cpu. +	 * This is, among the runqueues where the current tasks +	 * have later deadlines than the task's one, the rq +	 * with the latest possible one. +	 * +	 * Now we check how well this matches with task's +	 * affinity and system topology. +	 * +	 * The last cpu where the task run is our first +	 * guess, since it is most likely cache-hot there. +	 */ +	if (cpumask_test_cpu(cpu, later_mask)) +		return cpu; +	/* +	 * Check if this_cpu is to be skipped (i.e., it is +	 * not in the mask) or not. +	 */ +	if (!cpumask_test_cpu(this_cpu, later_mask)) +		this_cpu = -1; + +	rcu_read_lock(); +	for_each_domain(cpu, sd) { +		if (sd->flags & SD_WAKE_AFFINE) { + +			/* +			 * If possible, preempting this_cpu is +			 * cheaper than migrating. +			 */ +			if (this_cpu != -1 && +			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { +				rcu_read_unlock(); +				return this_cpu; +			} + +			/* +			 * Last chance: if best_cpu is valid and is +			 * in the mask, that becomes our choice. +			 */ +			if (best_cpu < nr_cpu_ids && +			    cpumask_test_cpu(best_cpu, sched_domain_span(sd))) { +				rcu_read_unlock(); +				return best_cpu; +			} +		} +	} +	rcu_read_unlock(); + +	/* +	 * At this point, all our guesses failed, we just return +	 * 'something', and let the caller sort the things out. 
+	 */ +	if (this_cpu != -1) +		return this_cpu; + +	cpu = cpumask_any(later_mask); +	if (cpu < nr_cpu_ids) +		return cpu; + +	return -1; +} + +/* Locks the rq it finds */ +static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) +{ +	struct rq *later_rq = NULL; +	int tries; +	int cpu; + +	for (tries = 0; tries < DL_MAX_TRIES; tries++) { +		cpu = find_later_rq(task); + +		if ((cpu == -1) || (cpu == rq->cpu)) +			break; + +		later_rq = cpu_rq(cpu); + +		/* Retry if something changed. */ +		if (double_lock_balance(rq, later_rq)) { +			if (unlikely(task_rq(task) != rq || +				     !cpumask_test_cpu(later_rq->cpu, +				                       &task->cpus_allowed) || +				     task_running(rq, task) || !task->on_rq)) { +				double_unlock_balance(rq, later_rq); +				later_rq = NULL; +				break; +			} +		} + +		/* +		 * If the rq we found has no -deadline task, or +		 * its earliest one has a later deadline than our +		 * task, the rq is a good one. +		 */ +		if (!later_rq->dl.dl_nr_running || +		    dl_time_before(task->dl.deadline, +				   later_rq->dl.earliest_dl.curr)) +			break; + +		/* Otherwise we try again. */ +		double_unlock_balance(rq, later_rq); +		later_rq = NULL; +	} + +	return later_rq; +} + +static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) +{ +	struct task_struct *p; + +	if (!has_pushable_dl_tasks(rq)) +		return NULL; + +	p = rb_entry(rq->dl.pushable_dl_tasks_leftmost, +		     struct task_struct, pushable_dl_tasks); + +	BUG_ON(rq->cpu != task_cpu(p)); +	BUG_ON(task_current(rq, p)); +	BUG_ON(p->nr_cpus_allowed <= 1); + +	BUG_ON(!p->on_rq); +	BUG_ON(!dl_task(p)); + +	return p; +} + +/* + * See if the non running -deadline tasks on this rq + * can be sent to some other CPU where they can preempt + * and start executing. + */ +static int push_dl_task(struct rq *rq) +{ +	struct task_struct *next_task; +	struct rq *later_rq; + +	if (!rq->dl.overloaded) +		return 0; + +	next_task = pick_next_pushable_dl_task(rq); +	if (!next_task) +		return 0; + +retry: +	if (unlikely(next_task == rq->curr)) { +		WARN_ON(1); +		return 0; +	} + +	/* +	 * If next_task preempts rq->curr, and rq->curr +	 * can move away, it makes sense to just reschedule +	 * without going further in pushing next_task. +	 */ +	if (dl_task(rq->curr) && +	    dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && +	    rq->curr->nr_cpus_allowed > 1) { +		resched_task(rq->curr); +		return 0; +	} + +	/* We might release rq lock */ +	get_task_struct(next_task); + +	/* Will lock the rq it'll find */ +	later_rq = find_lock_later_rq(next_task, rq); +	if (!later_rq) { +		struct task_struct *task; + +		/* +		 * We must check all this again, since +		 * find_lock_later_rq releases rq->lock and it is +		 * then possible that next_task has migrated. +		 */ +		task = pick_next_pushable_dl_task(rq); +		if (task_cpu(next_task) == rq->cpu && task == next_task) { +			/* +			 * The task is still there. We don't try +			 * again, some other cpu will pull it when ready. 
+			 */ +			dequeue_pushable_dl_task(rq, next_task); +			goto out; +		} + +		if (!task) +			/* No more tasks */ +			goto out; + +		put_task_struct(next_task); +		next_task = task; +		goto retry; +	} + +	deactivate_task(rq, next_task, 0); +	set_task_cpu(next_task, later_rq->cpu); +	activate_task(later_rq, next_task, 0); + +	resched_task(later_rq->curr); + +	double_unlock_balance(rq, later_rq); + +out: +	put_task_struct(next_task); + +	return 1; +} + +static void push_dl_tasks(struct rq *rq) +{ +	/* Terminates as it moves a -deadline task */ +	while (push_dl_task(rq)) +		; +} + +static int pull_dl_task(struct rq *this_rq) +{ +	int this_cpu = this_rq->cpu, ret = 0, cpu; +	struct task_struct *p; +	struct rq *src_rq; +	u64 dmin = LONG_MAX; + +	if (likely(!dl_overloaded(this_rq))) +		return 0; + +	/* +	 * Match the barrier from dl_set_overloaded; this guarantees that if we +	 * see overloaded we must also see the dlo_mask bit. +	 */ +	smp_rmb(); + +	for_each_cpu(cpu, this_rq->rd->dlo_mask) { +		if (this_cpu == cpu) +			continue; + +		src_rq = cpu_rq(cpu); + +		/* +		 * It looks racy, and it is! However, as in sched_rt.c, +		 * we are fine with this. +		 */ +		if (this_rq->dl.dl_nr_running && +		    dl_time_before(this_rq->dl.earliest_dl.curr, +				   src_rq->dl.earliest_dl.next)) +			continue; + +		/* Might drop this_rq->lock */ +		double_lock_balance(this_rq, src_rq); + +		/* +		 * If there are no more pullable tasks on the +		 * rq, we're done with it. +		 */ +		if (src_rq->dl.dl_nr_running <= 1) +			goto skip; + +		p = pick_next_earliest_dl_task(src_rq, this_cpu); + +		/* +		 * We found a task to be pulled if: +		 *  - it preempts our current (if there's one), +		 *  - it will preempt the last one we pulled (if any). +		 */ +		if (p && dl_time_before(p->dl.deadline, dmin) && +		    (!this_rq->dl.dl_nr_running || +		     dl_time_before(p->dl.deadline, +				    this_rq->dl.earliest_dl.curr))) { +			WARN_ON(p == src_rq->curr); +			WARN_ON(!p->on_rq); + +			/* +			 * Then we pull iff p actually has an earlier +			 * deadline than the current task of its runqueue. +			 */ +			if (dl_time_before(p->dl.deadline, +					   src_rq->curr->dl.deadline)) +				goto skip; + +			ret = 1; + +			deactivate_task(src_rq, p, 0); +			set_task_cpu(p, this_cpu); +			activate_task(this_rq, p, 0); +			dmin = p->dl.deadline; + +			/* Is there any other task even earlier? */ +		} +skip: +		double_unlock_balance(this_rq, src_rq); +	} + +	return ret; +} + +static void post_schedule_dl(struct rq *rq) +{ +	push_dl_tasks(rq); +} + +/* + * Since the task is not running and a reschedule is not going to happen + * anytime soon on its runqueue, we try pushing it away now. + */ +static void task_woken_dl(struct rq *rq, struct task_struct *p) +{ +	if (!task_running(rq, p) && +	    !test_tsk_need_resched(rq->curr) && +	    has_pushable_dl_tasks(rq) && +	    p->nr_cpus_allowed > 1 && +	    dl_task(rq->curr) && +	    (rq->curr->nr_cpus_allowed < 2 || +	     dl_entity_preempt(&rq->curr->dl, &p->dl))) { +		push_dl_tasks(rq); +	} +} + +static void set_cpus_allowed_dl(struct task_struct *p, +				const struct cpumask *new_mask) +{ +	struct rq *rq; +	int weight; + +	BUG_ON(!dl_task(p)); + +	/* +	 * Update only if the task is actually running (i.e., +	 * it is on the rq AND it is not throttled). +	 */ +	if (!on_dl_rq(&p->dl)) +		return; + +	weight = cpumask_weight(new_mask); + +	/* +	 * Only update if the process changes its state from whether it +	 * can migrate or not. 
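+	 * For instance (illustrative): shrinking the affinity mask from four +	 * CPUs to one flips the task from migratable to pinned, so the +	 * weight <= 1 branch below decrements dl_nr_migratory; widening the +	 * mask again takes the other branch and increments it.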
+	 */ +	if ((p->nr_cpus_allowed > 1) == (weight > 1)) +		return; + +	rq = task_rq(p); + +	/* +	 * The process used to be able to migrate OR it can now migrate +	 */ +	if (weight <= 1) { +		if (!task_current(rq, p)) +			dequeue_pushable_dl_task(rq, p); +		BUG_ON(!rq->dl.dl_nr_migratory); +		rq->dl.dl_nr_migratory--; +	} else { +		if (!task_current(rq, p)) +			enqueue_pushable_dl_task(rq, p); +		rq->dl.dl_nr_migratory++; +	} + +	update_dl_migration(&rq->dl); +} + +/* Assumes rq->lock is held */ +static void rq_online_dl(struct rq *rq) +{ +	if (rq->dl.overloaded) +		dl_set_overload(rq); + +	if (rq->dl.dl_nr_running > 0) +		cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); +} + +/* Assumes rq->lock is held */ +static void rq_offline_dl(struct rq *rq) +{ +	if (rq->dl.overloaded) +		dl_clear_overload(rq); + +	cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); +} + +void init_sched_dl_class(void) +{ +	unsigned int i; + +	for_each_possible_cpu(i) +		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i), +					GFP_KERNEL, cpu_to_node(i)); +} + +#endif /* CONFIG_SMP */ + +static void switched_from_dl(struct rq *rq, struct task_struct *p) +{ +	if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) +		hrtimer_try_to_cancel(&p->dl.dl_timer); + +#ifdef CONFIG_SMP +	/* +	 * Since this might be the only -deadline task on the rq, +	 * this is the right place to try to pull some other one +	 * from an overloaded cpu, if any. +	 */ +	if (!rq->dl.dl_nr_running) +		pull_dl_task(rq); +#endif +} + +/* + * When switching to -deadline, we may overload the rq, then + * we try to push someone off, if possible. + */ +static void switched_to_dl(struct rq *rq, struct task_struct *p) +{ +	int check_resched = 1; + +	/* +	 * If p is throttled, don't consider the possibility +	 * of preempting rq->curr, the check will be done right +	 * after its runtime gets replenished. +	 */ +	if (unlikely(p->dl.dl_throttled)) +		return; + +	if (p->on_rq && rq->curr != p) { +#ifdef CONFIG_SMP +		if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) +			/* Only reschedule if pushing failed */ +			check_resched = 0; +#endif /* CONFIG_SMP */ +		if (check_resched && task_has_dl_policy(rq->curr)) +			check_preempt_curr_dl(rq, p, 0); +	} +} + +/* + * If the scheduling parameters of a -deadline task changed, + * a push or pull operation might be needed. + */ +static void prio_changed_dl(struct rq *rq, struct task_struct *p, +			    int oldprio) +{ +	if (p->on_rq || rq->curr == p) { +#ifdef CONFIG_SMP +		/* +		 * This might be too much, but unfortunately +		 * we don't have the old deadline value, and +		 * we can't argue if the task is increasing +		 * or lowering its prio, so... +		 */ +		if (!rq->dl.overloaded) +			pull_dl_task(rq); + +		/* +		 * If we now have an earlier deadline task than p, +		 * then reschedule, provided p is still on this +		 * runqueue. +		 */ +		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && +		    rq->curr == p) +			resched_task(p); +#else +		/* +		 * Again, we don't know if p has an earlier +		 * or later deadline, so let's blindly set a +		 * (maybe not needed) rescheduling point. 
+		 */ +		resched_task(p); +#endif /* CONFIG_SMP */ +	} else +		switched_to_dl(rq, p); +} + +const struct sched_class dl_sched_class = { +	.next			= &rt_sched_class, +	.enqueue_task		= enqueue_task_dl, +	.dequeue_task		= dequeue_task_dl, +	.yield_task		= yield_task_dl, + +	.check_preempt_curr	= check_preempt_curr_dl, + +	.pick_next_task		= pick_next_task_dl, +	.put_prev_task		= put_prev_task_dl, + +#ifdef CONFIG_SMP +	.select_task_rq		= select_task_rq_dl, +	.set_cpus_allowed       = set_cpus_allowed_dl, +	.rq_online              = rq_online_dl, +	.rq_offline             = rq_offline_dl, +	.post_schedule		= post_schedule_dl, +	.task_woken		= task_woken_dl, +#endif + +	.set_curr_task		= set_curr_task_dl, +	.task_tick		= task_tick_dl, +	.task_fork              = task_fork_dl, +	.task_dead		= task_dead_dl, + +	.prio_changed           = prio_changed_dl, +	.switched_from		= switched_from_dl, +	.switched_to		= switched_to_dl, +}; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 196559994f7..627b3c34b82 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -15,6 +15,7 @@  #include <linux/seq_file.h>  #include <linux/kallsyms.h>  #include <linux/utsname.h> +#include <linux/mempolicy.h>  #include "sched.h" @@ -110,8 +111,7 @@ static char *task_group_path(struct task_group *tg)  	if (autogroup_path(tg, group_path, PATH_MAX))  		return group_path; -	cgroup_path(tg->css.cgroup, group_path, PATH_MAX); -	return group_path; +	return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);  }  #endif @@ -137,6 +137,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)  	SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",  		0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);  #endif +#ifdef CONFIG_NUMA_BALANCING +	SEQ_printf(m, " %d", task_node(p)); +#endif  #ifdef CONFIG_CGROUP_SCHED  	SEQ_printf(m, " %s", task_group_path(task_group(p)));  #endif @@ -159,7 +162,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)  	read_lock_irqsave(&tasklist_lock, flags);  	do_each_thread(g, p) { -		if (!p->on_rq || task_cpu(p) != rq_cpu) +		if (task_cpu(p) != rq_cpu)  			continue;  		print_task(m, rq, p); @@ -225,6 +228,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)  			atomic_read(&cfs_rq->tg->runnable_avg));  #endif  #endif +#ifdef CONFIG_CFS_BANDWIDTH +	SEQ_printf(m, "  .%-30s: %d\n", "tg->cfs_bandwidth.timer_active", +			cfs_rq->tg->cfs_bandwidth.timer_active); +	SEQ_printf(m, "  .%-30s: %d\n", "throttled", +			cfs_rq->throttled); +	SEQ_printf(m, "  .%-30s: %d\n", "throttle_count", +			cfs_rq->throttle_count); +#endif  #ifdef CONFIG_FAIR_GROUP_SCHED  	print_cfs_group_stats(m, cpu, cfs_rq->tg); @@ -309,6 +320,7 @@ do {									\  	P(sched_goidle);  #ifdef CONFIG_SMP  	P64(avg_idle); +	P64(max_idle_balance_cost);  #endif  	P(ttwu_count); @@ -345,7 +357,7 @@ static void sched_debug_header(struct seq_file *m)  	cpu_clk = local_clock();  	local_irq_restore(flags); -	SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", +	SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",  		init_utsname()->release,  		(int)strcspn(init_utsname()->version, " "),  		init_utsname()->version); @@ -359,7 +371,7 @@ static void sched_debug_header(struct seq_file *m)  	PN(cpu_clk);  	P(jiffies);  #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -	P(sched_clock_stable); +	P(sched_clock_stable());  #endif  #undef PN  #undef P @@ -488,6 +500,56 @@ static int __init init_sched_debug_procfs(void)  __initcall(init_sched_debug_procfs); +#define __P(F) \ +	SEQ_printf(m, 
"%-45s:%21Ld\n", #F, (long long)F) +#define P(F) \ +	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) +#define __PN(F) \ +	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN(F) \ +	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) + + +static void sched_show_numa(struct task_struct *p, struct seq_file *m) +{ +#ifdef CONFIG_NUMA_BALANCING +	struct mempolicy *pol; +	int node, i; + +	if (p->mm) +		P(mm->numa_scan_seq); + +	task_lock(p); +	pol = p->mempolicy; +	if (pol && !(pol->flags & MPOL_F_MORON)) +		pol = NULL; +	mpol_get(pol); +	task_unlock(p); + +	SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0)); + +	for_each_online_node(node) { +		for (i = 0; i < 2; i++) { +			unsigned long nr_faults = -1; +			int cpu_current, home_node; + +			if (p->numa_faults_memory) +				nr_faults = p->numa_faults_memory[2*node + i]; + +			cpu_current = !i ? (task_node(p) == node) : +				(pol && node_isset(node, pol->v.nodes)); + +			home_node = (p->numa_preferred_nid == node); + +			SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n", +				i, node, cpu_current, home_node, nr_faults); +		} +	} + +	mpol_put(pol); +#endif +} +  void proc_sched_show_task(struct task_struct *p, struct seq_file *m)  {  	unsigned long nr_switches; @@ -546,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)  		avg_atom = p->se.sum_exec_runtime;  		if (nr_switches) -			do_div(avg_atom, nr_switches); +			avg_atom = div64_ul(avg_atom, nr_switches);  		else  			avg_atom = -1LL; @@ -591,6 +653,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)  		SEQ_printf(m, "%-45s:%21Ld\n",  			   "clock-delta", (long long)(t1-t0));  	} + +	sched_show_numa(p, m);  }  void proc_sched_set_task(struct task_struct *p) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c70201fbc6..fea7d3335e1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -178,59 +178,61 @@ void sched_init_granularity(void)  	update_sysctl();  } -#if BITS_PER_LONG == 32 -# define WMULT_CONST	(~0UL) -#else -# define WMULT_CONST	(1UL << 32) -#endif - +#define WMULT_CONST	(~0U)  #define WMULT_SHIFT	32 -/* - * Shift right and round: - */ -#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) +static void __update_inv_weight(struct load_weight *lw) +{ +	unsigned long w; + +	if (likely(lw->inv_weight)) +		return; + +	w = scale_load_down(lw->weight); + +	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) +		lw->inv_weight = 1; +	else if (unlikely(!w)) +		lw->inv_weight = WMULT_CONST; +	else +		lw->inv_weight = WMULT_CONST / w; +}  /* - * delta *= weight / lw + * delta_exec * weight / lw.weight + *   OR + * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT + * + * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case + * we're guaranteed shift stays positive because inv_weight is guaranteed to + * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22. + * + * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus + * weight/lw.weight <= 1, and therefore our shift will also be positive.   */ -static unsigned long -calc_delta_mine(unsigned long delta_exec, unsigned long weight, -		struct load_weight *lw) +static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)  { -	u64 tmp; +	u64 fact = scale_load_down(weight); +	int shift = WMULT_SHIFT; -	/* -	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched -	 * entities since MIN_SHARES = 2. 
Treat weight as 1 if less than -	 * 2^SCHED_LOAD_RESOLUTION. -	 */ -	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) -		tmp = (u64)delta_exec * scale_load_down(weight); -	else -		tmp = (u64)delta_exec; +	__update_inv_weight(lw); -	if (!lw->inv_weight) { -		unsigned long w = scale_load_down(lw->weight); - -		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) -			lw->inv_weight = 1; -		else if (unlikely(!w)) -			lw->inv_weight = WMULT_CONST; -		else -			lw->inv_weight = WMULT_CONST / w; +	if (unlikely(fact >> 32)) { +		while (fact >> 32) { +			fact >>= 1; +			shift--; +		}  	} -	/* -	 * Check whether we'd overflow the 64-bit multiplication: -	 */ -	if (unlikely(tmp > WMULT_CONST)) -		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, -			WMULT_SHIFT/2); -	else -		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); +	/* hint to use a 32x32->64 mul */ +	fact = (u64)(u32)fact * lw->inv_weight; -	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); +	while (fact >> 32) { +		fact >>= 1; +		shift--; +	} + +	return mul_u64_u32_shr(delta_exec, fact, shift);  } @@ -320,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)  	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)  /* Do the two (enqueued) entities belong to the same group ? */ -static inline int +static inline struct cfs_rq *  is_same_group(struct sched_entity *se, struct sched_entity *pse)  {  	if (se->cfs_rq == pse->cfs_rq) -		return 1; +		return se->cfs_rq; -	return 0; +	return NULL;  }  static inline struct sched_entity *parent_entity(struct sched_entity *se) @@ -334,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)  	return se->parent;  } -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ -	int depth = 0; - -	for_each_sched_entity(se) -		depth++; - -	return depth; -} -  static void  find_matching_se(struct sched_entity **se, struct sched_entity **pse)  { @@ -358,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)  	 */  	/* First walk up until both entities are at same depth */ -	se_depth = depth_se(*se); -	pse_depth = depth_se(*pse); +	se_depth = (*se)->depth; +	pse_depth = (*pse)->depth;  	while (se_depth > pse_depth) {  		se_depth--; @@ -424,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)  #define for_each_leaf_cfs_rq(rq, cfs_rq) \  		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) -static inline int -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ -	return 1; -} -  static inline struct sched_entity *parent_entity(struct sched_entity *se)  {  	return NULL; @@ -443,7 +428,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)  #endif	/* CONFIG_FAIR_GROUP_SCHED */  static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);  /**************************************************************   * Scheduling class tree data structure manipulation methods: @@ -612,11 +597,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,  /*   * delta /= w   */ -static inline unsigned long -calc_delta_fair(unsigned long delta, struct sched_entity *se) +static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)  {  	if (unlikely(se->load.weight != NICE_0_LOAD)) -		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); +		delta = __calc_delta(delta, 
NICE_0_LOAD, &se->load);  	return delta;  } @@ -665,7 +649,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)  			update_load_add(&lw, se->load.weight);  			load = &lw;  		} -		slice = calc_delta_mine(slice, se->load.weight, load); +		slice = __calc_delta(slice, se->load.weight, load);  	}  	return slice;  } @@ -681,6 +665,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)  }  #ifdef CONFIG_SMP +static unsigned long task_h_load(struct task_struct *p); +  static inline void __update_task_entity_contrib(struct sched_entity *se);  /* Give new task start runnable values to heavy its load in infant time */ @@ -701,47 +687,32 @@ void init_task_runnable_average(struct task_struct *p)  #endif  /* - * Update the current task's runtime statistics. Skip current tasks that - * are not in our scheduling class. + * Update the current task's runtime statistics.   */ -static inline void -__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, -	      unsigned long delta_exec) -{ -	unsigned long delta_exec_weighted; - -	schedstat_set(curr->statistics.exec_max, -		      max((u64)delta_exec, curr->statistics.exec_max)); - -	curr->sum_exec_runtime += delta_exec; -	schedstat_add(cfs_rq, exec_clock, delta_exec); -	delta_exec_weighted = calc_delta_fair(delta_exec, curr); - -	curr->vruntime += delta_exec_weighted; -	update_min_vruntime(cfs_rq); -} -  static void update_curr(struct cfs_rq *cfs_rq)  {  	struct sched_entity *curr = cfs_rq->curr;  	u64 now = rq_clock_task(rq_of(cfs_rq)); -	unsigned long delta_exec; +	u64 delta_exec;  	if (unlikely(!curr))  		return; -	/* -	 * Get the amount of time the current task was running -	 * since the last time we changed load (this cannot -	 * overflow on 32 bits): -	 */ -	delta_exec = (unsigned long)(now - curr->exec_start); -	if (!delta_exec) +	delta_exec = now - curr->exec_start; +	if (unlikely((s64)delta_exec <= 0))  		return; -	__update_curr(cfs_rq, curr, delta_exec);  	curr->exec_start = now; +	schedstat_set(curr->statistics.exec_max, +		      max(delta_exec, curr->statistics.exec_max)); + +	curr->sum_exec_runtime += delta_exec; +	schedstat_add(cfs_rq, exec_clock, delta_exec); + +	curr->vruntime += calc_delta_fair(delta_exec, curr); +	update_min_vruntime(cfs_rq); +  	if (entity_is_task(curr)) {  		struct task_struct *curtask = task_of(curr); @@ -818,11 +789,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)  #ifdef CONFIG_NUMA_BALANCING  /* - * numa task sample period in ms + * Approximate time to scan a full NUMA task in ms. The task scan period is + * calculated based on the tasks virtual memory size and + * numa_balancing_scan_size.   
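+ * For example (illustrative numbers): with the default 256MB scan size, a + * task with about 2.5GB of resident memory is split into 10 scan windows, + * so the 1000ms minimum period amounts to scanning roughly one 256MB window + * every 100ms (see task_nr_scan_windows() and task_scan_min() below).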
*/ -unsigned int sysctl_numa_balancing_scan_period_min = 100; -unsigned int sysctl_numa_balancing_scan_period_max = 100*50; -unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; +unsigned int sysctl_numa_balancing_scan_period_min = 1000; +unsigned int sysctl_numa_balancing_scan_period_max = 60000;  /* Portion of address space to scan in MB */  unsigned int sysctl_numa_balancing_scan_size = 256; @@ -830,41 +802,1057 @@ unsigned int sysctl_numa_balancing_scan_size = 256;  /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */  unsigned int sysctl_numa_balancing_scan_delay = 1000; -static void task_numa_placement(struct task_struct *p) +static unsigned int task_nr_scan_windows(struct task_struct *p) +{ +	unsigned long rss = 0; +	unsigned long nr_scan_pages; + +	/* +	 * Calculations based on RSS as non-present and empty pages are skipped +	 * by the PTE scanner and NUMA hinting faults should be trapped based +	 * on resident pages +	 */ +	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); +	rss = get_mm_rss(p->mm); +	if (!rss) +		rss = nr_scan_pages; + +	rss = round_up(rss, nr_scan_pages); +	return rss / nr_scan_pages; +} + +/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ +#define MAX_SCAN_WINDOW 2560 + +static unsigned int task_scan_min(struct task_struct *p) +{ +	unsigned int scan, floor; +	unsigned int windows = 1; + +	if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) +		windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; +	floor = 1000 / windows; + +	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); +	return max_t(unsigned int, floor, scan); +} + +static unsigned int task_scan_max(struct task_struct *p) +{ +	unsigned int smin = task_scan_min(p); +	unsigned int smax; + +	/* Watch for min being lower than max due to floor calculations */ +	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); +	return max(smin, smax); +} + +static void account_numa_enqueue(struct rq *rq, struct task_struct *p) +{ +	rq->nr_numa_running += (p->numa_preferred_nid != -1); +	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); +} + +static void account_numa_dequeue(struct rq *rq, struct task_struct *p)  { -	int seq; +	rq->nr_numa_running -= (p->numa_preferred_nid != -1); +	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); +} + +struct numa_group { +	atomic_t refcount; + +	spinlock_t lock; /* nr_tasks, tasks */ +	int nr_tasks; +	pid_t gid; +	struct list_head task_list; + +	struct rcu_head rcu; +	nodemask_t active_nodes; +	unsigned long total_faults; +	/* +	 * Faults_cpu is used to decide whether memory should move +	 * towards the CPU. As a consequence, these stats are weighted +	 * more by CPU use than by memory faults. +	 */ +	unsigned long *faults_cpu; +	unsigned long faults[0]; +}; + +/* Shared or private faults. */ +#define NR_NUMA_HINT_FAULT_TYPES 2 + +/* Memory and CPU locality */ +#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2) + +/* Averaged statistics, and temporary buffers. */ +#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2) -	if (!p->mm)	/* for example, ksmd faulting in a user's mm */ +pid_t task_numa_group_id(struct task_struct *p) +{ +	return p->numa_group ? 
p->numa_group->gid : 0; +} + +static inline int task_faults_idx(int nid, int priv) +{ +	return NR_NUMA_HINT_FAULT_TYPES * nid + priv; +} + +static inline unsigned long task_faults(struct task_struct *p, int nid) +{ +	if (!p->numa_faults_memory) +		return 0; + +	return p->numa_faults_memory[task_faults_idx(nid, 0)] + +		p->numa_faults_memory[task_faults_idx(nid, 1)]; +} + +static inline unsigned long group_faults(struct task_struct *p, int nid) +{ +	if (!p->numa_group) +		return 0; + +	return p->numa_group->faults[task_faults_idx(nid, 0)] + +		p->numa_group->faults[task_faults_idx(nid, 1)]; +} + +static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) +{ +	return group->faults_cpu[task_faults_idx(nid, 0)] + +		group->faults_cpu[task_faults_idx(nid, 1)]; +} + +/* + * These return the fraction of accesses done by a particular task, or + * task group, on a particular numa node.  The group weight is given a + * larger multiplier, in order to group tasks together that are almost + * evenly spread out between numa nodes. + */ +static inline unsigned long task_weight(struct task_struct *p, int nid) +{ +	unsigned long total_faults; + +	if (!p->numa_faults_memory) +		return 0; + +	total_faults = p->total_numa_faults; + +	if (!total_faults) +		return 0; + +	return 1000 * task_faults(p, nid) / total_faults; +} + +static inline unsigned long group_weight(struct task_struct *p, int nid) +{ +	if (!p->numa_group || !p->numa_group->total_faults) +		return 0; + +	return 1000 * group_faults(p, nid) / p->numa_group->total_faults; +} + +bool should_numa_migrate_memory(struct task_struct *p, struct page * page, +				int src_nid, int dst_cpu) +{ +	struct numa_group *ng = p->numa_group; +	int dst_nid = cpu_to_node(dst_cpu); +	int last_cpupid, this_cpupid; + +	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); + +	/* +	 * Multi-stage node selection is used in conjunction with a periodic +	 * migration fault to build a temporal task<->page relation. By using +	 * a two-stage filter we remove short/unlikely relations. +	 * +	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate +	 * a task's usage of a particular page (n_p) per total usage of this +	 * page (n_t) (in a given time-span) to a probability. +	 * +	 * Our periodic faults will sample this probability and getting the +	 * same result twice in a row, given these samples are fully +	 * independent, is then given by P(n)^2, provided our sample period +	 * is sufficiently short compared to the usage pattern. +	 * +	 * This quadric squishes small probabilities, making it less likely we +	 * act on an unlikely task<->page relation. +	 */ +	last_cpupid = page_cpupid_xchg_last(page, this_cpupid); +	if (!cpupid_pid_unset(last_cpupid) && +				cpupid_to_nid(last_cpupid) != dst_nid) +		return false; + +	/* Always allow migrate on private faults */ +	if (cpupid_match_pid(p, last_cpupid)) +		return true; + +	/* A shared fault, but p->numa_group has not been set up yet. */ +	if (!ng) +		return true; + +	/* +	 * Do not migrate if the destination is not a node that +	 * is actively used by this numa group. +	 */ +	if (!node_isset(dst_nid, ng->active_nodes)) +		return false; + +	/* +	 * Source is a node that is not actively used by this +	 * numa group, while the destination is. Migrate. +	 */ +	if (!node_isset(src_nid, ng->active_nodes)) +		return true; + +	/* +	 * Both source and destination are nodes in active +	 * use by this numa group. 
Maximize memory bandwidth +	 * by migrating from more heavily used groups, to less +	 * heavily used ones, spreading the load around. +	 * Use a 1/4 hysteresis to avoid spurious page movement. +	 */ +	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); +} + +static unsigned long weighted_cpuload(const int cpu); +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static unsigned long capacity_of(int cpu); +static long effective_load(struct task_group *tg, int cpu, long wl, long wg); + +/* Cached statistics for all CPUs within a node */ +struct numa_stats { +	unsigned long nr_running; +	unsigned long load; + +	/* Total compute capacity of CPUs on a node */ +	unsigned long compute_capacity; + +	/* Approximate capacity in terms of runnable tasks on a node */ +	unsigned long task_capacity; +	int has_free_capacity; +}; + +/* + * XXX borrowed from update_sg_lb_stats + */ +static void update_numa_stats(struct numa_stats *ns, int nid) +{ +	int cpu, cpus = 0; + +	memset(ns, 0, sizeof(*ns)); +	for_each_cpu(cpu, cpumask_of_node(nid)) { +		struct rq *rq = cpu_rq(cpu); + +		ns->nr_running += rq->nr_running; +		ns->load += weighted_cpuload(cpu); +		ns->compute_capacity += capacity_of(cpu); + +		cpus++; +	} + +	/* +	 * If we raced with hotplug and there are no CPUs left in our mask +	 * the @ns structure is NULL'ed and task_numa_compare() will +	 * not find this node attractive. +	 * +	 * We'll either bail at !has_free_capacity, or we'll detect a huge +	 * imbalance and bail there. +	 */ +	if (!cpus)  		return; + +	ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity; +	ns->task_capacity = +		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); +	ns->has_free_capacity = (ns->nr_running < ns->task_capacity); +} + +struct task_numa_env { +	struct task_struct *p; + +	int src_cpu, src_nid; +	int dst_cpu, dst_nid; + +	struct numa_stats src_stats, dst_stats; + +	int imbalance_pct; + +	struct task_struct *best_task; +	long best_imp; +	int best_cpu; +}; + +static void task_numa_assign(struct task_numa_env *env, +			     struct task_struct *p, long imp) +{ +	if (env->best_task) +		put_task_struct(env->best_task); +	if (p) +		get_task_struct(p); + +	env->best_task = p; +	env->best_imp = imp; +	env->best_cpu = env->dst_cpu; +} + +static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, +				long src_load, long dst_load, +				struct task_numa_env *env) +{ +	long imb, old_imb; + +	/* We care about the slope of the imbalance, not the direction. */ +	if (dst_load < src_load) +		swap(dst_load, src_load); + +	/* Is the difference below the threshold? */ +	imb = dst_load * 100 - src_load * env->imbalance_pct; +	if (imb <= 0) +		return false; + +	/* +	 * The imbalance is above the allowed threshold. +	 * Compare it with the old imbalance. +	 */ +	if (orig_dst_load < orig_src_load) +		swap(orig_dst_load, orig_src_load); + +	old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + +	/* Would this change make things worse? 
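+	 * (Illustrative numbers: with imbalance_pct == 112, src_load == 1000 +	 * and dst_load == 1150 give imb = 115000 - 112000 = 3000; the +	 * candidate move is rejected only when this exceeds old_imb computed +	 * the same way from the pre-move loads.)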
*/ +	return (imb > old_imb); +} + +/* + * This checks if the overall compute and NUMA accesses of the system would + * be improved if the source task was migrated to the target dst_cpu, taking + * into account that it might be best to exchange the task running on the + * dst_cpu with the source task. + */ +static void task_numa_compare(struct task_numa_env *env, +			      long taskimp, long groupimp) +{ +	struct rq *src_rq = cpu_rq(env->src_cpu); +	struct rq *dst_rq = cpu_rq(env->dst_cpu); +	struct task_struct *cur; +	long orig_src_load, src_load; +	long orig_dst_load, dst_load; +	long load; +	long imp = (groupimp > 0) ? groupimp : taskimp; + +	rcu_read_lock(); +	cur = ACCESS_ONCE(dst_rq->curr); +	if (cur->pid == 0) /* idle */ +		cur = NULL; + +	/* +	 * "imp" is the fault differential for the source task between the +	 * source and destination node. Calculate the total differential for +	 * the source task and potential destination task. The more negative +	 * the value is, the more remote accesses would be expected to be +	 * incurred if the tasks were swapped. +	 */ +	if (cur) { +		/* Skip this swap candidate if it cannot move to the source cpu */ +		if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur))) +			goto unlock; + +		/* +		 * If dst and source tasks are in the same NUMA group, or not +		 * in any group, then look only at task weights. +		 */ +		if (cur->numa_group == env->p->numa_group) { +			imp = taskimp + task_weight(cur, env->src_nid) - +			      task_weight(cur, env->dst_nid); +			/* +			 * Add some hysteresis to prevent swapping the +			 * tasks within a group over tiny differences. +			 */ +			if (cur->numa_group) +				imp -= imp/16; +		} else { +			/* +			 * Compare the group weights. If a task is all by +			 * itself (not part of a group), use the task weight +			 * instead. +			 */ +			if (env->p->numa_group) +				imp = groupimp; +			else +				imp = taskimp; + +			if (cur->numa_group) +				imp += group_weight(cur, env->src_nid) - +				       group_weight(cur, env->dst_nid); +			else +				imp += task_weight(cur, env->src_nid) - +				       task_weight(cur, env->dst_nid); +		} +	} + +	if (imp < env->best_imp) +		goto unlock; + +	if (!cur) { +		/* Is there capacity at our destination? */ +		if (env->src_stats.has_free_capacity && +		    !env->dst_stats.has_free_capacity) +			goto unlock; + +		goto balance; +	} + +	/* Balance doesn't matter much if we're running a task per cpu */ +	if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) +		goto assign; + +	/* +	 * In the overloaded case, try and keep the load balanced. 
+	 */ +balance: +	orig_dst_load = env->dst_stats.load; +	orig_src_load = env->src_stats.load; + +	/* XXX missing capacity terms */ +	load = task_h_load(env->p); +	dst_load = orig_dst_load + load; +	src_load = orig_src_load - load; + +	if (cur) { +		load = task_h_load(cur); +		dst_load -= load; +		src_load += load; +	} + +	if (load_too_imbalanced(orig_src_load, orig_dst_load, +				src_load, dst_load, env)) +		goto unlock; + +assign: +	task_numa_assign(env, cur, imp); +unlock: +	rcu_read_unlock(); +} + +static void task_numa_find_cpu(struct task_numa_env *env, +				long taskimp, long groupimp) +{ +	int cpu; + +	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { +		/* Skip this CPU if the source task cannot migrate */ +		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p))) +			continue; + +		env->dst_cpu = cpu; +		task_numa_compare(env, taskimp, groupimp); +	} +} + +static int task_numa_migrate(struct task_struct *p) +{ +	struct task_numa_env env = { +		.p = p, + +		.src_cpu = task_cpu(p), +		.src_nid = task_node(p), + +		.imbalance_pct = 112, + +		.best_task = NULL, +		.best_imp = 0, +		.best_cpu = -1 +	}; +	struct sched_domain *sd; +	unsigned long taskweight, groupweight; +	int nid, ret; +	long taskimp, groupimp; + +	/* +	 * Pick the lowest SD_NUMA domain, as that would have the smallest +	 * imbalance and would be the first to start moving tasks about. +	 * +	 * And we want to avoid any moving of tasks about, as that would create +	 * random movement of tasks -- counter the numa conditions we're trying +	 * to satisfy here. +	 */ +	rcu_read_lock(); +	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); +	if (sd) +		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; +	rcu_read_unlock(); + +	/* +	 * Cpusets can break the scheduler domain tree into smaller +	 * balance domains, some of which do not cross NUMA boundaries. +	 * Tasks that are "trapped" in such domains cannot be migrated +	 * elsewhere, so there is no point in (re)trying. +	 */ +	if (unlikely(!sd)) { +		p->numa_preferred_nid = task_node(p); +		return -EINVAL; +	} + +	taskweight = task_weight(p, env.src_nid); +	groupweight = group_weight(p, env.src_nid); +	update_numa_stats(&env.src_stats, env.src_nid); +	env.dst_nid = p->numa_preferred_nid; +	taskimp = task_weight(p, env.dst_nid) - taskweight; +	groupimp = group_weight(p, env.dst_nid) - groupweight; +	update_numa_stats(&env.dst_stats, env.dst_nid); + +	/* If the preferred nid has free capacity, try to use it. */ +	if (env.dst_stats.has_free_capacity) +		task_numa_find_cpu(&env, taskimp, groupimp); + +	/* No space available on the preferred nid. Look elsewhere. */ +	if (env.best_cpu == -1) { +		for_each_online_node(nid) { +			if (nid == env.src_nid || nid == p->numa_preferred_nid) +				continue; + +			/* Only consider nodes where both task and groups benefit */ +			taskimp = task_weight(p, nid) - taskweight; +			groupimp = group_weight(p, nid) - groupweight; +			if (taskimp < 0 && groupimp < 0) +				continue; + +			env.dst_nid = nid; +			update_numa_stats(&env.dst_stats, env.dst_nid); +			task_numa_find_cpu(&env, taskimp, groupimp); +		} +	} + +	/* No better CPU than the current one was found. */ +	if (env.best_cpu == -1) +		return -EAGAIN; + +	/* +	 * If the task is part of a workload that spans multiple NUMA nodes, +	 * and is migrating into one of the workload's active nodes, remember +	 * this node as the task's preferred numa node, so the workload can +	 * settle down. 
+	 * A task that migrated to a second choice node will be better off +	 * trying for a better one later. Do not set the preferred node here. +	 */ +	if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) +		sched_setnuma(p, env.dst_nid); + +	/* +	 * Reset the scan period if the task is being rescheduled on an +	 * alternative node to recheck if the tasks is now properly placed. +	 */ +	p->numa_scan_period = task_scan_min(p); + +	if (env.best_task == NULL) { +		ret = migrate_task_to(p, env.best_cpu); +		if (ret != 0) +			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); +		return ret; +	} + +	ret = migrate_swap(p, env.best_task); +	if (ret != 0) +		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); +	put_task_struct(env.best_task); +	return ret; +} + +/* Attempt to migrate a task to a CPU on the preferred node. */ +static void numa_migrate_preferred(struct task_struct *p) +{ +	unsigned long interval = HZ; + +	/* This task has no NUMA fault statistics yet */ +	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) +		return; + +	/* Periodically retry migrating the task to the preferred node */ +	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); +	p->numa_migrate_retry = jiffies + interval; + +	/* Success if task is already running on preferred CPU */ +	if (task_node(p) == p->numa_preferred_nid) +		return; + +	/* Otherwise, try migrate to a CPU on the preferred node */ +	task_numa_migrate(p); +} + +/* + * Find the nodes on which the workload is actively running. We do this by + * tracking the nodes from which NUMA hinting faults are triggered. This can + * be different from the set of nodes where the workload's memory is currently + * located. + * + * The bitmask is used to make smarter decisions on when to do NUMA page + * migrations, To prevent flip-flopping, and excessive page migrations, nodes + * are added when they cause over 6/16 of the maximum number of faults, but + * only removed when they drop below 3/16. + */ +static void update_numa_active_node_mask(struct numa_group *numa_group) +{ +	unsigned long faults, max_faults = 0; +	int nid; + +	for_each_online_node(nid) { +		faults = group_faults_cpu(numa_group, nid); +		if (faults > max_faults) +			max_faults = faults; +	} + +	for_each_online_node(nid) { +		faults = group_faults_cpu(numa_group, nid); +		if (!node_isset(nid, numa_group->active_nodes)) { +			if (faults > max_faults * 6 / 16) +				node_set(nid, numa_group->active_nodes); +		} else if (faults < max_faults * 3 / 16) +			node_clear(nid, numa_group->active_nodes); +	} +} + +/* + * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS + * increments. The more local the fault statistics are, the higher the scan + * period will be for the next scan window. If local/remote ratio is below + * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the + * scan period will decrease + */ +#define NUMA_PERIOD_SLOTS 10 +#define NUMA_PERIOD_THRESHOLD 3 + +/* + * Increase the scan period (slow down scanning) if the majority of + * our memory is already on our local node, or if the majority of + * the page accesses are shared with other processes. + * Otherwise, decrease the scan period. 
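+ * For example (illustrative numbers): with NUMA_PERIOD_SLOTS == 10 and + * NUMA_PERIOD_THRESHOLD == 3, a task whose hinting faults are 70% local + * gets ratio == 7 and the period grows by (7 - 3) period slots, while at + * 20% local it shrinks by one slot, scaled further by the private fraction + * of faults in the code below.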
+ */ +static void update_task_scan_period(struct task_struct *p, +			unsigned long shared, unsigned long private) +{ +	unsigned int period_slot; +	int ratio; +	int diff; + +	unsigned long remote = p->numa_faults_locality[0]; +	unsigned long local = p->numa_faults_locality[1]; + +	/* +	 * If there were no record hinting faults then either the task is +	 * completely idle or all activity is areas that are not of interest +	 * to automatic numa balancing. Scan slower +	 */ +	if (local + shared == 0) { +		p->numa_scan_period = min(p->numa_scan_period_max, +			p->numa_scan_period << 1); + +		p->mm->numa_next_scan = jiffies + +			msecs_to_jiffies(p->numa_scan_period); + +		return; +	} + +	/* +	 * Prepare to scale scan period relative to the current period. +	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same +	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) +	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) +	 */ +	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); +	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); +	if (ratio >= NUMA_PERIOD_THRESHOLD) { +		int slot = ratio - NUMA_PERIOD_THRESHOLD; +		if (!slot) +			slot = 1; +		diff = slot * period_slot; +	} else { +		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; + +		/* +		 * Scale scan rate increases based on sharing. There is an +		 * inverse relationship between the degree of sharing and +		 * the adjustment made to the scanning period. Broadly +		 * speaking the intent is that there is little point +		 * scanning faster if shared accesses dominate as it may +		 * simply bounce migrations uselessly +		 */ +		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); +		diff = (diff * ratio) / NUMA_PERIOD_SLOTS; +	} + +	p->numa_scan_period = clamp(p->numa_scan_period + diff, +			task_scan_min(p), task_scan_max(p)); +	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); +} + +/* + * Get the fraction of time the task has been running since the last + * NUMA placement cycle. The scheduler keeps similar statistics, but + * decays those on a 32ms period, which is orders of magnitude off + * from the dozens-of-seconds NUMA balancing period. Use the scheduler + * stats only if the task is so new there are no NUMA statistics yet. + */ +static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) +{ +	u64 runtime, delta, now; +	/* Use the start of this time slice to avoid calculations. 
*/ +	now = p->se.exec_start; +	runtime = p->se.sum_exec_runtime; + +	if (p->last_task_numa_placement) { +		delta = runtime - p->last_sum_exec_runtime; +		*period = now - p->last_task_numa_placement; +	} else { +		delta = p->se.avg.runnable_avg_sum; +		*period = p->se.avg.runnable_avg_period; +	} + +	p->last_sum_exec_runtime = runtime; +	p->last_task_numa_placement = now; + +	return delta; +} + +static void task_numa_placement(struct task_struct *p) +{ +	int seq, nid, max_nid = -1, max_group_nid = -1; +	unsigned long max_faults = 0, max_group_faults = 0; +	unsigned long fault_types[2] = { 0, 0 }; +	unsigned long total_faults; +	u64 runtime, period; +	spinlock_t *group_lock = NULL; +  	seq = ACCESS_ONCE(p->mm->numa_scan_seq);  	if (p->numa_scan_seq == seq)  		return;  	p->numa_scan_seq = seq; +	p->numa_scan_period_max = task_scan_max(p); + +	total_faults = p->numa_faults_locality[0] + +		       p->numa_faults_locality[1]; +	runtime = numa_get_avg_runtime(p, &period); + +	/* If the task is part of a group prevent parallel updates to group stats */ +	if (p->numa_group) { +		group_lock = &p->numa_group->lock; +		spin_lock_irq(group_lock); +	} + +	/* Find the node with the highest number of faults */ +	for_each_online_node(nid) { +		unsigned long faults = 0, group_faults = 0; +		int priv, i; + +		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { +			long diff, f_diff, f_weight; + +			i = task_faults_idx(nid, priv); + +			/* Decay existing window, copy faults since last scan */ +			diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; +			fault_types[priv] += p->numa_faults_buffer_memory[i]; +			p->numa_faults_buffer_memory[i] = 0; + +			/* +			 * Normalize the faults_from, so all tasks in a group +			 * count according to CPU use, instead of by the raw +			 * number of faults. Tasks with little runtime have +			 * little over-all impact on throughput, and thus their +			 * faults are less important. +			 */ +			f_weight = div64_u64(runtime << 16, period + 1); +			f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / +				   (total_faults + 1); +			f_diff = f_weight - p->numa_faults_cpu[i] / 2; +			p->numa_faults_buffer_cpu[i] = 0; + +			p->numa_faults_memory[i] += diff; +			p->numa_faults_cpu[i] += f_diff; +			faults += p->numa_faults_memory[i]; +			p->total_numa_faults += diff; +			if (p->numa_group) { +				/* safe because we can only change our own group */ +				p->numa_group->faults[i] += diff; +				p->numa_group->faults_cpu[i] += f_diff; +				p->numa_group->total_faults += diff; +				group_faults += p->numa_group->faults[i]; +			} +		} + +		if (faults > max_faults) { +			max_faults = faults; +			max_nid = nid; +		} + +		if (group_faults > max_group_faults) { +			max_group_faults = group_faults; +			max_group_nid = nid; +		} +	} + +	update_task_scan_period(p, fault_types[0], fault_types[1]); + +	if (p->numa_group) { +		update_numa_active_node_mask(p->numa_group); +		/* +		 * If the preferred task and group nids are different, +		 * iterate over the nodes again to find the best place. 
+		 */ +		if (max_nid != max_group_nid) { +			unsigned long weight, max_weight = 0; + +			for_each_online_node(nid) { +				weight = task_weight(p, nid) + group_weight(p, nid); +				if (weight > max_weight) { +					max_weight = weight; +					max_nid = nid; +				} +			} +		} + +		spin_unlock_irq(group_lock); +	} + +	/* Preferred node as the node with the most faults */ +	if (max_faults && max_nid != p->numa_preferred_nid) { +		/* Update the preferred nid and migrate task if possible */ +		sched_setnuma(p, max_nid); +		numa_migrate_preferred(p); +	} +} + +static inline int get_numa_group(struct numa_group *grp) +{ +	return atomic_inc_not_zero(&grp->refcount); +} + +static inline void put_numa_group(struct numa_group *grp) +{ +	if (atomic_dec_and_test(&grp->refcount)) +		kfree_rcu(grp, rcu); +} + +static void task_numa_group(struct task_struct *p, int cpupid, int flags, +			int *priv) +{ +	struct numa_group *grp, *my_grp; +	struct task_struct *tsk; +	bool join = false; +	int cpu = cpupid_to_cpu(cpupid); +	int i; + +	if (unlikely(!p->numa_group)) { +		unsigned int size = sizeof(struct numa_group) + +				    4*nr_node_ids*sizeof(unsigned long); + +		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); +		if (!grp) +			return; + +		atomic_set(&grp->refcount, 1); +		spin_lock_init(&grp->lock); +		INIT_LIST_HEAD(&grp->task_list); +		grp->gid = p->pid; +		/* Second half of the array tracks nids where faults happen */ +		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * +						nr_node_ids; -	/* FIXME: Scheduling placement policy hints go here */ +		node_set(task_node(current), grp->active_nodes); + +		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) +			grp->faults[i] = p->numa_faults_memory[i]; + +		grp->total_faults = p->total_numa_faults; + +		list_add(&p->numa_entry, &grp->task_list); +		grp->nr_tasks++; +		rcu_assign_pointer(p->numa_group, grp); +	} + +	rcu_read_lock(); +	tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); + +	if (!cpupid_match_pid(tsk, cpupid)) +		goto no_join; + +	grp = rcu_dereference(tsk->numa_group); +	if (!grp) +		goto no_join; + +	my_grp = p->numa_group; +	if (grp == my_grp) +		goto no_join; + +	/* +	 * Only join the other group if its bigger; if we're the bigger group, +	 * the other task will join us. +	 */ +	if (my_grp->nr_tasks > grp->nr_tasks) +		goto no_join; + +	/* +	 * Tie-break on the grp address. +	 */ +	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) +		goto no_join; + +	/* Always join threads in the same process. 
*/ +	if (tsk->mm == current->mm) +		join = true; + +	/* Simple filter to avoid false positives due to PID collisions */ +	if (flags & TNF_SHARED) +		join = true; + +	/* Update priv based on whether false sharing was detected */ +	*priv = !join; + +	if (join && !get_numa_group(grp)) +		goto no_join; + +	rcu_read_unlock(); + +	if (!join) +		return; + +	BUG_ON(irqs_disabled()); +	double_lock_irq(&my_grp->lock, &grp->lock); + +	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { +		my_grp->faults[i] -= p->numa_faults_memory[i]; +		grp->faults[i] += p->numa_faults_memory[i]; +	} +	my_grp->total_faults -= p->total_numa_faults; +	grp->total_faults += p->total_numa_faults; + +	list_move(&p->numa_entry, &grp->task_list); +	my_grp->nr_tasks--; +	grp->nr_tasks++; + +	spin_unlock(&my_grp->lock); +	spin_unlock_irq(&grp->lock); + +	rcu_assign_pointer(p->numa_group, grp); + +	put_numa_group(my_grp); +	return; + +no_join: +	rcu_read_unlock(); +	return; +} + +void task_numa_free(struct task_struct *p) +{ +	struct numa_group *grp = p->numa_group; +	void *numa_faults = p->numa_faults_memory; +	unsigned long flags; +	int i; + +	if (grp) { +		spin_lock_irqsave(&grp->lock, flags); +		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) +			grp->faults[i] -= p->numa_faults_memory[i]; +		grp->total_faults -= p->total_numa_faults; + +		list_del(&p->numa_entry); +		grp->nr_tasks--; +		spin_unlock_irqrestore(&grp->lock, flags); +		rcu_assign_pointer(p->numa_group, NULL); +		put_numa_group(grp); +	} + +	p->numa_faults_memory = NULL; +	p->numa_faults_buffer_memory = NULL; +	p->numa_faults_cpu= NULL; +	p->numa_faults_buffer_cpu = NULL; +	kfree(numa_faults);  }  /*   * Got a PROT_NONE fault for a page on @node.   */ -void task_numa_fault(int node, int pages, bool migrated) +void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)  {  	struct task_struct *p = current; +	bool migrated = flags & TNF_MIGRATED; +	int cpu_node = task_node(current); +	int local = !!(flags & TNF_FAULT_LOCAL); +	int priv;  	if (!numabalancing_enabled)  		return; -	/* FIXME: Allocate task-specific structure for placement policy here */ +	/* for example, ksmd faulting in a user's mm */ +	if (!p->mm) +		return; + +	/* Do not worry about placement if exiting */ +	if (p->state == TASK_DEAD) +		return; + +	/* Allocate buffer to track faults on a per-node basis */ +	if (unlikely(!p->numa_faults_memory)) { +		int size = sizeof(*p->numa_faults_memory) * +			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; + +		p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); +		if (!p->numa_faults_memory) +			return; + +		BUG_ON(p->numa_faults_buffer_memory); +		/* +		 * The averaged statistics, shared & private, memory & cpu, +		 * occupy the first half of the array. The second half of the +		 * array is for current counters, which are averaged into the +		 * first set by task_numa_placement. 
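+		 * Illustrative layout for nr_node_ids == 2: slots 0-7 hold the +		 * decayed memory and cpu averages, slots 8-15 the per-scan +		 * buffers that task_numa_placement() folds back into them.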
+		 */ +		p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); +		p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); +		p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); +		p->total_numa_faults = 0; +		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); +	} + +	/* +	 * First accesses are treated as private, otherwise consider accesses +	 * to be private if the accessing pid has not changed +	 */ +	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { +		priv = 1; +	} else { +		priv = cpupid_match_pid(p, last_cpupid); +		if (!priv && !(flags & TNF_NO_GROUP)) +			task_numa_group(p, last_cpupid, flags, &priv); +	}  	/* -	 * If pages are properly placed (did not migrate) then scan slower. -	 * This is reset periodically in case of phase changes +	 * If a workload spans multiple NUMA nodes, a shared fault that +	 * occurs wholly within the set of nodes that the workload is +	 * actively using should be counted as local. This allows the +	 * scan rate to slow down when a workload has settled down.  	 */ -        if (!migrated) -		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, -			p->numa_scan_period + jiffies_to_msecs(10)); +	if (!priv && !local && p->numa_group && +			node_isset(cpu_node, p->numa_group->active_nodes) && +			node_isset(mem_node, p->numa_group->active_nodes)) +		local = 1;  	task_numa_placement(p); + +	/* +	 * Retry task to preferred node migration periodically, in case it +	 * previously failed, or the scheduler moved us. +	 */ +	if (time_after(jiffies, p->numa_migrate_retry)) +		numa_migrate_preferred(p); + +	if (migrated) +		p->numa_pages_migrated += pages; + +	p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; +	p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; +	p->numa_faults_locality[local] += pages; +}  static void reset_ptenuma_scan(struct task_struct *p) @@ -884,6 +1872,7 @@ void task_numa_work(struct callback_head *work)  	struct mm_struct *mm = p->mm;  	struct vm_area_struct *vma;  	unsigned long start, end; +	unsigned long nr_pte_updates = 0;  	long pages;  	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); @@ -900,35 +1889,9 @@ void task_numa_work(struct callback_head *work)  	if (p->flags & PF_EXITING)  		return; -	/* -	 * We do not care about task placement until a task runs on a node -	 * other than the first one used by the address space. This is -	 * largely because migrations are driven by what CPU the task -	 * is running on. If it's never scheduled on another node, it'll -	 * not migrate so why bother trapping the fault. -	 */ -	if (mm->first_nid == NUMA_PTE_SCAN_INIT) -		mm->first_nid = numa_node_id(); -	if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { -		/* Are we running on a new node yet? */ -		if (numa_node_id() == mm->first_nid && -		    !sched_feat_numa(NUMA_FORCE)) -			return; - -		mm->first_nid = NUMA_PTE_SCAN_ACTIVE; -	} - -	/* -	 * Reset the scan period if enough time has gone by. Objective is that -	 * scanning will be reduced if pages are properly placed. As tasks -	 * can enter different phases this needs to be re-examined. Lacking -	 * proper tracking of reference behaviour, this blunt hammer is used. 
-	 */ -	migrate = mm->numa_next_reset; -	if (time_after(now, migrate)) { -		p->numa_scan_period = sysctl_numa_balancing_scan_period_min; -		next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); -		xchg(&mm->numa_next_reset, next_scan); +	if (!mm->numa_next_scan) { +		mm->numa_next_scan = now + +			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);  	}  	/* @@ -938,20 +1901,20 @@ void task_numa_work(struct callback_head *work)  	if (time_before(now, migrate))  		return; -	if (p->numa_scan_period == 0) -		p->numa_scan_period = sysctl_numa_balancing_scan_period_min; +	if (p->numa_scan_period == 0) { +		p->numa_scan_period_max = task_scan_max(p); +		p->numa_scan_period = task_scan_min(p); +	}  	next_scan = now + msecs_to_jiffies(p->numa_scan_period);  	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)  		return;  	/* -	 * Do not set pte_numa if the current running node is rate-limited. -	 * This loses statistics on the fault but if we are unwilling to -	 * migrate to this node, it is less likely we can do useful work +	 * Delay this task enough that another task of this mm will likely win +	 * the next time around.  	 */ -	if (migrate_ratelimited(numa_node_id())) -		return; +	p->node_stamp += 2 * TICK_NSEC;  	start = mm->numa_scan_offset;  	pages = sysctl_numa_balancing_scan_size; @@ -967,31 +1930,54 @@ void task_numa_work(struct callback_head *work)  		vma = mm->mmap;  	}  	for (; vma; vma = vma->vm_next) { -		if (!vma_migratable(vma)) +		if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) +			continue; + +		/* +		 * Shared library pages mapped by multiple processes are not +		 * migrated as it is expected they are cache replicated. Avoid +		 * hinting faults in read-only file-backed mappings or the vdso +		 * as migrating the pages will be of marginal benefit. +		 */ +		if (!vma->vm_mm || +		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))  			continue; -		/* Skip small VMAs. They are not likely to be of relevance */ -		if (vma->vm_end - vma->vm_start < HPAGE_SIZE) +		/* +		 * Skip inaccessible VMAs to avoid any confusion between +		 * PROT_NONE and NUMA hinting ptes +		 */ +		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))  			continue;  		do {  			start = max(start, vma->vm_start);  			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);  			end = min(end, vma->vm_end); -			pages -= change_prot_numa(vma, start, end); +			nr_pte_updates += change_prot_numa(vma, start, end); + +			/* +			 * Scan sysctl_numa_balancing_scan_size but ensure that +			 * at least one PTE is updated so that unused virtual +			 * address space is quickly skipped. +			 */ +			if (nr_pte_updates) +				pages -= (end - start) >> PAGE_SHIFT;  			start = end;  			if (pages <= 0)  				goto out; + +			cond_resched();  		} while (end != vma->vm_end);  	}  out:  	/* -	 * It is possible to reach the end of the VMA list but the last few VMAs are -	 * not guaranteed to the vma_migratable. If they are not, we would find the -	 * !migratable VMA on the next scan but not reset the scanner to the start -	 * so check it now. +	 * It is possible to reach the end of the VMA list but the last few +	 * VMAs are not guaranteed to the vma_migratable. If they are not, we +	 * would find the !migratable VMA on the next scan but not reset the +	 * scanner to the start so check it now.  	 
*/  	if (vma)  		mm->numa_scan_offset = start; @@ -1025,8 +2011,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)  	if (now - curr->node_stamp > period) {  		if (!curr->node_stamp) -			curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; -		curr->node_stamp = now; +			curr->numa_scan_period = task_scan_min(curr); +		curr->node_stamp += period;  		if (!time_before(jiffies, curr->mm->numa_next_scan)) {  			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ @@ -1038,6 +2024,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)  static void task_tick_numa(struct rq *rq, struct task_struct *curr)  {  } + +static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) +{ +} + +static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) +{ +}  #endif /* CONFIG_NUMA_BALANCING */  static void @@ -1047,8 +2041,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)  	if (!parent_entity(se))  		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);  #ifdef CONFIG_SMP -	if (entity_is_task(se)) -		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); +	if (entity_is_task(se)) { +		struct rq *rq = rq_of(cfs_rq); + +		account_numa_enqueue(rq, task_of(se)); +		list_add(&se->group_node, &rq->cfs_tasks); +	}  #endif  	cfs_rq->nr_running++;  } @@ -1059,8 +2057,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)  	update_load_sub(&cfs_rq->load, se->load.weight);  	if (!parent_entity(se))  		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); -	if (entity_is_task(se)) +	if (entity_is_task(se)) { +		account_numa_dequeue(rq_of(cfs_rq), task_of(se));  		list_del_init(&se->group_node); +	}  	cfs_rq->nr_running--;  } @@ -1378,7 +2378,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,  	long contrib;  	/* The fraction of a cpu used by this cfs_rq */ -	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, +	contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,  			  sa->runnable_avg_period + 1);  	contrib -= cfs_rq->tg_runnable_contrib; @@ -1429,13 +2429,20 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)  		se->avg.load_avg_contrib >>= NICE_0_SHIFT;  	}  } -#else + +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) +{ +	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); +	__update_tg_runnable_avg(&rq->avg, &rq->cfs); +} +#else /* CONFIG_FAIR_GROUP_SCHED */  static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,  						 int force_update) {}  static inline void __update_tg_runnable_avg(struct sched_avg *sa,  						  struct cfs_rq *cfs_rq) {}  static inline void __update_group_entity_contrib(struct sched_entity *se) {} -#endif +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +#endif /* CONFIG_FAIR_GROUP_SCHED */  static inline void __update_task_entity_contrib(struct sched_entity *se)  { @@ -1533,12 +2540,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)  	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);  } -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) -{ -	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); -	__update_tg_runnable_avg(&rq->avg, &rq->cfs); -} -  /* Add the load generated by se into cfs_rq's child load-average */  static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,  						  struct sched_entity *se, @@ -1572,13 +2573,7 @@ 
static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,  		}  		wakeup = 0;  	} else { -		/* -		 * Task re-woke on same cpu (or else migrate_task_rq_fair() -		 * would have made count negative); we must be careful to avoid -		 * double-accounting blocked time after synchronizing decays. -		 */ -		se->avg.last_runnable_update += __synchronize_entity_decay(se) -							<< 20; +		__synchronize_entity_decay(se);  	}  	/* migrated tasks did not contribute to our blocked load */ @@ -1632,7 +2627,10 @@ void idle_exit_fair(struct rq *this_rq)  	update_rq_runnable_avg(this_rq, 0);  } -#else +static int idle_balance(struct rq *this_rq); + +#else /* CONFIG_SMP */ +  static inline void update_entity_load_avg(struct sched_entity *se,  					  int update_cfs_rq) {}  static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} @@ -1644,7 +2642,13 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,  					   int sleep) {}  static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,  					      int force_update) {} -#endif + +static inline int idle_balance(struct rq *rq) +{ +	return 0; +} + +#endif /* CONFIG_SMP */  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)  { @@ -1794,10 +2798,10 @@ static void __clear_buddies_last(struct sched_entity *se)  {  	for_each_sched_entity(se) {  		struct cfs_rq *cfs_rq = cfs_rq_of(se); -		if (cfs_rq->last == se) -			cfs_rq->last = NULL; -		else +		if (cfs_rq->last != se)  			break; + +		cfs_rq->last = NULL;  	}  } @@ -1805,10 +2809,10 @@ static void __clear_buddies_next(struct sched_entity *se)  {  	for_each_sched_entity(se) {  		struct cfs_rq *cfs_rq = cfs_rq_of(se); -		if (cfs_rq->next == se) -			cfs_rq->next = NULL; -		else +		if (cfs_rq->next != se)  			break; + +		cfs_rq->next = NULL;  	}  } @@ -1816,10 +2820,10 @@ static void __clear_buddies_skip(struct sched_entity *se)  {  	for_each_sched_entity(se) {  		struct cfs_rq *cfs_rq = cfs_rq_of(se); -		if (cfs_rq->skip == se) -			cfs_rq->skip = NULL; -		else +		if (cfs_rq->skip != se)  			break; + +		cfs_rq->skip = NULL;  	}  } @@ -1962,17 +2966,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);   * 3) pick the "last" process, for cache locality   * 4) do not run the "skip" process, if something else is available   */ -static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) +static struct sched_entity * +pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)  { -	struct sched_entity *se = __pick_first_entity(cfs_rq); -	struct sched_entity *left = se; +	struct sched_entity *left = __pick_first_entity(cfs_rq); +	struct sched_entity *se; + +	/* +	 * If curr is set we have to see if its left of the leftmost entity +	 * still in the tree, provided there was anything in the tree at all. +	 */ +	if (!left || (curr && entity_before(curr, left))) +		left = curr; + +	se = left; /* ideally we run the leftmost entity */  	/*  	 * Avoid running the skip buddy, if running something else can  	 * be done without getting too unfair.  	 
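The new pick_next_entity(cfs_rq, curr) above has to weigh curr against the leftmost tree entity explicitly, because the running entity is not kept in the rbtree. A minimal standalone sketch of that selection rule follows; the simplified struct and the assumption that entity_before() is the usual signed vruntime comparison are mine, not taken from this hunk.

#include <stdio.h>

/* Simplified stand-in for the kernel type; only vruntime matters here. */
struct sched_entity { unsigned long long vruntime; const char *name; };

static int entity_before(struct sched_entity *a, struct sched_entity *b)
{
	return (long long)(a->vruntime - b->vruntime) < 0;
}

/* Mirror of the "left = leftmost tree entity, unless curr has the smaller
 * vruntime" rule from pick_next_entity(); leftmost may be NULL. */
static struct sched_entity *pick_left(struct sched_entity *leftmost,
				      struct sched_entity *curr)
{
	if (!leftmost || (curr && entity_before(curr, leftmost)))
		return curr;
	return leftmost;
}

int main(void)
{
	struct sched_entity a = { .vruntime = 1000, .name = "tree-leftmost" };
	struct sched_entity c = { .vruntime =  800, .name = "curr" };

	printf("picked: %s\n", pick_left(&a, &c)->name);	/* curr */
	printf("picked: %s\n", pick_left(&a, NULL)->name);	/* tree-leftmost */
	return 0;
}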
*/  	if (cfs_rq->skip == se) { -		struct sched_entity *second = __pick_next_entity(se); +		struct sched_entity *second; + +		if (se == curr) { +			second = __pick_first_entity(cfs_rq); +		} else { +			second = __pick_next_entity(se); +			if (!second || (curr && entity_before(curr, second))) +				second = curr; +		} +  		if (second && wakeup_preempt_entity(second, left) < 1)  			se = second;  	} @@ -1994,7 +3017,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)  	return se;  } -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);  static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)  { @@ -2070,13 +3093,14 @@ static inline bool cfs_bandwidth_used(void)  	return static_key_false(&__cfs_bandwidth_used);  } -void account_cfs_bandwidth_used(int enabled, int was_enabled) +void cfs_bandwidth_usage_inc(void)  { -	/* only need to count groups transitioning between enabled/!enabled */ -	if (enabled && !was_enabled) -		static_key_slow_inc(&__cfs_bandwidth_used); -	else if (!enabled && was_enabled) -		static_key_slow_dec(&__cfs_bandwidth_used); +	static_key_slow_inc(&__cfs_bandwidth_used); +} + +void cfs_bandwidth_usage_dec(void) +{ +	static_key_slow_dec(&__cfs_bandwidth_used);  }  #else /* HAVE_JUMP_LABEL */  static bool cfs_bandwidth_used(void) @@ -2084,7 +3108,8 @@ static bool cfs_bandwidth_used(void)  	return true;  } -void account_cfs_bandwidth_used(int enabled, int was_enabled) {} +void cfs_bandwidth_usage_inc(void) {} +void cfs_bandwidth_usage_dec(void) {}  #endif /* HAVE_JUMP_LABEL */  /* @@ -2156,7 +3181,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)  		 */  		if (!cfs_b->timer_active) {  			__refill_cfs_bandwidth_runtime(cfs_b); -			__start_cfs_bandwidth(cfs_b); +			__start_cfs_bandwidth(cfs_b, false);  		}  		if (cfs_b->runtime > 0) { @@ -2201,10 +3226,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)  	 * has not truly expired.  	 *  	 * Fortunately we can check determine whether this the case by checking -	 * whether the global deadline has advanced. +	 * whether the global deadline has advanced. It is valid to compare +	 * cfs_b->runtime_expires without any locks since we only care about +	 * exact equality, so a partial write will still work.  	 
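cfs_bandwidth_usage_inc()/dec() earlier in this hunk reference-count a jump label so that cfs_bandwidth_used() stays a patched-out branch while no group has a quota set. Below is a userspace stand-in for that pattern, with a plain counter in place of the static_key; it only illustrates the enable/disable bookkeeping, not the runtime branch patching.

#include <stdio.h>

static int cfs_bandwidth_used_key;	/* >0 means "some group has a quota" */

static int cfs_bandwidth_used(void)
{
	return cfs_bandwidth_used_key > 0;
}

static void cfs_bandwidth_usage_inc(void) { cfs_bandwidth_used_key++; }
static void cfs_bandwidth_usage_dec(void) { cfs_bandwidth_used_key--; }

int main(void)
{
	cfs_bandwidth_usage_inc();	/* first group sets a quota */
	cfs_bandwidth_usage_inc();	/* second group sets a quota */
	cfs_bandwidth_usage_dec();	/* one group goes back to RUNTIME_INF */
	printf("bandwidth paths enabled: %d\n", cfs_bandwidth_used());
	return 0;
}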
*/ -	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { +	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {  		/* extend local deadline, drift is bounded above by 2 ticks */  		cfs_rq->runtime_expires += TICK_NSEC;  	} else { @@ -2213,8 +3240,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)  	}  } -static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, -				     unsigned long delta_exec) +static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)  {  	/* dock delta_exec before expiring quota (as it could span periods) */  	cfs_rq->runtime_remaining -= delta_exec; @@ -2232,7 +3258,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,  }  static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)  {  	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)  		return; @@ -2329,12 +3355,14 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)  	}  	if (!se) -		rq->nr_running -= task_delta; +		sub_nr_running(rq, task_delta);  	cfs_rq->throttled = 1;  	cfs_rq->throttled_clock = rq_clock(rq);  	raw_spin_lock(&cfs_b->lock);  	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); +	if (!cfs_b->timer_active) +		__start_cfs_bandwidth(cfs_b, false);  	raw_spin_unlock(&cfs_b->lock);  } @@ -2378,7 +3406,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)  	}  	if (!se) -		rq->nr_running += task_delta; +		add_nr_running(rq, task_delta);  	/* determine whether we need to wake up potentially idle cpu */  	if (rq->curr == rq->idle && rq->cfs.nr_running) @@ -2432,28 +3460,35 @@ next:  static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)  {  	u64 runtime, runtime_expires; -	int idle = 1, throttled; +	int throttled; -	raw_spin_lock(&cfs_b->lock);  	/* no need to continue the timer with no bandwidth constraint */  	if (cfs_b->quota == RUNTIME_INF) -		goto out_unlock; +		goto out_deactivate;  	throttled = !list_empty(&cfs_b->throttled_cfs_rq); -	/* idle depends on !throttled (for the case of a large deficit) */ -	idle = cfs_b->idle && !throttled;  	cfs_b->nr_periods += overrun; -	/* if we're going inactive then everything else can be deferred */ -	if (idle) -		goto out_unlock; +	/* +	 * idle depends on !throttled (for the case of a large deficit), and if +	 * we're going inactive then everything else can be deferred +	 */ +	if (cfs_b->idle && !throttled) +		goto out_deactivate; + +	/* +	 * if we have relooped after returning idle once, we need to update our +	 * status as actually running, so that other cpus doing +	 * __start_cfs_bandwidth will stop trying to cancel us. +	 */ +	cfs_b->timer_active = 1;  	__refill_cfs_bandwidth_runtime(cfs_b);  	if (!throttled) {  		/* mark as potentially idle for the upcoming period */  		cfs_b->idle = 1; -		goto out_unlock; +		return 0;  	}  	/* account preceding periods in which throttling occurred */ @@ -2493,12 +3528,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)  	 * timer to remain active while there are any throttled entities.)  	 
*/  	cfs_b->idle = 0; -out_unlock: -	if (idle) -		cfs_b->timer_active = 0; -	raw_spin_unlock(&cfs_b->lock); -	return idle; +	return 0; + +out_deactivate: +	cfs_b->timer_active = 0; +	return 1;  }  /* a cfs_rq won't donate quota below this amount */ @@ -2508,7 +3543,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;  /* how long we wait to gather additional slack before distributing */  static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; -/* are we near the end of the current quota period? */ +/* + * Are we near the end of the current quota period? + * + * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the + * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of + * migrate_hrtimers, base is never cleared, so we are fine. + */  static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)  {  	struct hrtimer *refresh_timer = &cfs_b->period_timer; @@ -2584,10 +3625,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)  	u64 expires;  	/* confirm we're still not at a refresh boundary */ -	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) +	raw_spin_lock(&cfs_b->lock); +	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { +		raw_spin_unlock(&cfs_b->lock);  		return; +	} -	raw_spin_lock(&cfs_b->lock);  	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {  		runtime = cfs_b->runtime;  		cfs_b->runtime = 0; @@ -2631,22 +3674,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)  }  /* conditionally throttle active cfs_rq's from put_prev_entity() */ -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)  {  	if (!cfs_bandwidth_used()) -		return; +		return false;  	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) -		return; +		return false;  	/*  	 * it's possible for a throttled entity to be forced into a running  	 * state (e.g. set_curr_task), in this case we're finished.  	 */  	if (cfs_rq_throttled(cfs_rq)) -		return; +		return true;  	throttle_cfs_rq(cfs_rq); +	return true;  }  static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) @@ -2666,6 +3710,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)  	int overrun;  	int idle = 0; +	raw_spin_lock(&cfs_b->lock);  	for (;;) {  		now = hrtimer_cb_get_time(timer);  		overrun = hrtimer_forward(timer, now, cfs_b->period); @@ -2675,6 +3720,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)  		idle = do_sched_cfs_period_timer(cfs_b, overrun);  	} +	raw_spin_unlock(&cfs_b->lock);  	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;  } @@ -2700,7 +3746,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)  }  /* requires cfs_b->lock, may release to reprogram timer */ -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)  {  	/*  	 * The timer may be active because we're trying to set a new bandwidth @@ -2708,14 +3754,14 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)  	 * (timer_active==0 becomes visible before the hrtimer call-back  	 * terminates).  
In either case we ensure that it's re-programmed  	 */ -	while (unlikely(hrtimer_active(&cfs_b->period_timer))) { +	while (unlikely(hrtimer_active(&cfs_b->period_timer)) && +	       hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { +		/* bounce the lock to allow do_sched_cfs_period_timer to run */  		raw_spin_unlock(&cfs_b->lock); -		/* ensure cfs_b->lock is available while we wait */ -		hrtimer_cancel(&cfs_b->period_timer); - +		cpu_relax();  		raw_spin_lock(&cfs_b->lock);  		/* if someone else restarted the timer then we're done */ -		if (cfs_b->timer_active) +		if (!force && cfs_b->timer_active)  			return;  	} @@ -2734,8 +3780,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)  	struct cfs_rq *cfs_rq;  	for_each_leaf_cfs_rq(rq, cfs_rq) { -		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); -  		if (!cfs_rq->runtime_enabled)  			continue; @@ -2743,7 +3787,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)  		 * clock_task is not advancing so we just need to make sure  		 * there's some valid quota amount  		 */ -		cfs_rq->runtime_remaining = cfs_b->quota; +		cfs_rq->runtime_remaining = 1;  		if (cfs_rq_throttled(cfs_rq))  			unthrottle_cfs_rq(cfs_rq);  	} @@ -2755,9 +3799,8 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)  	return rq_clock_task(rq_of(cfs_rq));  } -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, -				     unsigned long delta_exec) {} -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }  static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -2895,7 +3938,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)  	if (!se) {  		update_rq_runnable_avg(rq, rq->nr_running); -		inc_nr_running(rq); +		add_nr_running(rq, 1);  	}  	hrtick_update(rq);  } @@ -2955,7 +3998,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)  	}  	if (!se) { -		dec_nr_running(rq); +		sub_nr_running(rq, 1);  		update_rq_runnable_avg(rq, 1);  	}  	hrtick_update(rq); @@ -3001,9 +4044,9 @@ static unsigned long target_load(int cpu, int type)  	return max(rq->cpu_load[type-1], total);  } -static unsigned long power_of(int cpu) +static unsigned long capacity_of(int cpu)  { -	return cpu_rq(cpu)->cpu_power; +	return cpu_rq(cpu)->cpu_capacity;  }  static unsigned long cpu_avg_load_per_task(int cpu) @@ -3025,8 +4068,8 @@ static void record_wakee(struct task_struct *p)  	 * about the boundary, really active task won't care  	 * about the loss.  	 
*/ -	if (jiffies > current->wakee_flip_decay_ts + HZ) { -		current->wakee_flips = 0; +	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { +		current->wakee_flips >>= 1;  		current->wakee_flip_decay_ts = jiffies;  	} @@ -3166,8 +4209,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)  }  #else -static inline unsigned long effective_load(struct task_group *tg, int cpu, -		unsigned long wl, unsigned long wg) +static long effective_load(struct task_group *tg, int cpu, long wl, long wg)  {  	return wl;  } @@ -3247,12 +4289,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)  		s64 this_eff_load, prev_eff_load;  		this_eff_load = 100; -		this_eff_load *= power_of(prev_cpu); +		this_eff_load *= capacity_of(prev_cpu);  		this_eff_load *= this_load +  			effective_load(tg, this_cpu, weight, weight);  		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; -		prev_eff_load *= power_of(this_cpu); +		prev_eff_load *= capacity_of(this_cpu);  		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);  		balanced = this_eff_load <= prev_eff_load; @@ -3292,12 +4334,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)   */  static struct sched_group *  find_idlest_group(struct sched_domain *sd, struct task_struct *p, -		  int this_cpu, int load_idx) +		  int this_cpu, int sd_flag)  {  	struct sched_group *idlest = NULL, *group = sd->groups;  	unsigned long min_load = ULONG_MAX, this_load = 0; +	int load_idx = sd->forkexec_idx;  	int imbalance = 100 + (sd->imbalance_pct-100)/2; +	if (sd_flag & SD_BALANCE_WAKE) +		load_idx = sd->wake_idx; +  	do {  		unsigned long load, avg_load;  		int local_group; @@ -3324,8 +4370,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,  			avg_load += load;  		} -		/* Adjust by relative CPU power of the group */ -		avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; +		/* Adjust by relative CPU capacity of the group */ +		avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;  		if (local_group) {  			this_load = avg_load; @@ -3409,22 +4455,22 @@ done:  }  /* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. + * select_task_rq_fair: Select target runqueue for the waking task in domains + * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, + * SD_BALANCE_FORK, or SD_BALANCE_EXEC.   * - * Balance, ie. select the least loaded group. + * Balances load by selecting the idlest cpu in the idlest group, or under + * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.   * - * Returns the target CPU number, or the same CPU if no balancing is needed. + * Returns the target cpu number.   *   * preempt must be disabled.   
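A worked example of the wake_affine() effective-load comparison above. All numbers are made up: equal CPU capacities, imbalance_pct = 125, and the effective_load() terms folded into the load values for brevity.

#include <stdio.h>

int main(void)
{
	long long capacity_prev = 1024, capacity_this = 1024;
	long long this_load = 2048, prev_load = 1536;	/* incl. load adjustments */
	int imbalance_pct = 125;

	long long this_eff = 100LL * capacity_prev * this_load;
	long long prev_eff = (100 + (imbalance_pct - 100) / 2) *
			     capacity_this * prev_load;

	/* balanced (this_eff <= prev_eff) would allow the affine wakeup */
	printf("this_eff=%lld prev_eff=%lld -> affine wakeup: %s\n",
	       this_eff, prev_eff, this_eff <= prev_eff ? "yes" : "no");
	return 0;
}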
*/  static int -select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)  {  	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;  	int cpu = smp_processor_id(); -	int prev_cpu = task_cpu(p);  	int new_cpu = cpu;  	int want_affine = 0;  	int sync = wake_flags & WF_SYNC; @@ -3457,16 +4503,15 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)  			sd = tmp;  	} -	if (affine_sd) { -		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) -			prev_cpu = cpu; +	if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) +		prev_cpu = cpu; +	if (sd_flag & SD_BALANCE_WAKE) {  		new_cpu = select_idle_sibling(p, prev_cpu);  		goto unlock;  	}  	while (sd) { -		int load_idx = sd->forkexec_idx;  		struct sched_group *group;  		int weight; @@ -3475,10 +4520,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)  			continue;  		} -		if (sd_flag & SD_BALANCE_WAKE) -			load_idx = sd->wake_idx; - -		group = find_idlest_group(sd, p, cpu, load_idx); +		group = find_idlest_group(sd, p, cpu, sd_flag);  		if (!group) {  			sd = sd->child;  			continue; @@ -3532,6 +4574,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)  		atomic_long_add(se->avg.load_avg_contrib,  						&cfs_rq->removed_load);  	} + +	/* We have migrated, no longer consider this task hot */ +	se->exec_start = 0;  }  #endif /* CONFIG_SMP */ @@ -3695,26 +4740,124 @@ preempt:  		set_last_buddy(se);  } -static struct task_struct *pick_next_task_fair(struct rq *rq) +static struct task_struct * +pick_next_task_fair(struct rq *rq, struct task_struct *prev)  { -	struct task_struct *p;  	struct cfs_rq *cfs_rq = &rq->cfs;  	struct sched_entity *se; +	struct task_struct *p; +	int new_tasks; +again: +#ifdef CONFIG_FAIR_GROUP_SCHED  	if (!cfs_rq->nr_running) -		return NULL; +		goto idle; + +	if (prev->sched_class != &fair_sched_class) +		goto simple; + +	/* +	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather +	 * likely that a next task is from the same cgroup as the current. +	 * +	 * Therefore attempt to avoid putting and setting the entire cgroup +	 * hierarchy, only change the part that actually changes. +	 */ + +	do { +		struct sched_entity *curr = cfs_rq->curr; + +		/* +		 * Since we got here without doing put_prev_entity() we also +		 * have to consider cfs_rq->curr. If it is still a runnable +		 * entity, update_curr() will update its vruntime, otherwise +		 * forget we've ever seen it. +		 */ +		if (curr && curr->on_rq) +			update_curr(cfs_rq); +		else +			curr = NULL; + +		/* +		 * This call to check_cfs_rq_runtime() will do the throttle and +		 * dequeue its entity in the parent(s). Therefore the 'simple' +		 * nr_running test will indeed be correct. +		 */ +		if (unlikely(check_cfs_rq_runtime(cfs_rq))) +			goto simple; + +		se = pick_next_entity(cfs_rq, curr); +		cfs_rq = group_cfs_rq(se); +	} while (cfs_rq); + +	p = task_of(se); + +	/* +	 * Since we haven't yet done put_prev_entity and if the selected task +	 * is a different task than we started out with, try and touch the +	 * least amount of cfs_rqs. 
+	 */ +	if (prev != p) { +		struct sched_entity *pse = &prev->se; + +		while (!(cfs_rq = is_same_group(se, pse))) { +			int se_depth = se->depth; +			int pse_depth = pse->depth; + +			if (se_depth <= pse_depth) { +				put_prev_entity(cfs_rq_of(pse), pse); +				pse = parent_entity(pse); +			} +			if (se_depth >= pse_depth) { +				set_next_entity(cfs_rq_of(se), se); +				se = parent_entity(se); +			} +		} + +		put_prev_entity(cfs_rq, pse); +		set_next_entity(cfs_rq, se); +	} + +	if (hrtick_enabled(rq)) +		hrtick_start_fair(rq, p); + +	return p; +simple: +	cfs_rq = &rq->cfs; +#endif + +	if (!cfs_rq->nr_running) +		goto idle; + +	put_prev_task(rq, prev);  	do { -		se = pick_next_entity(cfs_rq); +		se = pick_next_entity(cfs_rq, NULL);  		set_next_entity(cfs_rq, se);  		cfs_rq = group_cfs_rq(se);  	} while (cfs_rq);  	p = task_of(se); +  	if (hrtick_enabled(rq))  		hrtick_start_fair(rq, p);  	return p; + +idle: +	new_tasks = idle_balance(rq); +	/* +	 * Because idle_balance() releases (and re-acquires) rq->lock, it is +	 * possible for any higher priority task to appear. In that case we +	 * must re-start the pick_next_entity() loop. +	 */ +	if (new_tasks < 0) +		return RETRY_TASK; + +	if (new_tasks > 0) +		goto again; + +	return NULL;  }  /* @@ -3808,14 +4951,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp   *   *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)   * - * P_i is the cpu power (or compute capacity) of cpu i, typically it is the + * C_i is the compute capacity of cpu i, typically it is the   * fraction of 'recent' time available for SCHED_OTHER task execution. But it   * can also include other factors [XXX].   *   * To achieve this balance we define a measure of imbalance which follows   * directly from (1):   * - *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4) + *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)   *   * We them move tasks around to minimize the imbalance. 
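A quick numeric illustration of imbalance measure (4) above, for two CPUs of equal capacity:

#include <stdio.h>

/* Two CPUs with C = 1 and weights W_i = 6, W_j = 2, so avg(W/C) = 4. */
static double max2(double a, double b) { return a > b ? a : b; }
static double min2(double a, double b) { return a < b ? a : b; }

int main(void)
{
	double Wi = 6, Wj = 2, Ci = 1, Cj = 1;
	double avg = (Wi / Ci + Wj / Cj) / 2;

	double imb = max2(avg, Wi / Ci) - min2(avg, Wj / Cj);
	printf("avg=%.1f imb=%.1f\n", avg, imb);	/* 4.0 and 4.0 */

	/* after moving weight 2 from i to j the system is balanced */
	Wi = 4; Wj = 4;
	imb = max2(avg, Wi / Ci) - min2(avg, Wj / Cj);
	printf("after move: imb=%.1f\n", imb);		/* 0.0 */
	return 0;
}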
In the continuous   * function space it is obvious this converges, in the discrete case we get @@ -3904,9 +5047,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp  static unsigned long __read_mostly max_load_balance_interval = HZ/10; +enum fbq_type { regular, remote, all }; +  #define LBF_ALL_PINNED	0x01  #define LBF_NEED_BREAK	0x02 -#define LBF_SOME_PINNED 0x04 +#define LBF_DST_PINNED  0x04 +#define LBF_SOME_PINNED	0x08  struct lb_env {  	struct sched_domain	*sd; @@ -3929,6 +5075,8 @@ struct lb_env {  	unsigned int		loop;  	unsigned int		loop_break;  	unsigned int		loop_max; + +	enum fbq_type		fbq_type;  };  /* @@ -3947,7 +5095,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)   * Is this task likely cache-hot:   */  static int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +task_hot(struct task_struct *p, u64 now)  {  	s64 delta; @@ -3975,6 +5123,94 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)  	return delta < (s64)sysctl_sched_migration_cost;  } +#ifdef CONFIG_NUMA_BALANCING +/* Returns true if the destination node has incurred more faults */ +static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) +{ +	struct numa_group *numa_group = rcu_dereference(p->numa_group); +	int src_nid, dst_nid; + +	if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || +	    !(env->sd->flags & SD_NUMA)) { +		return false; +	} + +	src_nid = cpu_to_node(env->src_cpu); +	dst_nid = cpu_to_node(env->dst_cpu); + +	if (src_nid == dst_nid) +		return false; + +	if (numa_group) { +		/* Task is already in the group's interleave set. */ +		if (node_isset(src_nid, numa_group->active_nodes)) +			return false; + +		/* Task is moving into the group's interleave set. */ +		if (node_isset(dst_nid, numa_group->active_nodes)) +			return true; + +		return group_faults(p, dst_nid) > group_faults(p, src_nid); +	} + +	/* Encourage migration to the preferred node. */ +	if (dst_nid == p->numa_preferred_nid) +		return true; + +	return task_faults(p, dst_nid) > task_faults(p, src_nid); +} + + +static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) +{ +	struct numa_group *numa_group = rcu_dereference(p->numa_group); +	int src_nid, dst_nid; + +	if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) +		return false; + +	if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) +		return false; + +	src_nid = cpu_to_node(env->src_cpu); +	dst_nid = cpu_to_node(env->dst_cpu); + +	if (src_nid == dst_nid) +		return false; + +	if (numa_group) { +		/* Task is moving within/into the group's interleave set. */ +		if (node_isset(dst_nid, numa_group->active_nodes)) +			return false; + +		/* Task is moving out of the group's interleave set. */ +		if (node_isset(src_nid, numa_group->active_nodes)) +			return true; + +		return group_faults(p, dst_nid) < group_faults(p, src_nid); +	} + +	/* Migrating away from the preferred node is always bad. */ +	if (src_nid == p->numa_preferred_nid) +		return true; + +	return task_faults(p, dst_nid) < task_faults(p, src_nid); +} + +#else +static inline bool migrate_improves_locality(struct task_struct *p, +					     struct lb_env *env) +{ +	return false; +} + +static inline bool migrate_degrades_locality(struct task_struct *p, +					     struct lb_env *env) +{ +	return false; +} +#endif +  /*   * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?   
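A condensed restatement of the no-numa_group branches of migrate_improves_locality() just above, with a plain array standing in for task_faults(p, nid); it only illustrates the decision order (same node, preferred node, then fault counts), not the group/active_nodes handling.

#include <stdio.h>
#include <stdbool.h>

static bool improves(int src_nid, int dst_nid, int preferred_nid,
		     const unsigned long *faults_on)
{
	if (src_nid == dst_nid)
		return false;
	if (dst_nid == preferred_nid)
		return true;
	return faults_on[dst_nid] > faults_on[src_nid];
}

int main(void)
{
	unsigned long faults_on[4] = { 10, 80, 30, 5 };

	printf("0 -> 1 improves: %d\n", improves(0, 1, 1, faults_on)); /* 1 */
	printf("1 -> 2 improves: %d\n", improves(1, 2, 1, faults_on)); /* 0 */
	return 0;
}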
*/ @@ -3997,6 +5233,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)  		schedstat_inc(p, se.statistics.nr_failed_migrations_affine); +		env->flags |= LBF_SOME_PINNED; +  		/*  		 * Remember if this task can be migrated to any other cpu in  		 * our sched_group. We may want to revisit it if we couldn't @@ -4005,13 +5243,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)  		 * Also avoid computing new_dst_cpu if we have already computed  		 * one in current iteration.  		 */ -		if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) +		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))  			return 0;  		/* Prevent to re-select dst_cpu via env's cpus */  		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {  			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { -				env->flags |= LBF_SOME_PINNED; +				env->flags |= LBF_DST_PINNED;  				env->new_dst_cpu = cpu;  				break;  			} @@ -4030,11 +5268,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)  	/*  	 * Aggressive migration if: -	 * 1) task is cache cold, or -	 * 2) too many balance attempts have failed. +	 * 1) destination numa is preferred +	 * 2) task is cache cold, or +	 * 3) too many balance attempts have failed.  	 */ +	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); +	if (!tsk_cache_hot) +		tsk_cache_hot = migrate_degrades_locality(p, env); + +	if (migrate_improves_locality(p, env)) { +#ifdef CONFIG_SCHEDSTATS +		if (tsk_cache_hot) { +			schedstat_inc(env->sd, lb_hot_gained[env->idle]); +			schedstat_inc(p, se.statistics.nr_forced_migrations); +		} +#endif +		return 1; +	} -	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);  	if (!tsk_cache_hot ||  		env->sd->nr_balance_failed > env->sd->cache_nice_tries) { @@ -4077,8 +5328,6 @@ static int move_one_task(struct lb_env *env)  	return 0;  } -static unsigned long task_h_load(struct task_struct *p); -  static const unsigned int sched_nr_migrate_break = 32;  /* @@ -4284,13 +5533,17 @@ struct sg_lb_stats {  	unsigned long group_load; /* Total load over the CPUs of the group */  	unsigned long sum_weighted_load; /* Weighted load of group's tasks */  	unsigned long load_per_task; -	unsigned long group_power; +	unsigned long group_capacity;  	unsigned int sum_nr_running; /* Nr tasks running in the group */ -	unsigned int group_capacity; +	unsigned int group_capacity_factor;  	unsigned int idle_cpus;  	unsigned int group_weight;  	int group_imb; /* Is there an imbalance in the group ? */ -	int group_has_capacity; /* Is there extra capacity in the group? 
*/ +	int group_has_free_capacity; +#ifdef CONFIG_NUMA_BALANCING +	unsigned int nr_numa_running; +	unsigned int nr_preferred_running; +#endif  };  /* @@ -4301,7 +5554,7 @@ struct sd_lb_stats {  	struct sched_group *busiest;	/* Busiest group in this sd */  	struct sched_group *local;	/* Local group in this sd */  	unsigned long total_load;	/* Total load of all groups in sd */ -	unsigned long total_pwr;	/* Total power of all groups in sd */ +	unsigned long total_capacity;	/* Total capacity of all groups in sd */  	unsigned long avg_load;	/* Average load across all groups in sd */  	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ @@ -4320,7 +5573,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)  		.busiest = NULL,  		.local = NULL,  		.total_load = 0UL, -		.total_pwr = 0UL, +		.total_capacity = 0UL,  		.busiest_stat = {  			.avg_load = 0UL,  		}, @@ -4330,7 +5583,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)  /**   * get_sd_load_idx - Obtain the load index for a given sched domain.   * @sd: The sched_domain whose load_idx is to be obtained. - * @idle: The Idle status of the CPU for whose sd load_icx is obtained. + * @idle: The idle status of the CPU for whose sd load_idx is obtained.   *   * Return: The load index.   */ @@ -4355,17 +5608,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd,  	return load_idx;  } -static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)  { -	return SCHED_POWER_SCALE; +	return SCHED_CAPACITY_SCALE;  } -unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)  { -	return default_scale_freq_power(sd, cpu); +	return default_scale_capacity(sd, cpu);  } -static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu)  {  	unsigned long weight = sd->span_weight;  	unsigned long smt_gain = sd->smt_gain; @@ -4375,15 +5628,16 @@ static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)  	return smt_gain;  } -unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu)  { -	return default_scale_smt_power(sd, cpu); +	return default_scale_smt_capacity(sd, cpu);  } -static unsigned long scale_rt_power(int cpu) +static unsigned long scale_rt_capacity(int cpu)  {  	struct rq *rq = cpu_rq(cpu);  	u64 total, available, age_stamp, avg; +	s64 delta;  	/*  	 * Since we're reading these variables without serialization make sure @@ -4392,74 +5646,78 @@ static unsigned long scale_rt_power(int cpu)  	age_stamp = ACCESS_ONCE(rq->age_stamp);  	avg = ACCESS_ONCE(rq->rt_avg); -	total = sched_avg_period() + (rq_clock(rq) - age_stamp); +	delta = rq_clock(rq) - age_stamp; +	if (unlikely(delta < 0)) +		delta = 0; + +	total = sched_avg_period() + delta;  	if (unlikely(total < avg)) { -		/* Ensures that power won't end up being negative */ +		/* Ensures that capacity won't end up being negative */  		available = 0;  	} else {  		available = total - avg;  	} -	if (unlikely((s64)total < SCHED_POWER_SCALE)) -		total = SCHED_POWER_SCALE; +	if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) +		total = SCHED_CAPACITY_SCALE; -	total >>= SCHED_POWER_SHIFT; +	total >>= SCHED_CAPACITY_SHIFT;  	return div_u64(available, total);  } 
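A worked example of the scale_rt_capacity() arithmetic above, with made-up numbers: if roughly a quarter of the averaging window went to RT/IRQ time, about three quarters of SCHED_CAPACITY_SCALE (1024) remains for CFS.

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10

int main(void)
{
	unsigned long long total = 1048576000ULL;	/* sched_avg_period() + delta, ns */
	unsigned long long avg   =  262144000ULL;	/* rq->rt_avg, ns */
	unsigned long long available = total - avg;

	total >>= SCHED_CAPACITY_SHIFT;
	printf("scaled rt capacity = %llu\n", available / total);	/* 768 */
	return 0;
}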
-static void update_cpu_power(struct sched_domain *sd, int cpu) +static void update_cpu_capacity(struct sched_domain *sd, int cpu)  {  	unsigned long weight = sd->span_weight; -	unsigned long power = SCHED_POWER_SCALE; +	unsigned long capacity = SCHED_CAPACITY_SCALE;  	struct sched_group *sdg = sd->groups; -	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { -		if (sched_feat(ARCH_POWER)) -			power *= arch_scale_smt_power(sd, cpu); +	if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { +		if (sched_feat(ARCH_CAPACITY)) +			capacity *= arch_scale_smt_capacity(sd, cpu);  		else -			power *= default_scale_smt_power(sd, cpu); +			capacity *= default_scale_smt_capacity(sd, cpu); -		power >>= SCHED_POWER_SHIFT; +		capacity >>= SCHED_CAPACITY_SHIFT;  	} -	sdg->sgp->power_orig = power; +	sdg->sgc->capacity_orig = capacity; -	if (sched_feat(ARCH_POWER)) -		power *= arch_scale_freq_power(sd, cpu); +	if (sched_feat(ARCH_CAPACITY)) +		capacity *= arch_scale_freq_capacity(sd, cpu);  	else -		power *= default_scale_freq_power(sd, cpu); +		capacity *= default_scale_capacity(sd, cpu); -	power >>= SCHED_POWER_SHIFT; +	capacity >>= SCHED_CAPACITY_SHIFT; -	power *= scale_rt_power(cpu); -	power >>= SCHED_POWER_SHIFT; +	capacity *= scale_rt_capacity(cpu); +	capacity >>= SCHED_CAPACITY_SHIFT; -	if (!power) -		power = 1; +	if (!capacity) +		capacity = 1; -	cpu_rq(cpu)->cpu_power = power; -	sdg->sgp->power = power; +	cpu_rq(cpu)->cpu_capacity = capacity; +	sdg->sgc->capacity = capacity;  } -void update_group_power(struct sched_domain *sd, int cpu) +void update_group_capacity(struct sched_domain *sd, int cpu)  {  	struct sched_domain *child = sd->child;  	struct sched_group *group, *sdg = sd->groups; -	unsigned long power; +	unsigned long capacity, capacity_orig;  	unsigned long interval;  	interval = msecs_to_jiffies(sd->balance_interval);  	interval = clamp(interval, 1UL, max_load_balance_interval); -	sdg->sgp->next_update = jiffies + interval; +	sdg->sgc->next_update = jiffies + interval;  	if (!child) { -		update_cpu_power(sd, cpu); +		update_cpu_capacity(sd, cpu);  		return;  	} -	power = 0; +	capacity_orig = capacity = 0;  	if (child->flags & SD_OVERLAP) {  		/* @@ -4467,8 +5725,33 @@ void update_group_power(struct sched_domain *sd, int cpu)  		 * span the current group.  		 */ -		for_each_cpu(cpu, sched_group_cpus(sdg)) -			power += power_of(cpu); +		for_each_cpu(cpu, sched_group_cpus(sdg)) { +			struct sched_group_capacity *sgc; +			struct rq *rq = cpu_rq(cpu); + +			/* +			 * build_sched_domains() -> init_sched_groups_capacity() +			 * gets here before we've attached the domains to the +			 * runqueues. +			 * +			 * Use capacity_of(), which is set irrespective of domains +			 * in update_cpu_capacity(). +			 * +			 * This avoids capacity/capacity_orig from being 0 and +			 * causing divide-by-zero issues on boot. +			 * +			 * Runtime updates will correct capacity_orig. 
+			 */ +			if (unlikely(!rq->sd)) { +				capacity_orig += capacity_of(cpu); +				capacity += capacity_of(cpu); +				continue; +			} + +			sgc = rq->sd->groups->sgc; +			capacity_orig += sgc->capacity_orig; +			capacity += sgc->capacity; +		}  	} else  {  		/*  		 * !SD_OVERLAP domains can assume that child groups @@ -4477,12 +5760,14 @@ void update_group_power(struct sched_domain *sd, int cpu)  		group = child->groups;  		do { -			power += group->sgp->power; +			capacity_orig += group->sgc->capacity_orig; +			capacity += group->sgc->capacity;  			group = group->next;  		} while (group != child->groups);  	} -	sdg->sgp->power_orig = sdg->sgp->power = power; +	sdg->sgc->capacity_orig = capacity_orig; +	sdg->sgc->capacity = capacity;  }  /* @@ -4496,15 +5781,15 @@ static inline int  fix_small_capacity(struct sched_domain *sd, struct sched_group *group)  {  	/* -	 * Only siblings can have significantly less than SCHED_POWER_SCALE +	 * Only siblings can have significantly less than SCHED_CAPACITY_SCALE  	 */ -	if (!(sd->flags & SD_SHARE_CPUPOWER)) +	if (!(sd->flags & SD_SHARE_CPUCAPACITY))  		return 0;  	/* -	 * If ~90% of the cpu_power is still there, we're good. +	 * If ~90% of the cpu_capacity is still there, we're good.  	 */ -	if (group->sgp->power * 32 > group->sgp->power_orig * 29) +	if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)  		return 1;  	return 0; @@ -4526,13 +5811,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)   * cpu 3 and leave one of the cpus in the second group unused.   *   * The current solution to this issue is detecting the skew in the first group - * by noticing it has a cpu that is overloaded while the remaining cpus are - * idle -- or rather, there's a distinct imbalance in the cpus; see - * sg_imbalanced(). + * by noticing the lower domain failed to reach balance and had difficulty + * moving tasks due to affinity constraints.   *   * When this is so detected; this group becomes a candidate for busiest; see - * update_sd_pick_busiest(). And calculcate_imbalance() and - * find_busiest_group() avoid some of the usual balance conditional to allow it + * update_sd_pick_busiest(). And calculate_imbalance() and + * find_busiest_group() avoid some of the usual balance conditions to allow it   * to create an effective group imbalance.   *   * This is a somewhat tricky proposition since the next run might not find the @@ -4540,49 +5824,37 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)   * subtle and fragile situation.   */ -struct sg_imb_stats { -	unsigned long max_nr_running, min_nr_running; -	unsigned long max_cpu_load, min_cpu_load; -}; - -static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) +static inline int sg_imbalanced(struct sched_group *group)  { -	sgi->max_cpu_load = sgi->max_nr_running = 0UL; -	sgi->min_cpu_load = sgi->min_nr_running = ~0UL; +	return group->sgc->imbalance;  } -static inline void -update_sg_imb_stats(struct sg_imb_stats *sgi, -		    unsigned long load, unsigned long nr_running) +/* + * Compute the group capacity factor. + * + * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by + * first dividing out the smt factor and computing the actual number of cores + * and limit unit capacity with that. 
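To make the "phantom core" problem concrete before the sg_capacity_factor() body that follows, here is a standalone calculation. The per-thread capacity of ~589 (an SMT gain spread across two siblings) and the 4-core, 8-thread group are illustrative assumptions, not values from this patch.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE		1024U
#define DIV_ROUND_UP(n, d)		(((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d)		(((n) + (d) / 2) / (d))

int main(void)
{
	unsigned int cpus = 8, capacity_orig = 4712, capacity = 4712;
	unsigned int smt, factor;

	/* naive rounding invents a fifth, phantom core out of four real ones */
	printf("naive factor = %u\n",
	       DIV_ROUND_CLOSEST(capacity_orig, SCHED_CAPACITY_SCALE));	/* 5 */

	/* divide out the SMT factor first, then count real cores */
	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
	factor = cpus / smt;
	if (factor > DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE))
		factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
	printf("capacity factor = %u\n", factor);			/* 4 */
	return 0;
}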
+ */ +static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)  { -	if (load > sgi->max_cpu_load) -		sgi->max_cpu_load = load; -	if (sgi->min_cpu_load > load) -		sgi->min_cpu_load = load; +	unsigned int capacity_factor, smt, cpus; +	unsigned int capacity, capacity_orig; -	if (nr_running > sgi->max_nr_running) -		sgi->max_nr_running = nr_running; -	if (sgi->min_nr_running > nr_running) -		sgi->min_nr_running = nr_running; -} +	capacity = group->sgc->capacity; +	capacity_orig = group->sgc->capacity_orig; +	cpus = group->group_weight; -static inline int -sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) -{ -	/* -	 * Consider the group unbalanced when the imbalance is larger -	 * than the average weight of a task. -	 * -	 * APZ: with cgroup the avg task weight can vary wildly and -	 *      might not be a suitable number - should we keep a -	 *      normalized nr_running number somewhere that negates -	 *      the hierarchy? -	 */ -	if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && -	    (sgi->max_nr_running - sgi->min_nr_running) > 1) -		return 1; +	/* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ +	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); +	capacity_factor = cpus / smt; /* cores */ -	return 0; +	capacity_factor = min_t(unsigned, +		capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); +	if (!capacity_factor) +		capacity_factor = fix_small_capacity(env->sd, group); + +	return capacity_factor;  }  /** @@ -4597,56 +5869,45 @@ static inline void update_sg_lb_stats(struct lb_env *env,  			struct sched_group *group, int load_idx,  			int local_group, struct sg_lb_stats *sgs)  { -	struct sg_imb_stats sgi; -	unsigned long nr_running;  	unsigned long load;  	int i; -	init_sg_imb_stats(&sgi); +	memset(sgs, 0, sizeof(*sgs));  	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {  		struct rq *rq = cpu_rq(i); -		nr_running = rq->nr_running; -  		/* Bias balancing toward cpus of our domain */ -		if (local_group) { +		if (local_group)  			load = target_load(i, load_idx); -		} else { +		else  			load = source_load(i, load_idx); -			update_sg_imb_stats(&sgi, load, nr_running); -		}  		sgs->group_load += load; -		sgs->sum_nr_running += nr_running; +		sgs->sum_nr_running += rq->nr_running; +#ifdef CONFIG_NUMA_BALANCING +		sgs->nr_numa_running += rq->nr_numa_running; +		sgs->nr_preferred_running += rq->nr_preferred_running; +#endif  		sgs->sum_weighted_load += weighted_cpuload(i);  		if (idle_cpu(i))  			sgs->idle_cpus++;  	} -	if (local_group && (env->idle != CPU_NEWLY_IDLE || -			time_after_eq(jiffies, group->sgp->next_update))) -		update_group_power(env->sd, env->dst_cpu); - -	/* Adjust by relative CPU power of the group */ -	sgs->group_power = group->sgp->power; -	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; +	/* Adjust by relative CPU capacity of the group */ +	sgs->group_capacity = group->sgc->capacity; +	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;  	if (sgs->sum_nr_running)  		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; -	sgs->group_imb = sg_imbalanced(sgs, &sgi); - -	sgs->group_capacity = -		DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); - -	if (!sgs->group_capacity) -		sgs->group_capacity = fix_small_capacity(env->sd, group); -  	sgs->group_weight = group->group_weight; -	if (sgs->group_capacity > sgs->sum_nr_running) -		sgs->group_has_capacity = 1; +	sgs->group_imb = sg_imbalanced(group); +	
sgs->group_capacity_factor = sg_capacity_factor(env, group); + +	if (sgs->group_capacity_factor > sgs->sum_nr_running) +		sgs->group_has_free_capacity = 1;  }  /** @@ -4670,7 +5931,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,  	if (sgs->avg_load <= sds->busiest_stat.avg_load)  		return false; -	if (sgs->sum_nr_running > sgs->group_capacity) +	if (sgs->sum_nr_running > sgs->group_capacity_factor)  		return true;  	if (sgs->group_imb) @@ -4693,14 +5954,42 @@ static bool update_sd_pick_busiest(struct lb_env *env,  	return false;  } +#ifdef CONFIG_NUMA_BALANCING +static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) +{ +	if (sgs->sum_nr_running > sgs->nr_numa_running) +		return regular; +	if (sgs->sum_nr_running > sgs->nr_preferred_running) +		return remote; +	return all; +} + +static inline enum fbq_type fbq_classify_rq(struct rq *rq) +{ +	if (rq->nr_running > rq->nr_numa_running) +		return regular; +	if (rq->nr_running > rq->nr_preferred_running) +		return remote; +	return all; +} +#else +static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) +{ +	return all; +} + +static inline enum fbq_type fbq_classify_rq(struct rq *rq) +{ +	return regular; +} +#endif /* CONFIG_NUMA_BALANCING */ +  /**   * update_sd_lb_stats - Update sched_domain's statistics for load balancing.   * @env: The load balancing environment. - * @balance: Should we balance.   * @sds: variable to hold the statistics for this sched_domain.   */ -static inline void update_sd_lb_stats(struct lb_env *env, -					struct sd_lb_stats *sds) +static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)  {  	struct sched_domain *child = env->sd->child;  	struct sched_group *sg = env->sd->groups; @@ -4720,36 +6009,46 @@ static inline void update_sd_lb_stats(struct lb_env *env,  		if (local_group) {  			sds->local = sg;  			sgs = &sds->local_stat; + +			if (env->idle != CPU_NEWLY_IDLE || +			    time_after_eq(jiffies, sg->sgc->next_update)) +				update_group_capacity(env->sd, env->dst_cpu);  		} -		memset(sgs, 0, sizeof(*sgs));  		update_sg_lb_stats(env, sg, load_idx, local_group, sgs); +		if (local_group) +			goto next_group; +  		/*  		 * In case the child domain prefers tasks go to siblings -		 * first, lower the sg capacity to one so that we'll try +		 * first, lower the sg capacity factor to one so that we'll try  		 * and move all the excess tasks away. We lower the capacity  		 * of a group only if the local group has the capacity to fit -		 * these excess tasks, i.e. nr_running < group_capacity. The +		 * these excess tasks, i.e. nr_running < group_capacity_factor. The  		 * extra check prevents the case where you always pull from the  		 * heaviest group when it is already under-utilized (possible  		 * with a large weight task outweighs the tasks on the system).  		 
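Stepping back to the fbq_type classification introduced earlier in this hunk, a standalone example of how nr_running, nr_numa_running and nr_preferred_running map to regular/remote/all: "regular" still has non-NUMA tasks to move, "remote" only has NUMA tasks on the wrong node, and "all" has nothing but well-placed NUMA tasks.

#include <stdio.h>

enum fbq_type { regular, remote, all };

static enum fbq_type classify(unsigned int nr_running,
			      unsigned int nr_numa_running,
			      unsigned int nr_preferred_running)
{
	if (nr_running > nr_numa_running)
		return regular;
	if (nr_running > nr_preferred_running)
		return remote;
	return all;
}

int main(void)
{
	static const char *name[] = { "regular", "remote", "all" };

	printf("%s\n", name[classify(4, 2, 2)]);	/* regular */
	printf("%s\n", name[classify(3, 3, 1)]);	/* remote  */
	printf("%s\n", name[classify(2, 2, 2)]);	/* all     */
	return 0;
}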
*/ -		if (prefer_sibling && !local_group && -				sds->local && sds->local_stat.group_has_capacity) -			sgs->group_capacity = min(sgs->group_capacity, 1U); - -		/* Now, start updating sd_lb_stats */ -		sds->total_load += sgs->group_load; -		sds->total_pwr += sgs->group_power; +		if (prefer_sibling && sds->local && +		    sds->local_stat.group_has_free_capacity) +			sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); -		if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { +		if (update_sd_pick_busiest(env, sds, sg, sgs)) {  			sds->busiest = sg;  			sds->busiest_stat = *sgs;  		} +next_group: +		/* Now, start updating sd_lb_stats */ +		sds->total_load += sgs->group_load; +		sds->total_capacity += sgs->group_capacity; +  		sg = sg->next;  	} while (sg != env->sd->groups); + +	if (env->sd->flags & SD_NUMA) +		env->fbq_type = fbq_classify_group(&sds->busiest_stat);  }  /** @@ -4790,8 +6089,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)  		return 0;  	env->imbalance = DIV_ROUND_CLOSEST( -		sds->busiest_stat.avg_load * sds->busiest_stat.group_power, -		SCHED_POWER_SCALE); +		sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, +		SCHED_CAPACITY_SCALE);  	return 1;  } @@ -4806,7 +6105,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)  static inline  void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)  { -	unsigned long tmp, pwr_now = 0, pwr_move = 0; +	unsigned long tmp, capa_now = 0, capa_move = 0;  	unsigned int imbn = 2;  	unsigned long scaled_busy_load_per_task;  	struct sg_lb_stats *local, *busiest; @@ -4820,8 +6119,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)  		imbn = 1;  	scaled_busy_load_per_task = -		(busiest->load_per_task * SCHED_POWER_SCALE) / -		busiest->group_power; +		(busiest->load_per_task * SCHED_CAPACITY_SCALE) / +		busiest->group_capacity;  	if (busiest->avg_load + scaled_busy_load_per_task >=  	    local->avg_load + (scaled_busy_load_per_task * imbn)) { @@ -4831,40 +6130,38 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)  	/*  	 * OK, we don't have enough imbalance to justify moving tasks, -	 * however we may be able to increase total CPU power used by +	 * however we may be able to increase total CPU capacity used by  	 * moving them.  	 
*/ -	pwr_now += busiest->group_power * +	capa_now += busiest->group_capacity *  			min(busiest->load_per_task, busiest->avg_load); -	pwr_now += local->group_power * +	capa_now += local->group_capacity *  			min(local->load_per_task, local->avg_load); -	pwr_now /= SCHED_POWER_SCALE; +	capa_now /= SCHED_CAPACITY_SCALE;  	/* Amount of load we'd subtract */ -	tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / -		busiest->group_power; -	if (busiest->avg_load > tmp) { -		pwr_move += busiest->group_power * +	if (busiest->avg_load > scaled_busy_load_per_task) { +		capa_move += busiest->group_capacity *  			    min(busiest->load_per_task, -				busiest->avg_load - tmp); +				busiest->avg_load - scaled_busy_load_per_task);  	}  	/* Amount of load we'd add */ -	if (busiest->avg_load * busiest->group_power < -	    busiest->load_per_task * SCHED_POWER_SCALE) { -		tmp = (busiest->avg_load * busiest->group_power) / -		      local->group_power; +	if (busiest->avg_load * busiest->group_capacity < +	    busiest->load_per_task * SCHED_CAPACITY_SCALE) { +		tmp = (busiest->avg_load * busiest->group_capacity) / +		      local->group_capacity;  	} else { -		tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / -		      local->group_power; +		tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) / +		      local->group_capacity;  	} -	pwr_move += local->group_power * +	capa_move += local->group_capacity *  		    min(local->load_per_task, local->avg_load + tmp); -	pwr_move /= SCHED_POWER_SCALE; +	capa_move /= SCHED_CAPACITY_SCALE;  	/* Move if we gain throughput */ -	if (pwr_move > pwr_now) +	if (capa_move > capa_now)  		env->imbalance = busiest->load_per_task;  } @@ -4894,7 +6191,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s  	/*  	 * In the presence of smp nice balancing, certain scenarios can have  	 * max load less than avg load(as we skip the groups at or below -	 * its cpu_power, while calculating max_load..) +	 * its cpu_capacity, while calculating max_load..)  	 */  	if (busiest->avg_load <= sds->avg_load ||  	    local->avg_load >= sds->avg_load) { @@ -4909,10 +6206,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s  		 * have to drop below capacity to reach cpu-load equilibrium.  		 
*/  		load_above_capacity = -			(busiest->sum_nr_running - busiest->group_capacity); +			(busiest->sum_nr_running - busiest->group_capacity_factor); -		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); -		load_above_capacity /= busiest->group_power; +		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); +		load_above_capacity /= busiest->group_capacity;  	}  	/* @@ -4927,9 +6224,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s  	/* How much load to actually move to equalise the imbalance */  	env->imbalance = min( -		max_pull * busiest->group_power, -		(sds->avg_load - local->avg_load) * local->group_power -	) / SCHED_POWER_SCALE; +		max_pull * busiest->group_capacity, +		(sds->avg_load - local->avg_load) * local->group_capacity +	) / SCHED_CAPACITY_SCALE;  	/*  	 * if *imbalance is less than the average load per runnable task @@ -4983,7 +6280,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)  	if (!sds.busiest || busiest->sum_nr_running == 0)  		goto out_balanced; -	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; +	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) +						/ sds.total_capacity;  	/*  	 * If the busiest group is imbalanced the below checks don't @@ -4994,8 +6292,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)  		goto force_balance;  	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ -	if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && -	    !busiest->group_has_capacity) +	if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && +	    !busiest->group_has_free_capacity)  		goto force_balance;  	/* @@ -5049,42 +6347,66 @@ static struct rq *find_busiest_queue(struct lb_env *env,  				     struct sched_group *group)  {  	struct rq *busiest = NULL, *rq; -	unsigned long busiest_load = 0, busiest_power = 1; +	unsigned long busiest_load = 0, busiest_capacity = 1;  	int i;  	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { -		unsigned long power = power_of(i); -		unsigned long capacity = DIV_ROUND_CLOSEST(power, -							   SCHED_POWER_SCALE); -		unsigned long wl; - -		if (!capacity) -			capacity = fix_small_capacity(env->sd, group); +		unsigned long capacity, capacity_factor, wl; +		enum fbq_type rt;  		rq = cpu_rq(i); +		rt = fbq_classify_rq(rq); + +		/* +		 * We classify groups/runqueues into three groups: +		 *  - regular: there are !numa tasks +		 *  - remote:  there are numa tasks that run on the 'wrong' node +		 *  - all:     there is no distinction +		 * +		 * In order to avoid migrating ideally placed numa tasks, +		 * ignore those when there's better options. +		 * +		 * If we ignore the actual busiest queue to migrate another +		 * task, the next balance pass can still reduce the busiest +		 * queue by moving tasks around inside the node. +		 * +		 * If we cannot move enough load due to this classification +		 * the next pass will adjust the group classification and +		 * allow migration of more tasks. +		 * +		 * Both cases only affect the total convergence complexity. +		 */ +		if (rt > env->fbq_type) +			continue; + +		capacity = capacity_of(i); +		capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); +		if (!capacity_factor) +			capacity_factor = fix_small_capacity(env->sd, group); +  		wl = weighted_cpuload(i);  		/*  		 * When comparing with imbalance, use weighted_cpuload() -		 * which is not scaled with the cpu power. +		 * which is not scaled with the cpu capacity.  		 
*/ -		if (capacity && rq->nr_running == 1 && wl > env->imbalance) +		if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)  			continue;  		/*  		 * For the load comparisons with the other cpu's, consider -		 * the weighted_cpuload() scaled with the cpu power, so that -		 * the load can be moved away from the cpu that is potentially -		 * running at a lower capacity. +		 * the weighted_cpuload() scaled with the cpu capacity, so +		 * that the load can be moved away from the cpu that is +		 * potentially running at a lower capacity.  		 * -		 * Thus we're looking for max(wl_i / power_i), crosswise +		 * Thus we're looking for max(wl_i / capacity_i), crosswise  		 * multiplication to rid ourselves of the division works out -		 * to: wl_i * power_j > wl_j * power_i;  where j is our -		 * previous maximum. +		 * to: wl_i * capacity_j > wl_j * capacity_i;  where j is +		 * our previous maximum.  		 */ -		if (wl * busiest_power > busiest_load * power) { +		if (wl * busiest_capacity > busiest_load * capacity) {  			busiest_load = wl; -			busiest_power = power; +			busiest_capacity = capacity;  			busiest = rq;  		}  	} @@ -5164,6 +6486,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,  			int *continue_balancing)  {  	int ld_moved, cur_ld_moved, active_balance = 0; +	struct sched_domain *sd_parent = sd->parent;  	struct sched_group *group;  	struct rq *busiest;  	unsigned long flags; @@ -5177,6 +6500,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,  		.idle		= idle,  		.loop_break	= sched_nr_migrate_break,  		.cpus		= cpus, +		.fbq_type	= all,  	};  	/* @@ -5268,17 +6592,17 @@ more_balance:  		 * moreover subsequent load balance cycles should correct the  		 * excess load moved.  		 */ -		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { +		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { + +			/* Prevent to re-select dst_cpu via env's cpus */ +			cpumask_clear_cpu(env.dst_cpu, env.cpus);  			env.dst_rq	 = cpu_rq(env.new_dst_cpu);  			env.dst_cpu	 = env.new_dst_cpu; -			env.flags	&= ~LBF_SOME_PINNED; +			env.flags	&= ~LBF_DST_PINNED;  			env.loop	 = 0;  			env.loop_break	 = sched_nr_migrate_break; -			/* Prevent to re-select dst_cpu via env's cpus */ -			cpumask_clear_cpu(env.dst_cpu, env.cpus); -  			/*  			 * Go back to "more_balance" rather than "redo" since we  			 * need to continue with same src_cpu. @@ -5286,6 +6610,18 @@ more_balance:  			goto more_balance;  		} +		/* +		 * We failed to reach balance because of affinity. 
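A numeric check of the crosswise comparison used by find_busiest_queue() above: wl_i * capacity_j > wl_j * capacity_i picks the runqueue with the highest load per unit of capacity without doing a division. The two CPUs and their loads are made up.

#include <stdio.h>

int main(void)
{
	/* cpu A: load 3072 on capacity 1024; cpu B: load 2048 on capacity 512 */
	unsigned long wl_a = 3072, cap_a = 1024;
	unsigned long wl_b = 2048, cap_b = 512;

	unsigned long busiest_load = 0, busiest_capacity = 1;
	const char *busiest = "none";

	if (wl_a * busiest_capacity > busiest_load * cap_a) {
		busiest_load = wl_a; busiest_capacity = cap_a; busiest = "A";
	}
	if (wl_b * busiest_capacity > busiest_load * cap_b) {
		busiest_load = wl_b; busiest_capacity = cap_b; busiest = "B";
	}
	/* B wins: 2048/512 = 4 > 3072/1024 = 3 */
	printf("busiest = %s\n", busiest);
	return 0;
}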
+		 */ +		if (sd_parent) { +			int *group_imbalance = &sd_parent->groups->sgc->imbalance; + +			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { +				*group_imbalance = 1; +			} else if (*group_imbalance) +				*group_imbalance = 0; +		} +  		/* All tasks on this runqueue were pinned by CPU affinity */  		if (unlikely(env.flags & LBF_ALL_PINNED)) {  			cpumask_clear_cpu(cpu_of(busiest), cpus); @@ -5384,20 +6720,62 @@ out:  	return ld_moved;  } +static inline unsigned long +get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) +{ +	unsigned long interval = sd->balance_interval; + +	if (cpu_busy) +		interval *= sd->busy_factor; + +	/* scale ms to jiffies */ +	interval = msecs_to_jiffies(interval); +	interval = clamp(interval, 1UL, max_load_balance_interval); + +	return interval; +} + +static inline void +update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) +{ +	unsigned long interval, next; + +	interval = get_sd_balance_interval(sd, cpu_busy); +	next = sd->last_balance + interval; + +	if (time_after(*next_balance, next)) +		*next_balance = next; +} +  /*   * idle_balance is called by schedule() if this_cpu is about to become   * idle. Attempts to pull tasks from other CPUs.   */ -void idle_balance(int this_cpu, struct rq *this_rq) +static int idle_balance(struct rq *this_rq)  { +	unsigned long next_balance = jiffies + HZ; +	int this_cpu = this_rq->cpu;  	struct sched_domain *sd;  	int pulled_task = 0; -	unsigned long next_balance = jiffies + HZ; +	u64 curr_cost = 0; + +	idle_enter_fair(this_rq); +	/* +	 * We must set idle_stamp _before_ calling idle_balance(), such that we +	 * measure the duration of idle_balance() as idle time. +	 */  	this_rq->idle_stamp = rq_clock(this_rq); -	if (this_rq->avg_idle < sysctl_sched_migration_cost) -		return; +	if (this_rq->avg_idle < sysctl_sched_migration_cost) { +		rcu_read_lock(); +		sd = rcu_dereference_check_sched_domain(this_rq->sd); +		if (sd) +			update_next_balance(sd, 0, &next_balance); +		rcu_read_unlock(); + +		goto out; +	}  	/*  	 * Drop the rq->lock, but keep IRQ/preempt disabled. @@ -5407,38 +6785,70 @@ void idle_balance(int this_cpu, struct rq *this_rq)  	update_blocked_averages(this_cpu);  	rcu_read_lock();  	for_each_domain(this_cpu, sd) { -		unsigned long interval;  		int continue_balancing = 1; +		u64 t0, domain_cost;  		if (!(sd->flags & SD_LOAD_BALANCE))  			continue; +		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { +			update_next_balance(sd, 0, &next_balance); +			break; +		} +  		if (sd->flags & SD_BALANCE_NEWIDLE) { -			/* If we've pulled tasks over stop searching: */ +			t0 = sched_clock_cpu(this_cpu); +  			pulled_task = load_balance(this_cpu, this_rq,  						   sd, CPU_NEWLY_IDLE,  						   &continue_balancing); + +			domain_cost = sched_clock_cpu(this_cpu) - t0; +			if (domain_cost > sd->max_newidle_lb_cost) +				sd->max_newidle_lb_cost = domain_cost; + +			curr_cost += domain_cost;  		} -		interval = msecs_to_jiffies(sd->balance_interval); -		if (time_after(next_balance, sd->last_balance + interval)) -			next_balance = sd->last_balance + interval; -		if (pulled_task) { -			this_rq->idle_stamp = 0; +		update_next_balance(sd, 0, &next_balance); + +		/* +		 * Stop searching for tasks to pull if there are +		 * now runnable tasks on this rq. 
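
get_sd_balance_interval(), factored out above, is what turns a domain's balance_interval (kept in milliseconds) into a clamped number of jiffies, stretching it by busy_factor when the CPU is busy. A rough standalone model of the helper; HZ, the clamp ceiling and the rounding in the ms-to-jiffies conversion are assumptions of the sketch, not values taken from the kernel:

#include <stdio.h>

#define HZ		250UL			/* example tick rate */
#define MAX_LB_INTERVAL	(HZ / 10 * 10)		/* example ceiling, in jiffies */

static unsigned long sketch_get_interval(unsigned long interval_ms,
					 unsigned int busy_factor, int cpu_busy)
{
	unsigned long interval = interval_ms;

	if (cpu_busy)
		interval *= busy_factor;	/* balance less often while busy */

	/* scale ms to jiffies, then clamp to [1, MAX_LB_INTERVAL] */
	interval = (interval * HZ + 999) / 1000;
	if (interval < 1)
		interval = 1;
	if (interval > MAX_LB_INTERVAL)
		interval = MAX_LB_INTERVAL;

	return interval;
}

int main(void)
{
	printf("newidle: %lu jiffies, busy: %lu jiffies\n",
	       sketch_get_interval(8, 32, 0), sketch_get_interval(8, 32, 1));
	return 0;
}
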
+		 */ +		if (pulled_task || this_rq->nr_running > 0)  			break; -		}  	}  	rcu_read_unlock();  	raw_spin_lock(&this_rq->lock); -	if (pulled_task || time_after(jiffies, this_rq->next_balance)) { -		/* -		 * We are going idle. next_balance may be set based on -		 * a busy processor. So reset next_balance. -		 */ +	if (curr_cost > this_rq->max_idle_balance_cost) +		this_rq->max_idle_balance_cost = curr_cost; + +	/* +	 * While browsing the domains, we released the rq lock, a task could +	 * have been enqueued in the meantime. Since we're not going idle, +	 * pretend we pulled a task. +	 */ +	if (this_rq->cfs.h_nr_running && !pulled_task) +		pulled_task = 1; + +out: +	/* Move the next balance forward */ +	if (time_after(this_rq->next_balance, next_balance))  		this_rq->next_balance = next_balance; + +	/* Is there a task of a high priority class? */ +	if (this_rq->nr_running != this_rq->cfs.h_nr_running) +		pulled_task = -1; + +	if (pulled_task) { +		idle_exit_fair(this_rq); +		this_rq->idle_stamp = 0;  	} + +	return pulled_task;  }  /* @@ -5509,6 +6919,11 @@ out_unlock:  	return 0;  } +static inline int on_null_domain(struct rq *rq) +{ +	return unlikely(!rcu_dereference_sched(rq->sd)); +} +  #ifdef CONFIG_NO_HZ_COMMON  /*   * idle load balancing details @@ -5522,7 +6937,7 @@ static struct {  	unsigned long next_balance;     /* in jiffy units */  } nohz ____cacheline_aligned; -static inline int find_new_ilb(int call_cpu) +static inline int find_new_ilb(void)  {  	int ilb = cpumask_first(nohz.idle_cpus_mask); @@ -5537,13 +6952,13 @@ static inline int find_new_ilb(int call_cpu)   * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle   * CPU (if there is one).   */ -static void nohz_balancer_kick(int cpu) +static void nohz_balancer_kick(void)  {  	int ilb_cpu;  	nohz.next_balance++; -	ilb_cpu = find_new_ilb(cpu); +	ilb_cpu = find_new_ilb();  	if (ilb_cpu >= nr_cpu_ids)  		return; @@ -5563,8 +6978,13 @@ static void nohz_balancer_kick(int cpu)  static inline void nohz_balance_exit_idle(int cpu)  {  	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { -		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); -		atomic_dec(&nohz.nr_cpus); +		/* +		 * Completely isolated CPUs don't ever set, so we must test. 
+		 */ +		if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { +			cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); +			atomic_dec(&nohz.nr_cpus); +		}  		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));  	}  } @@ -5572,16 +6992,16 @@ static inline void nohz_balance_exit_idle(int cpu)  static inline void set_cpu_sd_state_busy(void)  {  	struct sched_domain *sd; +	int cpu = smp_processor_id();  	rcu_read_lock(); -	sd = rcu_dereference_check_sched_domain(this_rq()->sd); +	sd = rcu_dereference(per_cpu(sd_busy, cpu));  	if (!sd || !sd->nohz_idle)  		goto unlock;  	sd->nohz_idle = 0; -	for (; sd; sd = sd->parent) -		atomic_inc(&sd->groups->sgp->nr_busy_cpus); +	atomic_inc(&sd->groups->sgc->nr_busy_cpus);  unlock:  	rcu_read_unlock();  } @@ -5589,16 +7009,16 @@ unlock:  void set_cpu_sd_state_idle(void)  {  	struct sched_domain *sd; +	int cpu = smp_processor_id();  	rcu_read_lock(); -	sd = rcu_dereference_check_sched_domain(this_rq()->sd); +	sd = rcu_dereference(per_cpu(sd_busy, cpu));  	if (!sd || sd->nohz_idle)  		goto unlock;  	sd->nohz_idle = 1; -	for (; sd; sd = sd->parent) -		atomic_dec(&sd->groups->sgp->nr_busy_cpus); +	atomic_dec(&sd->groups->sgc->nr_busy_cpus);  unlock:  	rcu_read_unlock();  } @@ -5618,6 +7038,12 @@ void nohz_balance_enter_idle(int cpu)  	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))  		return; +	/* +	 * If we're a completely isolated CPU, we don't play. +	 */ +	if (on_null_domain(cpu_rq(cpu))) +		return; +  	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);  	atomic_inc(&nohz.nr_cpus);  	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); @@ -5653,34 +7079,51 @@ void update_max_interval(void)   *   * Balancing parameters are set up in init_sched_domains.   */ -static void rebalance_domains(int cpu, enum cpu_idle_type idle) +static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)  {  	int continue_balancing = 1; -	struct rq *rq = cpu_rq(cpu); +	int cpu = rq->cpu;  	unsigned long interval;  	struct sched_domain *sd;  	/* Earliest time when we have to do rebalance again */  	unsigned long next_balance = jiffies + 60*HZ;  	int update_next_balance = 0; -	int need_serialize; +	int need_serialize, need_decay = 0; +	u64 max_cost = 0;  	update_blocked_averages(cpu);  	rcu_read_lock();  	for_each_domain(cpu, sd) { +		/* +		 * Decay the newidle max times here because this is a regular +		 * visit to all the domains. Decay ~1% per second. +		 */ +		if (time_after(jiffies, sd->next_decay_max_lb_cost)) { +			sd->max_newidle_lb_cost = +				(sd->max_newidle_lb_cost * 253) / 256; +			sd->next_decay_max_lb_cost = jiffies + HZ; +			need_decay = 1; +		} +		max_cost += sd->max_newidle_lb_cost; +  		if (!(sd->flags & SD_LOAD_BALANCE))  			continue; -		interval = sd->balance_interval; -		if (idle != CPU_IDLE) -			interval *= sd->busy_factor; +		/* +		 * Stop the load balance at this level. There is another +		 * CPU in our sched group which is doing load balancing more +		 * actively. 
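
The decay added to rebalance_domains() above multiplies each domain's max_newidle_lb_cost by 253/256 at most once per second, the "~1% per second" from the comment, so a stale newidle-balance cost roughly halves in about a minute. A quick standalone check of that geometric decay (the starting cost is arbitrary):

#include <stdio.h>

int main(void)
{
	unsigned long long cost = 1000000;	/* ns, an arbitrary stale maximum */
	int sec;

	/* one decay step per second, as in rebalance_domains() */
	for (sec = 1; sec <= 300; sec++) {
		cost = cost * 253 / 256;
		if (sec % 60 == 0)
			printf("after %3d s: %llu ns\n", sec, cost);
	}
	return 0;
}
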
+		 */ +		if (!continue_balancing) { +			if (need_decay) +				continue; +			break; +		} -		/* scale ms to jiffies */ -		interval = msecs_to_jiffies(interval); -		interval = clamp(interval, 1UL, max_load_balance_interval); +		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);  		need_serialize = sd->flags & SD_SERIALIZE; -  		if (need_serialize) {  			if (!spin_trylock(&balancing))  				goto out; @@ -5689,13 +7132,14 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)  		if (time_after_eq(jiffies, sd->last_balance + interval)) {  			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {  				/* -				 * The LBF_SOME_PINNED logic could have changed +				 * The LBF_DST_PINNED logic could have changed  				 * env->dst_cpu, so we can't know our idle  				 * state even if we migrated tasks. Update it.  				 */  				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;  			}  			sd->last_balance = jiffies; +			interval = get_sd_balance_interval(sd, idle != CPU_IDLE);  		}  		if (need_serialize)  			spin_unlock(&balancing); @@ -5704,14 +7148,14 @@ out:  			next_balance = sd->last_balance + interval;  			update_next_balance = 1;  		} - +	} +	if (need_decay) {  		/* -		 * Stop the load balance at this level. There is another -		 * CPU in our sched group which is doing load balancing more -		 * actively. +		 * Ensure the rq-wide value also decays but keep it at a +		 * reasonable floor to avoid funnies with rq->avg_idle.  		 */ -		if (!continue_balancing) -			break; +		rq->max_idle_balance_cost = +			max((u64)sysctl_sched_migration_cost, max_cost);  	}  	rcu_read_unlock(); @@ -5729,9 +7173,9 @@ out:   * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the   * rebalancing for all the cpus for whom scheduler ticks are stopped.   */ -static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) +static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)  { -	struct rq *this_rq = cpu_rq(this_cpu); +	int this_cpu = this_rq->cpu;  	struct rq *rq;  	int balance_cpu; @@ -5753,12 +7197,17 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)  		rq = cpu_rq(balance_cpu); -		raw_spin_lock_irq(&rq->lock); -		update_rq_clock(rq); -		update_idle_cpu_load(rq); -		raw_spin_unlock_irq(&rq->lock); - -		rebalance_domains(balance_cpu, CPU_IDLE); +		/* +		 * If time for next balance is due, +		 * do the balance. +		 */ +		if (time_after_eq(jiffies, rq->next_balance)) { +			raw_spin_lock_irq(&rq->lock); +			update_rq_clock(rq); +			update_idle_cpu_load(rq); +			raw_spin_unlock_irq(&rq->lock); +			rebalance_domains(rq, CPU_IDLE); +		}  		if (time_after(this_rq->next_balance, rq->next_balance))  			this_rq->next_balance = rq->next_balance; @@ -5773,16 +7222,18 @@ end:   * of an idle cpu is the system.   *   - This rq has more than one task.   *   - At any scheduler domain level, this cpu's scheduler group has multiple - *     busy cpu's exceeding the group's power. + *     busy cpu's exceeding the group's capacity.   *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler   *     domain span are idle.   
*/ -static inline int nohz_kick_needed(struct rq *rq, int cpu) +static inline int nohz_kick_needed(struct rq *rq)  {  	unsigned long now = jiffies;  	struct sched_domain *sd; +	struct sched_group_capacity *sgc; +	int nr_busy, cpu = rq->cpu; -	if (unlikely(idle_cpu(cpu))) +	if (unlikely(rq->idle_balance))  		return 0;         /* @@ -5806,22 +7257,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)  		goto need_kick;  	rcu_read_lock(); -	for_each_domain(cpu, sd) { -		struct sched_group *sg = sd->groups; -		struct sched_group_power *sgp = sg->sgp; -		int nr_busy = atomic_read(&sgp->nr_busy_cpus); +	sd = rcu_dereference(per_cpu(sd_busy, cpu)); -		if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) -			goto need_kick_unlock; +	if (sd) { +		sgc = sd->groups->sgc; +		nr_busy = atomic_read(&sgc->nr_busy_cpus); -		if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight -		    && (cpumask_first_and(nohz.idle_cpus_mask, -					  sched_domain_span(sd)) < cpu)) +		if (nr_busy > 1)  			goto need_kick_unlock; - -		if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) -			break;  	} + +	sd = rcu_dereference(per_cpu(sd_asym, cpu)); + +	if (sd && (cpumask_first_and(nohz.idle_cpus_mask, +				  sched_domain_span(sd)) < cpu)) +		goto need_kick_unlock; +  	rcu_read_unlock();  	return 0; @@ -5831,7 +7282,7 @@ need_kick:  	return 1;  }  #else -static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } +static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }  #endif  /* @@ -5840,38 +7291,34 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }   */  static void run_rebalance_domains(struct softirq_action *h)  { -	int this_cpu = smp_processor_id(); -	struct rq *this_rq = cpu_rq(this_cpu); +	struct rq *this_rq = this_rq();  	enum cpu_idle_type idle = this_rq->idle_balance ?  						CPU_IDLE : CPU_NOT_IDLE; -	rebalance_domains(this_cpu, idle); +	rebalance_domains(this_rq, idle);  	/*  	 * If this cpu has a pending nohz_balance_kick, then do the  	 * balancing on behalf of the other idle cpus whose ticks are  	 * stopped.  	 */ -	nohz_idle_balance(this_cpu, idle); -} - -static inline int on_null_domain(int cpu) -{ -	return !rcu_dereference_sched(cpu_rq(cpu)->sd); +	nohz_idle_balance(this_rq, idle);  }  /*   * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.   */ -void trigger_load_balance(struct rq *rq, int cpu) +void trigger_load_balance(struct rq *rq)  {  	/* Don't need to rebalance while attached to NULL domain */ -	if (time_after_eq(jiffies, rq->next_balance) && -	    likely(!on_null_domain(cpu))) +	if (unlikely(on_null_domain(rq))) +		return; + +	if (time_after_eq(jiffies, rq->next_balance))  		raise_softirq(SCHED_SOFTIRQ);  #ifdef CONFIG_NO_HZ_COMMON -	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) -		nohz_balancer_kick(cpu); +	if (nohz_kick_needed(rq)) +		nohz_balancer_kick();  #endif  } @@ -5987,15 +7434,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)  	struct cfs_rq *cfs_rq = cfs_rq_of(se);  	/* -	 * Ensure the task's vruntime is normalized, so that when its +	 * Ensure the task's vruntime is normalized, so that when it's  	 * switched back to the fair class the enqueue_entity(.flags=0) will  	 * do the right thing.  	 
* -	 * If it was on_rq, then the dequeue_entity(.flags=0) will already -	 * have normalized the vruntime, if it was !on_rq, then only when +	 * If it's on_rq, then the dequeue_entity(.flags=0) will already +	 * have normalized the vruntime, if it's !on_rq, then only when  	 * the task is sleeping will it still have non-normalized vruntime.  	 */ -	if (!se->on_rq && p->state != TASK_RUNNING) { +	if (!p->on_rq && p->state != TASK_RUNNING) {  		/*  		 * Fix up our vruntime so that the current sleep doesn't  		 * cause 'unlimited' sleep bonus. @@ -6022,7 +7469,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)   */  static void switched_to_fair(struct rq *rq, struct task_struct *p)  { -	if (!p->se.on_rq) +	struct sched_entity *se = &p->se; +#ifdef CONFIG_FAIR_GROUP_SCHED +	/* +	 * Since the real-depth could have been changed (only FAIR +	 * class maintain depth value), reset depth properly. +	 */ +	se->depth = se->parent ? se->parent->depth + 1 : 0; +#endif +	if (!se->on_rq)  		return;  	/* @@ -6070,7 +7525,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)  #ifdef CONFIG_FAIR_GROUP_SCHED  static void task_move_group_fair(struct task_struct *p, int on_rq)  { +	struct sched_entity *se = &p->se;  	struct cfs_rq *cfs_rq; +  	/*  	 * If the task was not on the rq at the time of this cgroup movement  	 * it must have been asleep, sleeping tasks keep their ->vruntime @@ -6096,23 +7553,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)  	 * To prevent boost or penalty in the new cfs_rq caused by delta  	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.  	 */ -	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) +	if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))  		on_rq = 1;  	if (!on_rq) -		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; +		se->vruntime -= cfs_rq_of(se)->min_vruntime;  	set_task_rq(p, task_cpu(p)); +	se->depth = se->parent ? se->parent->depth + 1 : 0;  	if (!on_rq) { -		cfs_rq = cfs_rq_of(&p->se); -		p->se.vruntime += cfs_rq->min_vruntime; +		cfs_rq = cfs_rq_of(se); +		se->vruntime += cfs_rq->min_vruntime;  #ifdef CONFIG_SMP  		/*  		 * migrate_task_rq_fair() will have removed our previous  		 * contribution, but we must synchronize for ongoing future  		 * decay.  		 
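
task_move_group_fair() in this hunk, like the normalization described in switched_from_fair()'s comment, relies on one convention: while a task is off a cfs_rq its vruntime is stored relative to the old queue's min_vruntime, and the new queue's min_vruntime is added back on placement, so the task keeps its relative position instead of inheriting a boost or penalty from the min_vruntime delta. A toy demonstration with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long old_min = 5000000, new_min = 9000000;	/* ns */
	unsigned long long vruntime = 5200000;	/* 200us ahead of old_min */

	vruntime -= old_min;	/* leaving the old cfs_rq: make it relative */
	vruntime += new_min;	/* placed on the new cfs_rq */

	/* the 200us distance from min_vruntime is preserved on the new queue */
	printf("lag on new queue: %llu ns\n", vruntime - new_min);
	return 0;
}
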
*/ -		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); -		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; +		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); +		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;  #endif  	}  } @@ -6208,13 +7666,17 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,  	if (!se)  		return; -	if (!parent) +	if (!parent) {  		se->cfs_rq = &rq->cfs; -	else +		se->depth = 0; +	} else {  		se->cfs_rq = parent->my_q; +		se->depth = parent->depth + 1; +	}  	se->my_q = cfs_rq; -	update_load_set(&se->load, 0); +	/* guarantee group entities always have weight */ +	update_load_set(&se->load, NICE_0_LOAD);  	se->parent = parent;  } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 99399f8e479..90284d117fe 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -37,18 +37,18 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)  SCHED_FEAT(WAKEUP_PREEMPTION, true)  /* - * Use arch dependent cpu power functions + * Use arch dependent cpu capacity functions   */ -SCHED_FEAT(ARCH_POWER, true) +SCHED_FEAT(ARCH_CAPACITY, true)  SCHED_FEAT(HRTICK, false)  SCHED_FEAT(DOUBLE_TICK, false)  SCHED_FEAT(LB_BIAS, true)  /* - * Decrement CPU power based on time not spent running tasks + * Decrement CPU capacity based on time not spent running tasks   */ -SCHED_FEAT(NONTASK_POWER, true) +SCHED_FEAT(NONTASK_CAPACITY, true)  /*   * Queue remote wakeups on the target CPU and process them @@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false)  /*   * Apply the automatic NUMA scheduling policy. Enabled automatically   * at runtime if running on a NUMA machine. Can be controlled via - * numa_balancing=. Allow PTE scanning to be forced on UMA machines - * for debugging the core machinery. + * numa_balancing=   */  #ifdef CONFIG_NUMA_BALANCING  SCHED_FEAT(NUMA,	false) -SCHED_FEAT(NUMA_FORCE,	false) + +/* + * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a + * higher number of hinting faults are recorded during active load + * balancing. + */ +SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) + +/* + * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a + * lower number of hinting faults have been recorded. As this has + * the potential to prevent a task ever migrating to a new node + * due to CPU overload it is disabled by default. 
+ */ +SCHED_FEAT(NUMA_RESIST_LOWER, false)  #endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c new file mode 100644 index 00000000000..cf009fb0bc2 --- /dev/null +++ b/kernel/sched/idle.c @@ -0,0 +1,273 @@ +/* + * Generic entry point for the idle threads + */ +#include <linux/sched.h> +#include <linux/cpu.h> +#include <linux/cpuidle.h> +#include <linux/tick.h> +#include <linux/mm.h> +#include <linux/stackprotector.h> + +#include <asm/tlb.h> + +#include <trace/events/power.h> + +#include "sched.h" + +static int __read_mostly cpu_idle_force_poll; + +void cpu_idle_poll_ctrl(bool enable) +{ +	if (enable) { +		cpu_idle_force_poll++; +	} else { +		cpu_idle_force_poll--; +		WARN_ON_ONCE(cpu_idle_force_poll < 0); +	} +} + +#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP +static int __init cpu_idle_poll_setup(char *__unused) +{ +	cpu_idle_force_poll = 1; +	return 1; +} +__setup("nohlt", cpu_idle_poll_setup); + +static int __init cpu_idle_nopoll_setup(char *__unused) +{ +	cpu_idle_force_poll = 0; +	return 1; +} +__setup("hlt", cpu_idle_nopoll_setup); +#endif + +static inline int cpu_idle_poll(void) +{ +	rcu_idle_enter(); +	trace_cpu_idle_rcuidle(0, smp_processor_id()); +	local_irq_enable(); +	while (!tif_need_resched()) +		cpu_relax(); +	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); +	rcu_idle_exit(); +	return 1; +} + +/* Weak implementations for optional arch specific functions */ +void __weak arch_cpu_idle_prepare(void) { } +void __weak arch_cpu_idle_enter(void) { } +void __weak arch_cpu_idle_exit(void) { } +void __weak arch_cpu_idle_dead(void) { } +void __weak arch_cpu_idle(void) +{ +	cpu_idle_force_poll = 1; +	local_irq_enable(); +} + +/** + * cpuidle_idle_call - the main idle function + * + * NOTE: no locks or semaphores should be used here + * + * On archs that support TIF_POLLING_NRFLAG, is called with polling + * set, and it returns with polling set.  If it ever stops polling, it + * must clear the polling bit. + */ +static void cpuidle_idle_call(void) +{ +	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); +	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); +	int next_state, entered_state; +	bool broadcast; + +	/* +	 * Check if the idle task must be rescheduled. If it is the +	 * case, exit the function after re-enabling the local irq. +	 */ +	if (need_resched()) { +		local_irq_enable(); +		return; +	} + +	/* +	 * During the idle period, stop measuring the disabled irqs +	 * critical sections latencies +	 */ +	stop_critical_timings(); + +	/* +	 * Tell the RCU framework we are entering an idle section, +	 * so no more rcu read side critical sections and one more +	 * step to the grace period +	 */ +	rcu_idle_enter(); + +	/* +	 * Ask the cpuidle framework to choose a convenient idle state. +	 * Fall back to the default arch idle method on errors. +	 */ +	next_state = cpuidle_select(drv, dev); +	if (next_state < 0) { +use_default: +		/* +		 * We can't use the cpuidle framework, let's use the default +		 * idle routine. 
+		 */ +		if (current_clr_polling_and_test()) +			local_irq_enable(); +		else +			arch_cpu_idle(); + +		goto exit_idle; +	} + + +	/* +	 * The idle task must be scheduled, it is pointless to +	 * go to idle, just update no idle residency and get +	 * out of this function +	 */ +	if (current_clr_polling_and_test()) { +		dev->last_residency = 0; +		entered_state = next_state; +		local_irq_enable(); +		goto exit_idle; +	} + +	broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); + +	/* +	 * Tell the time framework to switch to a broadcast timer +	 * because our local timer will be shutdown. If a local timer +	 * is used from another cpu as a broadcast timer, this call may +	 * fail if it is not available +	 */ +	if (broadcast && +	    clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) +		goto use_default; + +	trace_cpu_idle_rcuidle(next_state, dev->cpu); + +	/* +	 * Enter the idle state previously returned by the governor decision. +	 * This function will block until an interrupt occurs and will take +	 * care of re-enabling the local interrupts +	 */ +	entered_state = cpuidle_enter(drv, dev, next_state); + +	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); + +	if (broadcast) +		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); + +	/* +	 * Give the governor an opportunity to reflect on the outcome +	 */ +	cpuidle_reflect(dev, entered_state); + +exit_idle: +	__current_set_polling(); + +	/* +	 * It is up to the idle functions to reenable local interrupts +	 */ +	if (WARN_ON_ONCE(irqs_disabled())) +		local_irq_enable(); + +	rcu_idle_exit(); +	start_critical_timings(); +} + +/* + * Generic idle loop implementation + * + * Called with polling cleared. + */ +static void cpu_idle_loop(void) +{ +	while (1) { +		/* +		 * If the arch has a polling bit, we maintain an invariant: +		 * +		 * Our polling bit is clear if we're not scheduled (i.e. if +		 * rq->curr != rq->idle).  This means that, if rq->idle has +		 * the polling bit set, then setting need_resched is +		 * guaranteed to cause the cpu to reschedule. +		 */ + +		__current_set_polling(); +		tick_nohz_idle_enter(); + +		while (!need_resched()) { +			check_pgt_cache(); +			rmb(); + +			if (cpu_is_offline(smp_processor_id())) +				arch_cpu_idle_dead(); + +			local_irq_disable(); +			arch_cpu_idle_enter(); + +			/* +			 * In poll mode we reenable interrupts and spin. +			 * +			 * Also if we detected in the wakeup from idle +			 * path that the tick broadcast device expired +			 * for us, we don't want to go deep idle as we +			 * know that the IPI is going to arrive right +			 * away +			 */ +			if (cpu_idle_force_poll || tick_check_broadcast_expired()) +				cpu_idle_poll(); +			else +				cpuidle_idle_call(); + +			arch_cpu_idle_exit(); +		} + +		/* +		 * Since we fell out of the loop above, we know +		 * TIF_NEED_RESCHED must be set, propagate it into +		 * PREEMPT_NEED_RESCHED. +		 * +		 * This is required because for polling idle loops we will +		 * not have had an IPI to fold the state for us. +		 */ +		preempt_set_need_resched(); +		tick_nohz_idle_exit(); +		__current_clr_polling(); + +		/* +		 * We promise to call sched_ttwu_pending and reschedule +		 * if need_resched is set while polling is set.  That +		 * means that clearing polling needs to be visible +		 * before doing these things. 
+		 */ +		smp_mb__after_atomic(); + +		sched_ttwu_pending(); +		schedule_preempt_disabled(); +	} +} + +void cpu_startup_entry(enum cpuhp_state state) +{ +	/* +	 * This #ifdef needs to die, but it's too late in the cycle to +	 * make this generic (arm and sh have never invoked the canary +	 * init for the non boot cpus!). Will be fixed in 3.11 +	 */ +#ifdef CONFIG_X86 +	/* +	 * If we're the non-boot CPU, nothing set the stack canary up +	 * for us. The boot CPU already has it initialized but no harm +	 * in doing it again. This is a good place for updating it, as +	 * we wont ever return from this function (so the invalid +	 * canaries already on the stack wont ever trigger). +	 */ +	boot_init_stack_canary(); +#endif +	arch_cpu_idle_prepare(); +	cpu_idle_loop(); +} diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index d8da01008d3..879f2b75266 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -9,22 +9,12 @@  #ifdef CONFIG_SMP  static int -select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)  {  	return task_cpu(p); /* IDLE tasks as never migrated */  } - -static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) -{ -	idle_exit_fair(rq); -	rq_last_tick_reset(rq); -} - -static void post_schedule_idle(struct rq *rq) -{ -	idle_enter_fair(rq); -}  #endif /* CONFIG_SMP */ +  /*   * Idle tasks are unconditionally rescheduled:   */ @@ -33,13 +23,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl  	resched_task(rq->idle);  } -static struct task_struct *pick_next_task_idle(struct rq *rq) +static struct task_struct * +pick_next_task_idle(struct rq *rq, struct task_struct *prev)  { +	put_prev_task(rq, prev); +  	schedstat_inc(rq, sched_goidle); -#ifdef CONFIG_SMP -	/* Trigger the post schedule to do an idle_enter for CFS */ -	rq->post_schedule = 1; -#endif  	return rq->idle;  } @@ -58,6 +47,8 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)  static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)  { +	idle_exit_fair(rq); +	rq_last_tick_reset(rq);  }  static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) @@ -101,8 +92,6 @@ const struct sched_class idle_sched_class = {  #ifdef CONFIG_SMP  	.select_task_rq		= select_task_rq_idle, -	.pre_schedule		= pre_schedule_idle, -	.post_schedule		= post_schedule_idle,  #endif  	.set_curr_task          = set_curr_task_idle, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 01970c8e64d..a49083192c6 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)  	rt_rq->overloaded = 0;  	plist_head_init(&rt_rq->pushable_tasks);  #endif +	/* We start is dequeued state, because no RT tasks are queued */ +	rt_rq->rt_queued = 0;  	rt_rq->rt_time = 0;  	rt_rq->rt_throttled = 0; @@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)  	return rt_se->rt_rq;  } +static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) +{ +	struct rt_rq *rt_rq = rt_se->rt_rq; + +	return rt_rq->rq; +} +  void free_rt_sched_group(struct task_group *tg)  {  	int i; @@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)  	return container_of(rt_rq, struct rq, rt);  } -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)  {  	struct 
task_struct *p = rt_task_of(rt_se); -	struct rq *rq = task_rq(p); + +	return task_rq(p); +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ +	struct rq *rq = rq_of_rt_se(rt_se);  	return &rq->rt;  } @@ -229,6 +244,14 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)  #ifdef CONFIG_SMP +static int pull_rt_task(struct rq *this_rq); + +static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) +{ +	/* Try to pull RT tasks here if we lower this rq's prio */ +	return rq->rt.highest_prio.curr > prev->prio; +} +  static inline int rt_overloaded(struct rq *rq)  {  	return atomic_read(&rq->rd->rto_count); @@ -246,8 +269,10 @@ static inline void rt_set_overload(struct rq *rq)  	 * if we should look at the mask. It would be a shame  	 * if we looked at the mask, but the mask was not  	 * updated yet. +	 * +	 * Matched by the barrier in pull_rt_task().  	 */ -	wmb(); +	smp_wmb();  	atomic_inc(&rq->rd->rto_count);  } @@ -313,6 +338,15 @@ static inline int has_pushable_tasks(struct rq *rq)  	return !plist_head_empty(&rq->rt.pushable_tasks);  } +static inline void set_post_schedule(struct rq *rq) +{ +	/* +	 * We detect this state here so that we can avoid taking the RQ +	 * lock again later if there is no need to push +	 */ +	rq->post_schedule = has_pushable_tasks(rq); +} +  static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)  {  	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); @@ -357,8 +391,24 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)  {  } +static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) +{ +	return false; +} + +static inline int pull_rt_task(struct rq *this_rq) +{ +	return 0; +} + +static inline void set_post_schedule(struct rq *rq) +{ +}  #endif /* CONFIG_SMP */ +static void enqueue_top_rt_rq(struct rt_rq *rt_rq); +static void dequeue_top_rt_rq(struct rt_rq *rt_rq); +  static inline int on_rt_rq(struct sched_rt_entity *rt_se)  {  	return !list_empty(&rt_se->run_list); @@ -420,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)  	rt_se = rt_rq->tg->rt_se[cpu];  	if (rt_rq->rt_nr_running) { -		if (rt_se && !on_rt_rq(rt_se)) +		if (!rt_se) +			enqueue_top_rt_rq(rt_rq); +		else if (!on_rt_rq(rt_se))  			enqueue_rt_entity(rt_se, false); +  		if (rt_rq->highest_prio.curr < curr->prio)  			resched_task(curr);  	} @@ -434,7 +487,9 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)  	rt_se = rt_rq->tg->rt_se[cpu]; -	if (rt_se && on_rt_rq(rt_se)) +	if (!rt_se) +		dequeue_top_rt_rq(rt_rq); +	else if (on_rt_rq(rt_se))  		dequeue_rt_entity(rt_se);  } @@ -505,12 +560,18 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)  static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)  { -	if (rt_rq->rt_nr_running) -		resched_task(rq_of_rt_rq(rt_rq)->curr); +	struct rq *rq = rq_of_rt_rq(rt_rq); + +	if (!rt_rq->rt_nr_running) +		return; + +	enqueue_top_rt_rq(rt_rq); +	resched_task(rq->curr);  }  static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)  { +	dequeue_top_rt_rq(rt_rq);  }  static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -536,6 +597,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)  #endif /* CONFIG_RT_GROUP_SCHED */ +bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) +{ +	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + +	return (hrtimer_active(&rt_b->rt_period_timer) || +		rt_rq->rt_time < rt_b->rt_runtime); +} +  #ifdef CONFIG_SMP  /*   * We ran 
out of runtime, see if we can borrow some from our neighbours. @@ -821,14 +890,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)  		 * but accrue some time due to boosting.  		 */  		if (likely(rt_b->rt_runtime)) { -			static bool once = false; -  			rt_rq->rt_throttled = 1; - -			if (!once) { -				once = true; -				printk_sched("sched: RT throttling activated\n"); -			} +			printk_deferred_once("sched: RT throttling activated\n");  		} else {  			/*  			 * In case we did anyway, make it go away, @@ -855,7 +918,6 @@ static void update_curr_rt(struct rq *rq)  {  	struct task_struct *curr = rq->curr;  	struct sched_rt_entity *rt_se = &curr->rt; -	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);  	u64 delta_exec;  	if (curr->sched_class != &rt_sched_class) @@ -880,7 +942,7 @@ static void update_curr_rt(struct rq *rq)  		return;  	for_each_sched_rt_entity(rt_se) { -		rt_rq = rt_rq_of_se(rt_se); +		struct rt_rq *rt_rq = rt_rq_of_se(rt_se);  		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {  			raw_spin_lock(&rt_rq->rt_runtime_lock); @@ -892,6 +954,38 @@ static void update_curr_rt(struct rq *rq)  	}  } +static void +dequeue_top_rt_rq(struct rt_rq *rt_rq) +{ +	struct rq *rq = rq_of_rt_rq(rt_rq); + +	BUG_ON(&rq->rt != rt_rq); + +	if (!rt_rq->rt_queued) +		return; + +	BUG_ON(!rq->nr_running); + +	sub_nr_running(rq, rt_rq->rt_nr_running); +	rt_rq->rt_queued = 0; +} + +static void +enqueue_top_rt_rq(struct rt_rq *rt_rq) +{ +	struct rq *rq = rq_of_rt_rq(rt_rq); + +	BUG_ON(&rq->rt != rt_rq); + +	if (rt_rq->rt_queued) +		return; +	if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) +		return; + +	add_nr_running(rq, rt_rq->rt_nr_running); +	rt_rq->rt_queued = 1; +} +  #if defined CONFIG_SMP  static void @@ -899,6 +993,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)  {  	struct rq *rq = rq_of_rt_rq(rt_rq); +#ifdef CONFIG_RT_GROUP_SCHED +	/* +	 * Change rq's cpupri only if rt_rq is the top queue. +	 */ +	if (&rq->rt != rt_rq) +		return; +#endif  	if (rq->online && prio < prev_prio)  		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);  } @@ -908,6 +1009,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)  {  	struct rq *rq = rq_of_rt_rq(rt_rq); +#ifdef CONFIG_RT_GROUP_SCHED +	/* +	 * Change rq's cpupri only if rt_rq is the top queue. 
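
enqueue_top_rt_rq()/dequeue_top_rt_rq(), introduced in this part of the RT rework, move the whole rt_rq contribution in and out of rq->nr_running in one step, guarded by rt_queued so throttling or an empty queue removes the count exactly once. A toy model of that bookkeeping (the structures below are stand-ins, not the kernel's rq/rt_rq):

#include <stdio.h>

struct toy_rq   { unsigned int nr_running; };
struct toy_rtrq { unsigned int rt_nr_running; int rt_queued; int throttled; };

static void enqueue_top(struct toy_rq *rq, struct toy_rtrq *rt)
{
	if (rt->rt_queued || rt->throttled || !rt->rt_nr_running)
		return;
	rq->nr_running += rt->rt_nr_running;
	rt->rt_queued = 1;
}

static void dequeue_top(struct toy_rq *rq, struct toy_rtrq *rt)
{
	if (!rt->rt_queued)
		return;
	rq->nr_running -= rt->rt_nr_running;
	rt->rt_queued = 0;
}

int main(void)
{
	struct toy_rq rq = { .nr_running = 3 };		/* e.g. 3 fair tasks */
	struct toy_rtrq rt = { .rt_nr_running = 2 };	/* 2 RT tasks, not yet counted */

	enqueue_top(&rq, &rt);
	printf("running with RT queued: %u\n", rq.nr_running);

	rt.throttled = 1;	/* throttling dequeues the whole rt_rq */
	dequeue_top(&rq, &rt);
	printf("running while throttled: %u\n", rq.nr_running);
	return 0;
}
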
+	 */ +	if (&rq->rt != rt_rq) +		return; +#endif  	if (rq->online && rt_rq->highest_prio.curr != prev_prio)  		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);  } @@ -1001,12 +1109,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}  #endif /* CONFIG_RT_GROUP_SCHED */  static inline +unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) +{ +	struct rt_rq *group_rq = group_rt_rq(rt_se); + +	if (group_rq) +		return group_rq->rt_nr_running; +	else +		return 1; +} + +static inline  void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)  {  	int prio = rt_se_prio(rt_se);  	WARN_ON(!rt_prio(prio)); -	rt_rq->rt_nr_running++; +	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);  	inc_rt_prio(rt_rq, prio);  	inc_rt_migration(rt_se, rt_rq); @@ -1018,7 +1137,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)  {  	WARN_ON(!rt_prio(rt_se_prio(rt_se)));  	WARN_ON(!rt_rq->rt_nr_running); -	rt_rq->rt_nr_running--; +	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);  	dec_rt_prio(rt_rq, rt_se_prio(rt_se));  	dec_rt_migration(rt_se, rt_rq); @@ -1075,6 +1194,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)  		back = rt_se;  	} +	dequeue_top_rt_rq(rt_rq_of_se(back)); +  	for (rt_se = back; rt_se; rt_se = rt_se->back) {  		if (on_rt_rq(rt_se))  			__dequeue_rt_entity(rt_se); @@ -1083,13 +1204,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)  static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)  { +	struct rq *rq = rq_of_rt_se(rt_se); +  	dequeue_rt_stack(rt_se);  	for_each_sched_rt_entity(rt_se)  		__enqueue_rt_entity(rt_se, head); +	enqueue_top_rt_rq(&rq->rt);  }  static void dequeue_rt_entity(struct sched_rt_entity *rt_se)  { +	struct rq *rq = rq_of_rt_se(rt_se); +  	dequeue_rt_stack(rt_se);  	for_each_sched_rt_entity(rt_se) { @@ -1098,6 +1224,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)  		if (rt_rq && rt_rq->rt_nr_running)  			__enqueue_rt_entity(rt_se, false);  	} +	enqueue_top_rt_rq(&rq->rt);  }  /* @@ -1115,8 +1242,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)  	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)  		enqueue_pushable_task(rq, p); - -	inc_nr_running(rq);  }  static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) @@ -1127,8 +1252,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)  	dequeue_rt_entity(rt_se);  	dequeue_pushable_task(rq, p); - -	dec_nr_running(rq);  }  /* @@ -1169,13 +1292,10 @@ static void yield_task_rt(struct rq *rq)  static int find_lowest_rq(struct task_struct *task);  static int -select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)  {  	struct task_struct *curr;  	struct rq *rq; -	int cpu; - -	cpu = task_cpu(p);  	if (p->nr_cpus_allowed == 1)  		goto out; @@ -1213,8 +1333,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)  	 */  	if (curr && unlikely(rt_task(curr)) &&  	    (curr->nr_cpus_allowed < 2 || -	     curr->prio <= p->prio) && -	    (p->nr_cpus_allowed > 1)) { +	     curr->prio <= p->prio)) {  		int target = find_lowest_rq(p);  		if (target != -1) @@ -1298,15 +1417,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)  {  	struct sched_rt_entity *rt_se;  	struct task_struct *p; -	struct rt_rq *rt_rq; - -	rt_rq = &rq->rt; - -	if (!rt_rq->rt_nr_running) -		return NULL; - -	if (rt_rq_throttled(rt_rq)) -		return NULL; +	
struct rt_rq *rt_rq  = &rq->rt;  	do {  		rt_se = pick_next_rt_entity(rq, rt_rq); @@ -1320,21 +1431,43 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)  	return p;  } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct * +pick_next_task_rt(struct rq *rq, struct task_struct *prev)  { -	struct task_struct *p = _pick_next_task_rt(rq); +	struct task_struct *p; +	struct rt_rq *rt_rq = &rq->rt; + +	if (need_pull_rt_task(rq, prev)) { +		pull_rt_task(rq); +		/* +		 * pull_rt_task() can drop (and re-acquire) rq->lock; this +		 * means a dl or stop task can slip in, in which case we need +		 * to re-start task selection. +		 */ +		if (unlikely((rq->stop && rq->stop->on_rq) || +			     rq->dl.dl_nr_running)) +			return RETRY_TASK; +	} + +	/* +	 * We may dequeue prev's rt_rq in put_prev_task(). +	 * So, we update time before rt_nr_running check. +	 */ +	if (prev->sched_class == &rt_sched_class) +		update_curr_rt(rq); + +	if (!rt_rq->rt_queued) +		return NULL; + +	put_prev_task(rq, prev); + +	p = _pick_next_task_rt(rq);  	/* The running task is never eligible for pushing */  	if (p)  		dequeue_pushable_task(rq, p); -#ifdef CONFIG_SMP -	/* -	 * We detect this state here so that we can avoid taking the RQ -	 * lock again later if there is no need to push -	 */ -	rq->post_schedule = has_pushable_tasks(rq); -#endif +	set_post_schedule(rq);  	return p;  } @@ -1630,6 +1763,12 @@ static int pull_rt_task(struct rq *this_rq)  	if (likely(!rt_overloaded(this_rq)))  		return 0; +	/* +	 * Match the barrier from rt_set_overloaded; this guarantees that if we +	 * see overloaded we must also see the rto_mask bit. +	 */ +	smp_rmb(); +  	for_each_cpu(cpu, this_rq->rd->rto_mask) {  		if (this_cpu == cpu)  			continue; @@ -1698,13 +1837,6 @@ skip:  	return ret;  } -static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) -{ -	/* Try to pull RT tasks here if we lower this rq's prio */ -	if (rq->rt.highest_prio.curr > prev->prio) -		pull_rt_task(rq); -} -  static void post_schedule_rt(struct rq *rq)  {  	push_rt_tasks(rq); @@ -1720,7 +1852,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)  	    !test_tsk_need_resched(rq->curr) &&  	    has_pushable_tasks(rq) &&  	    p->nr_cpus_allowed > 1 && -	    rt_task(rq->curr) && +	    (dl_task(rq->curr) || rt_task(rq->curr)) &&  	    (rq->curr->nr_cpus_allowed < 2 ||  	     rq->curr->prio <= p->prio))  		push_rt_tasks(rq); @@ -1807,7 +1939,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)  		resched_task(rq->curr);  } -void init_sched_rt_class(void) +void __init init_sched_rt_class(void)  {  	unsigned int i; @@ -1836,9 +1968,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)  	 */  	if (p->on_rq && rq->curr != p) {  #ifdef CONFIG_SMP -		if (rq->rt.overloaded && push_rt_task(rq) && +		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&  		    /* Don't resched if we changed runqueues */ -		    rq != task_rq(p)) +		    push_rt_task(rq) && rq != task_rq(p))  			check_resched = 0;  #endif /* CONFIG_SMP */  		if (check_resched && p->prio < rq->curr->prio) @@ -1931,8 +2063,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)  	p->rt.time_slice = sched_rr_timeslice;  	/* -	 * Requeue to the end of queue if we (and all of our ancestors) are the -	 * only element on the queue +	 * Requeue to the end of queue if we (and all of our ancestors) are not +	 * the only element on the queue  	 */  	for_each_sched_rt_entity(rt_se) {  		if 
(rt_se->run_list.prev != rt_se->run_list.next) { @@ -1981,7 +2113,6 @@ const struct sched_class rt_sched_class = {  	.set_cpus_allowed       = set_cpus_allowed_rt,  	.rq_online              = rq_online_rt,  	.rq_offline             = rq_offline_rt, -	.pre_schedule		= pre_schedule_rt,  	.post_schedule		= post_schedule_rt,  	.task_woken		= task_woken_rt,  	.switched_from		= switched_from_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3c5653e1dc..31cc02ebc54 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,12 +2,15 @@  #include <linux/sched.h>  #include <linux/sched/sysctl.h>  #include <linux/sched/rt.h> +#include <linux/sched/deadline.h>  #include <linux/mutex.h>  #include <linux/spinlock.h>  #include <linux/stop_machine.h>  #include <linux/tick.h> +#include <linux/slab.h>  #include "cpupri.h" +#include "cpudeadline.h"  #include "cpuacct.h"  struct rq; @@ -21,24 +24,6 @@ extern long calc_load_fold_active(struct rq *this_rq);  extern void update_cpu_load_active(struct rq *this_rq);  /* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], - * and back. - */ -#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio) - -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. - */ -#define USER_PRIO(p)		((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO)) - -/*   * Helpers for converting nanosecond timing to jiffy resolution   */  #define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) @@ -72,6 +57,13 @@ extern void update_cpu_load_active(struct rq *this_rq);  #define NICE_0_SHIFT		SCHED_LOAD_SHIFT  /* + * Single value that decides SCHED_DEADLINE internal math precision. + * 10 -> just above 1us + * 9  -> just above 0.5us + */ +#define DL_SCALE (10) + +/*   * These are the 'tuning knobs' of the scheduler:   */ @@ -80,11 +72,19 @@ extern void update_cpu_load_active(struct rq *this_rq);   */  #define RUNTIME_INF	((u64)~0ULL) +static inline int fair_policy(int policy) +{ +	return policy == SCHED_NORMAL || policy == SCHED_BATCH; +} +  static inline int rt_policy(int policy)  { -	if (policy == SCHED_FIFO || policy == SCHED_RR) -		return 1; -	return 0; +	return policy == SCHED_FIFO || policy == SCHED_RR; +} + +static inline int dl_policy(int policy) +{ +	return policy == SCHED_DEADLINE;  }  static inline int task_has_rt_policy(struct task_struct *p) @@ -92,6 +92,25 @@ static inline int task_has_rt_policy(struct task_struct *p)  	return rt_policy(p->policy);  } +static inline int task_has_dl_policy(struct task_struct *p) +{ +	return dl_policy(p->policy); +} + +static inline bool dl_time_before(u64 a, u64 b) +{ +	return (s64)(a - b) < 0; +} + +/* + * Tells if entity @a should preempt entity @b. 
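
dl_time_before(), added above for the deadline class, compares two u64 nanosecond timestamps by the sign of their difference, which stays correct across a wraparound of the clock value as long as the two times are within 2^63 ns of each other. A standalone check, with stdint types standing in for the kernel's u64/s64:

#include <stdio.h>
#include <stdint.h>

static int dl_time_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 100;	/* clock about to wrap */
	uint64_t wrapped   = 50;		/* clock just after wrapping */

	/* a plain "a < b" would get this wrong; the signed difference does not */
	printf("%d %d\n",
	       dl_time_before(near_wrap, wrapped),	/* 1: earlier despite larger value */
	       dl_time_before(wrapped, near_wrap));	/* 0 */
	return 0;
}
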
+ */ +static inline bool +dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) +{ +	return dl_time_before(a->deadline, b->deadline); +} +  /*   * This is the priority-queue data structure of the RT scheduling class:   */ @@ -107,6 +126,47 @@ struct rt_bandwidth {  	u64			rt_runtime;  	struct hrtimer		rt_period_timer;  }; +/* + * To keep the bandwidth of -deadline tasks and groups under control + * we need some place where: + *  - store the maximum -deadline bandwidth of the system (the group); + *  - cache the fraction of that bandwidth that is currently allocated. + * + * This is all done in the data structure below. It is similar to the + * one used for RT-throttling (rt_bandwidth), with the main difference + * that, since here we are only interested in admission control, we + * do not decrease any runtime while the group "executes", neither we + * need a timer to replenish it. + * + * With respect to SMP, the bandwidth is given on a per-CPU basis, + * meaning that: + *  - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; + *  - dl_total_bw array contains, in the i-eth element, the currently + *    allocated bandwidth on the i-eth CPU. + * Moreover, groups consume bandwidth on each CPU, while tasks only + * consume bandwidth on the CPU they're running on. + * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw + * that will be shown the next time the proc or cgroup controls will + * be red. It on its turn can be changed by writing on its own + * control. + */ +struct dl_bandwidth { +	raw_spinlock_t dl_runtime_lock; +	u64 dl_runtime; +	u64 dl_period; +}; + +static inline int dl_bandwidth_enabled(void) +{ +	return sysctl_sched_rt_runtime >= 0; +} + +extern struct dl_bw *dl_bw_of(int i); + +struct dl_bw { +	raw_spinlock_t lock; +	u64 bw, total_bw; +};  extern struct mutex sched_domains_mutex; @@ -218,7 +278,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);  extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);  extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); -extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);  extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);  extern void free_rt_sched_group(struct task_group *tg); @@ -349,6 +409,8 @@ struct rt_rq {  	int overloaded;  	struct plist_head pushable_tasks;  #endif +	int rt_queued; +  	int rt_throttled;  	u64 rt_time;  	u64 rt_runtime; @@ -363,6 +425,41 @@ struct rt_rq {  #endif  }; +/* Deadline class' related fields in a runqueue */ +struct dl_rq { +	/* runqueue is an rbtree, ordered by deadline */ +	struct rb_root rb_root; +	struct rb_node *rb_leftmost; + +	unsigned long dl_nr_running; + +#ifdef CONFIG_SMP +	/* +	 * Deadline values of the currently executing and the +	 * earliest ready task on this rq. Caching these facilitates +	 * the decision wether or not a ready but not running task +	 * should migrate somewhere else. +	 */ +	struct { +		u64 curr; +		u64 next; +	} earliest_dl; + +	unsigned long dl_nr_migratory; +	int overloaded; + +	/* +	 * Tasks on this rq that can be pushed away. They are kept in +	 * an rb-tree, ordered by tasks' deadlines, with caching +	 * of the leftmost (earliest deadline) element. 
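
The dl_bw comment above describes pure bookkeeping: dl_bw::bw caches the admissible per-CPU fraction and dl_bw::total_bw the fraction already handed out, with to_ratio() (declared further down in sched.h) turning a runtime/period pair into that fixed-point fraction. The actual admission test is not part of this hunk, so the sketch below only models what the comment says, with an assumed fixed-point shift and example numbers:

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT 20	/* assumed fixed-point shift, for the sketch only */

/* bandwidth as a fixed-point fraction: runtime / period */
static uint64_t to_ratio_sketch(uint64_t period, uint64_t runtime)
{
	return (runtime << BW_SHIFT) / period;
}

struct toy_dl_bw {
	uint64_t bw;		/* admissible fraction per CPU */
	uint64_t total_bw;	/* fraction currently handed out */
};

/* accept a new reservation only if the cached allocation stays under bw */
static int admit(struct toy_dl_bw *dl_b, uint64_t period, uint64_t runtime)
{
	uint64_t new_bw = to_ratio_sketch(period, runtime);

	if (dl_b->total_bw + new_bw > dl_b->bw)
		return -1;
	dl_b->total_bw += new_bw;
	return 0;
}

int main(void)
{
	/* e.g. cap deadline reservations at 95% of a CPU */
	struct toy_dl_bw dl_b = { .bw = to_ratio_sketch(100, 95), .total_bw = 0 };

	printf("60%% task: %d\n", admit(&dl_b, 100000, 60000));	/* fits      ->  0 */
	printf("40%% task: %d\n", admit(&dl_b, 100000, 40000));	/* over 95%% -> -1 */
	return 0;
}
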
+	 */ +	struct rb_root pushable_dl_tasks_root; +	struct rb_node *pushable_dl_tasks_leftmost; +#else +	struct dl_bw dl_bw; +#endif +}; +  #ifdef CONFIG_SMP  /* @@ -381,6 +478,15 @@ struct root_domain {  	cpumask_var_t online;  	/* +	 * The bit corresponding to a CPU gets set here if such CPU has more +	 * than one runnable -deadline task (as it is below for RT tasks). +	 */ +	cpumask_var_t dlo_mask; +	atomic_t dlo_count; +	struct dl_bw dl_bw; +	struct cpudl cpudl; + +	/*  	 * The "RT overload" flag: it gets set if a CPU has more than  	 * one runnable RT task.  	 */ @@ -408,6 +514,10 @@ struct rq {  	 * remote CPUs use both these fields when doing load calculation.  	 */  	unsigned int nr_running; +#ifdef CONFIG_NUMA_BALANCING +	unsigned int nr_numa_running; +	unsigned int nr_preferred_running; +#endif  	#define CPU_LOAD_IDX_MAX 5  	unsigned long cpu_load[CPU_LOAD_IDX_MAX];  	unsigned long last_load_update_tick; @@ -427,15 +537,14 @@ struct rq {  	struct cfs_rq cfs;  	struct rt_rq rt; +	struct dl_rq dl;  #ifdef CONFIG_FAIR_GROUP_SCHED  	/* list of leaf cfs_rq on this cpu: */  	struct list_head leaf_cfs_rq_list; -#endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_RT_GROUP_SCHED -	struct list_head leaf_rt_rq_list; -#endif +	struct sched_avg avg; +#endif /* CONFIG_FAIR_GROUP_SCHED */  	/*  	 * This is part of a global counter where only the total sum @@ -458,7 +567,7 @@ struct rq {  	struct root_domain *rd;  	struct sched_domain *sd; -	unsigned long cpu_power; +	unsigned long cpu_capacity;  	unsigned char idle_balance;  	/* For active balancing */ @@ -476,6 +585,9 @@ struct rq {  	u64 age_stamp;  	u64 idle_stamp;  	u64 avg_idle; + +	/* This is used to determine avg_idle's max value */ +	u64 max_idle_balance_cost;  #endif  #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -521,8 +633,6 @@ struct rq {  #ifdef CONFIG_SMP  	struct llist_head wake_list;  #endif - -	struct sched_avg avg;  };  static inline int cpu_of(struct rq *rq) @@ -552,8 +662,16 @@ static inline u64 rq_clock_task(struct rq *rq)  	return rq->clock_task;  } +#ifdef CONFIG_NUMA_BALANCING +extern void sched_setnuma(struct task_struct *p, int node); +extern int migrate_task_to(struct task_struct *p, int cpu); +extern int migrate_swap(struct task_struct *, struct task_struct *); +#endif /* CONFIG_NUMA_BALANCING */ +  #ifdef CONFIG_SMP +extern void sched_ttwu_pending(void); +  #define rcu_dereference_check_sched_domain(p) \  	rcu_dereference_check((p), \  			      lockdep_is_held(&sched_domains_mutex)) @@ -593,18 +711,34 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)  	return hsd;  } +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ +	struct sched_domain *sd; + +	for_each_domain(cpu, sd) { +		if (sd->flags & flag) +			break; +	} + +	return sd; +} +  DECLARE_PER_CPU(struct sched_domain *, sd_llc);  DECLARE_PER_CPU(int, sd_llc_size);  DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(struct sched_domain *, sd_numa); +DECLARE_PER_CPU(struct sched_domain *, sd_busy); +DECLARE_PER_CPU(struct sched_domain *, sd_asym); -struct sched_group_power { +struct sched_group_capacity {  	atomic_t ref;  	/* -	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a -	 * single CPU. +	 * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity +	 * for a single CPU.  	 */ -	unsigned int power, power_orig; +	unsigned int capacity, capacity_orig;  	unsigned long next_update; +	int imbalance; /* XXX unrelated to capacity but shared group state */  	/*  	 * Number of busy cpus in this group.  	 
*/ @@ -618,7 +752,7 @@ struct sched_group {  	atomic_t ref;  	unsigned int group_weight; -	struct sched_group_power *sgp; +	struct sched_group_capacity *sgc;  	/*  	 * The CPUs this group covers. @@ -641,7 +775,7 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)   */  static inline struct cpumask *sched_group_mask(struct sched_group *sg)  { -	return to_cpumask(sg->sgp->cpumask); +	return to_cpumask(sg->sgc->cpumask);  }  /** @@ -655,6 +789,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group)  extern int group_balance_cpu(struct sched_group *sg); +#else + +static inline void sched_ttwu_pending(void) { } +  #endif /* CONFIG_SMP */  #include "stats.h" @@ -719,6 +857,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)  	 */  	smp_wmb();  	task_thread_info(p)->cpu = cpu; +	p->wake_cpu = cpu;  #endif  } @@ -796,8 +935,6 @@ static inline u64 global_rt_runtime(void)  	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;  } - -  static inline int task_current(struct rq *rq, struct task_struct *p)  {  	return rq->curr == p; @@ -957,9 +1094,12 @@ static const u32 prio_to_wmult[40] = {  #else  #define ENQUEUE_WAKING		0  #endif +#define ENQUEUE_REPLENISH	8  #define DEQUEUE_SLEEP		1 +#define RETRY_TASK		((void *)-1UL) +  struct sched_class {  	const struct sched_class *next; @@ -970,14 +1110,22 @@ struct sched_class {  	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); -	struct task_struct * (*pick_next_task) (struct rq *rq); +	/* +	 * It is the responsibility of the pick_next_task() method that will +	 * return the next task to call put_prev_task() on the @prev task or +	 * something equivalent. +	 * +	 * May return RETRY_TASK when it finds a higher prio class has runnable +	 * tasks. 
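
The pick_next_task() contract spelled out above (the hook itself calls put_prev_task() on @prev and may return RETRY_TASK when a higher-priority class became runnable) is what pick_next_task_rt() earlier in this diff relies on after pull_rt_task() drops the rq lock. The sketch below is a schematic of the restart loop a caller has to implement, based only on that comment rather than on core.c; the classes and the runqueue are toys, though the sentinel value mirrors the RETRY_TASK define added to sched.h:

#include <stdio.h>
#include <stddef.h>

struct task;
struct rq { int dl_runnable; int fair_runnable; };

#define RETRY_TASK ((struct task *)-1UL)

static struct task *pick_dl(struct rq *rq)
{
	return rq->dl_runnable ? (struct task *)0x1 : NULL;
}

static struct task *pick_fair(struct rq *rq)
{
	static int pulled;

	if (!pulled) {
		/* dropping rq->lock to pull work let a deadline task slip in */
		pulled = 1;
		rq->dl_runnable = 1;
		return RETRY_TASK;
	}
	return rq->fair_runnable ? (struct task *)0x2 : NULL;
}

static struct task *pick_next(struct rq *rq)
{
	struct task *(*classes[])(struct rq *) = { pick_dl, pick_fair };
	size_t i;

again:
	for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
		struct task *p = classes[i](rq);

		if (p == RETRY_TASK)
			goto again;	/* restart from the highest-priority class */
		if (p)
			return p;
	}
	return NULL;	/* nothing runnable: idle */
}

int main(void)
{
	struct rq rq = { .dl_runnable = 0, .fair_runnable = 1 };

	/* the fair pick asks for a retry, so the deadline task wins in the end */
	printf("picked %p\n", (void *)pick_next(&rq));
	return 0;
}
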
+	 */ +	struct task_struct * (*pick_next_task) (struct rq *rq, +						struct task_struct *prev);  	void (*put_prev_task) (struct rq *rq, struct task_struct *p);  #ifdef CONFIG_SMP -	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); +	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);  	void (*migrate_task_rq)(struct task_struct *p, int next_cpu); -	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);  	void (*post_schedule) (struct rq *this_rq);  	void (*task_waking) (struct task_struct *task);  	void (*task_woken) (struct rq *this_rq, struct task_struct *task); @@ -992,6 +1140,7 @@ struct sched_class {  	void (*set_curr_task) (struct rq *rq);  	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);  	void (*task_fork) (struct task_struct *p); +	void (*task_dead) (struct task_struct *p);  	void (*switched_from) (struct rq *this_rq, struct task_struct *task);  	void (*switched_to) (struct rq *this_rq, struct task_struct *task); @@ -1006,11 +1155,17 @@ struct sched_class {  #endif  }; +static inline void put_prev_task(struct rq *rq, struct task_struct *prev) +{ +	prev->sched_class->put_prev_task(rq, prev); +} +  #define sched_class_highest (&stop_sched_class)  #define for_each_class(class) \     for (class = sched_class_highest; class; class = class->next)  extern const struct sched_class stop_sched_class; +extern const struct sched_class dl_sched_class;  extern const struct sched_class rt_sched_class;  extern const struct sched_class fair_sched_class;  extern const struct sched_class idle_sched_class; @@ -1018,27 +1173,28 @@ extern const struct sched_class idle_sched_class;  #ifdef CONFIG_SMP -extern void update_group_power(struct sched_domain *sd, int cpu); +extern void update_group_capacity(struct sched_domain *sd, int cpu); -extern void trigger_load_balance(struct rq *rq, int cpu); -extern void idle_balance(int this_cpu, struct rq *this_rq); +extern void trigger_load_balance(struct rq *rq);  extern void idle_enter_fair(struct rq *this_rq);  extern void idle_exit_fair(struct rq *this_rq); -#else	/* CONFIG_SMP */ +#else -static inline void idle_balance(int cpu, struct rq *rq) -{ -} +static inline void idle_enter_fair(struct rq *rq) { } +static inline void idle_exit_fair(struct rq *rq) { }  #endif  extern void sysrq_sched_debug_show(void);  extern void sched_init_granularity(void);  extern void update_max_interval(void); + +extern void init_sched_dl_class(void);  extern void init_sched_rt_class(void);  extern void init_sched_fair_class(void); +extern void init_sched_dl_class(void);  extern void resched_task(struct task_struct *p);  extern void resched_cpu(int cpu); @@ -1046,26 +1202,24 @@ extern void resched_cpu(int cpu);  extern struct rt_bandwidth def_rt_bandwidth;  extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); +extern struct dl_bandwidth def_dl_bandwidth; +extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); +extern void init_dl_task_timer(struct sched_dl_entity *dl_se); + +unsigned long to_ratio(u64 period, u64 runtime); +  extern void update_idle_cpu_load(struct rq *this_rq);  extern void init_task_runnable_average(struct task_struct *p); -#ifdef CONFIG_PARAVIRT -static inline u64 steal_ticks(u64 steal) +static inline void add_nr_running(struct rq *rq, unsigned count)  { -	if (unlikely(steal > NSEC_PER_SEC)) -		return div_u64(steal, TICK_NSEC); +	unsigned prev_nr = rq->nr_running; -	return __iter_div_u64_rem(steal, TICK_NSEC, &steal); 
-} -#endif - -static inline void inc_nr_running(struct rq *rq) -{ -	rq->nr_running++; +	rq->nr_running = prev_nr + count;  #ifdef CONFIG_NO_HZ_FULL -	if (rq->nr_running == 2) { +	if (prev_nr < 2 && rq->nr_running >= 2) {  		if (tick_nohz_full_cpu(rq->cpu)) {  			/* Order rq->nr_running write against the IPI */  			smp_wmb(); @@ -1075,9 +1229,9 @@ static inline void inc_nr_running(struct rq *rq)  #endif  } -static inline void dec_nr_running(struct rq *rq) +static inline void sub_nr_running(struct rq *rq, unsigned count)  { -	rq->nr_running--; +	rq->nr_running -= count;  }  static inline void rq_last_tick_reset(struct rq *rq) @@ -1220,6 +1374,33 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)  	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);  } +static inline void double_lock(spinlock_t *l1, spinlock_t *l2) +{ +	if (l1 > l2) +		swap(l1, l2); + +	spin_lock(l1); +	spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2) +{ +	if (l1 > l2) +		swap(l1, l2); + +	spin_lock_irq(l1); +	spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) +{ +	if (l1 > l2) +		swap(l1, l2); + +	raw_spin_lock(l1); +	raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} +  /*   * double_rq_lock - safely lock two runqueues   * @@ -1304,8 +1485,10 @@ extern void print_rt_stats(struct seq_file *m, int cpu);  extern void init_cfs_rq(struct cfs_rq *cfs_rq);  extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); +extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); -extern void account_cfs_bandwidth_used(int enabled, int was_enabled); +extern void cfs_bandwidth_usage_inc(void); +extern void cfs_bandwidth_usage_dec(void);  #ifdef CONFIG_NO_HZ_COMMON  enum rq_nohz_flag_bits { diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index da98af347e8..a476bea17fb 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -142,4 +142,4 @@ static int __init proc_schedstat_init(void)  	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);  	return 0;  } -module_init(proc_schedstat_init); +subsys_initcall(proc_schedstat_init); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index c7edee71bce..4ab70433965 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)   * from dequeue_task() to account for possible rq->clock skew across cpus. The   * delta taken on each cpu would annul the skew.   */ -static inline void sched_info_dequeued(struct task_struct *t) +static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)  { -	unsigned long long now = rq_clock(task_rq(t)), delta = 0; +	unsigned long long now = rq_clock(rq), delta = 0;  	if (unlikely(sched_info_on()))  		if (t->sched_info.last_queued) @@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t)  	sched_info_reset_dequeued(t);  	t->sched_info.run_delay += delta; -	rq_sched_info_dequeued(task_rq(t), delta); +	rq_sched_info_dequeued(rq, delta);  }  /* @@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t)   * long it was waiting to run.  We also note when it began so that we   * can keep stats on how long its timeslice is.   
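Two points worth illustrating from the hunk above. First, add_nr_running() takes a count, so the NO_HZ_FULL kick now fires on any transition from fewer than two to two-or-more runnable tasks, not only when nr_running lands exactly on 2. Second, the new double_lock*() helpers encode the usual ABBA-avoidance rule of locking in ascending address order. A small sketch of the latter (struct item and move_weight() are invented; only double_lock() comes from the patch):

	#include <linux/spinlock.h>

	struct item {
		spinlock_t lock;
		int weight;
	};

	static void move_weight(struct item *from, struct item *to)
	{
		/*
		 * double_lock() sorts the two pointers before locking, so two
		 * CPUs calling move_weight(a, b) and move_weight(b, a)
		 * concurrently cannot deadlock on each other.
		 */
		double_lock(&from->lock, &to->lock);
		to->weight += from->weight;
		from->weight = 0;
		spin_unlock(&from->lock);
		spin_unlock(&to->lock);
	}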
*/ -static void sched_info_arrive(struct task_struct *t) +static void sched_info_arrive(struct rq *rq, struct task_struct *t)  { -	unsigned long long now = rq_clock(task_rq(t)), delta = 0; +	unsigned long long now = rq_clock(rq), delta = 0;  	if (t->sched_info.last_queued)  		delta = now - t->sched_info.last_queued; @@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t)  	t->sched_info.last_arrival = now;  	t->sched_info.pcount++; -	rq_sched_info_arrive(task_rq(t), delta); +	rq_sched_info_arrive(rq, delta);  }  /* @@ -96,11 +96,11 @@ static void sched_info_arrive(struct task_struct *t)   * the timestamp if it is already not set.  It's assumed that   * sched_info_dequeued() will clear that stamp when appropriate.   */ -static inline void sched_info_queued(struct task_struct *t) +static inline void sched_info_queued(struct rq *rq, struct task_struct *t)  {  	if (unlikely(sched_info_on()))  		if (!t->sched_info.last_queued) -			t->sched_info.last_queued = rq_clock(task_rq(t)); +			t->sched_info.last_queued = rq_clock(rq);  }  /* @@ -111,15 +111,15 @@ static inline void sched_info_queued(struct task_struct *t)   * sched_info_queued() to mark that it has now again started waiting on   * the runqueue.   */ -static inline void sched_info_depart(struct task_struct *t) +static inline void sched_info_depart(struct rq *rq, struct task_struct *t)  { -	unsigned long long delta = rq_clock(task_rq(t)) - +	unsigned long long delta = rq_clock(rq) -  					t->sched_info.last_arrival; -	rq_sched_info_depart(task_rq(t), delta); +	rq_sched_info_depart(rq, delta);  	if (t->state == TASK_RUNNING) -		sched_info_queued(t); +		sched_info_queued(rq, t);  }  /* @@ -128,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t)   * the idle task.)  We are only called when prev != next.   */  static inline void -__sched_info_switch(struct task_struct *prev, struct task_struct *next) +__sched_info_switch(struct rq *rq, +		    struct task_struct *prev, struct task_struct *next)  { -	struct rq *rq = task_rq(prev); -  	/*  	 * prev now departs the cpu.  It's not interesting to record  	 * stats about how efficient we were at scheduling the idle  	 * process, however.  	 
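The schedstats helpers above now take the runqueue explicitly instead of re-deriving it with task_rq(), so a caller that already holds rq->lock simply passes the rq it locked. Illustrative call-site shape only (the wrapper below is not from the patch):

	static void switch_stats(struct rq *rq, struct task_struct *prev,
				 struct task_struct *next)
	{
		/* caller already holds rq->lock; no task_rq(prev) lookup needed */
		sched_info_switch(rq, prev, next);
	}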
*/  	if (prev != rq->idle) -		sched_info_depart(prev); +		sched_info_depart(rq, prev);  	if (next != rq->idle) -		sched_info_arrive(next); +		sched_info_arrive(rq, next);  }  static inline void -sched_info_switch(struct task_struct *prev, struct task_struct *next) +sched_info_switch(struct rq *rq, +		  struct task_struct *prev, struct task_struct *next)  {  	if (unlikely(sched_info_on())) -		__sched_info_switch(prev, next); +		__sched_info_switch(rq, prev, next);  }  #else -#define sched_info_queued(t)			do { } while (0) +#define sched_info_queued(rq, t)		do { } while (0)  #define sched_info_reset_dequeued(t)	do { } while (0) -#define sched_info_dequeued(t)			do { } while (0) -#define sched_info_switch(t, next)		do { } while (0) +#define sched_info_dequeued(rq, t)		do { } while (0) +#define sched_info_depart(rq, t)		do { } while (0) +#define sched_info_arrive(rq, next)		do { } while (0) +#define sched_info_switch(rq, t, next)		do { } while (0)  #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */  /* diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index e08fbeeb54b..bfe0edadbfb 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -11,7 +11,7 @@  #ifdef CONFIG_SMP  static int -select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)  {  	return task_cpu(p); /* stop tasks as never migrate */  } @@ -23,28 +23,31 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)  	/* we're never preempted */  } -static struct task_struct *pick_next_task_stop(struct rq *rq) +static struct task_struct * +pick_next_task_stop(struct rq *rq, struct task_struct *prev)  {  	struct task_struct *stop = rq->stop; -	if (stop && stop->on_rq) { -		stop->se.exec_start = rq_clock_task(rq); -		return stop; -	} +	if (!stop || !stop->on_rq) +		return NULL; -	return NULL; +	put_prev_task(rq, prev); + +	stop->se.exec_start = rq_clock_task(rq); + +	return stop;  }  static void  enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)  { -	inc_nr_running(rq); +	add_nr_running(rq, 1);  }  static void  dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)  { -	dec_nr_running(rq); +	sub_nr_running(rq, 1);  }  static void yield_task_stop(struct rq *rq) @@ -103,7 +106,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)   * Simple, special scheduling class for the per-CPU stop tasks:   */  const struct sched_class stop_sched_class = { -	.next			= &rt_sched_class, +	.next			= &dl_sched_class,  	.enqueue_task		= enqueue_task_stop,  	.dequeue_task		= dequeue_task_stop, diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c new file mode 100644 index 00000000000..0ffa20ae657 --- /dev/null +++ b/kernel/sched/wait.c @@ -0,0 +1,504 @@ +/* + * Generic waiting primitives. 
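With stop_sched_class now chaining to dl_sched_class, the class walk becomes stop -> dl -> rt -> fair -> idle. A simplified stand-in for how the core picking loop consumes these hooks (not the real core.c function), using for_each_class() and the new pick_next_task() signature:

	static struct task_struct *
	__pick_next_task(struct rq *rq, struct task_struct *prev)
	{
		const struct sched_class *class;
		struct task_struct *p;

		for_each_class(class) {		/* stop -> dl -> rt -> fair -> idle */
			p = class->pick_next_task(rq, prev);
			if (p)
				return p;
		}

		BUG();	/* the idle class always has a runnable task */
	}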
+ * + * (C) 2004 Nadia Yvette Chambers, Oracle + */ +#include <linux/init.h> +#include <linux/export.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/wait.h> +#include <linux/hash.h> + +void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) +{ +	spin_lock_init(&q->lock); +	lockdep_set_class_and_name(&q->lock, key, name); +	INIT_LIST_HEAD(&q->task_list); +} + +EXPORT_SYMBOL(__init_waitqueue_head); + +void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +{ +	unsigned long flags; + +	wait->flags &= ~WQ_FLAG_EXCLUSIVE; +	spin_lock_irqsave(&q->lock, flags); +	__add_wait_queue(q, wait); +	spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue); + +void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +{ +	unsigned long flags; + +	wait->flags |= WQ_FLAG_EXCLUSIVE; +	spin_lock_irqsave(&q->lock, flags); +	__add_wait_queue_tail(q, wait); +	spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue_exclusive); + +void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +{ +	unsigned long flags; + +	spin_lock_irqsave(&q->lock, flags); +	__remove_wait_queue(q, wait); +	spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(remove_wait_queue); + + +/* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. + */ +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +			int nr_exclusive, int wake_flags, void *key) +{ +	wait_queue_t *curr, *next; + +	list_for_each_entry_safe(curr, next, &q->task_list, task_list) { +		unsigned flags = curr->flags; + +		if (curr->func(curr, mode, wake_flags, key) && +				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) +			break; +	} +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void __wake_up(wait_queue_head_t *q, unsigned int mode, +			int nr_exclusive, void *key) +{ +	unsigned long flags; + +	spin_lock_irqsave(&q->lock, flags); +	__wake_up_common(q, mode, nr_exclusive, 0, key); +	spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(__wake_up); + +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. + */ +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) +{ +	__wake_up_common(q, mode, nr, 0, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_locked); + +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +{ +	__wake_up_common(q, mode, 1, 0, key); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_key); + +/** + * __wake_up_sync_key - wake up threads blocked on a waitqueue. 
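__wake_up_common() stops after waking @nr_exclusive entries that carry WQ_FLAG_EXCLUSIVE, which is what the wake_up()-style interfaces build on. A tiny usage sketch with an invented wait-queue head:

	#include <linux/wait.h>

	static DECLARE_WAIT_QUEUE_HEAD(slot_wq);

	static void slot_released(void)
	{
		/*
		 * Equivalent to wake_up(&slot_wq): wake every non-exclusive
		 * waiter plus at most one exclusive waiter queued at the tail.
		 */
		__wake_up(&slot_wq, TASK_NORMAL, 1, NULL);
	}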
+ * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: opaque value to be passed to wakeup targets + * + * The sync wakeup differs that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, +			int nr_exclusive, void *key) +{ +	unsigned long flags; +	int wake_flags = 1; /* XXX WF_SYNC */ + +	if (unlikely(!q)) +		return; + +	if (unlikely(nr_exclusive != 1)) +		wake_flags = 0; + +	spin_lock_irqsave(&q->lock, flags); +	__wake_up_common(q, mode, nr_exclusive, wake_flags, key); +	spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(__wake_up_sync_key); + +/* + * __wake_up_sync - see __wake_up_sync_key() + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ +	__wake_up_sync_key(q, mode, nr_exclusive, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */ + +/* + * Note: we use "set_current_state()" _after_ the wait-queue add, + * because we need a memory barrier there on SMP, so that any + * wake-function that tests for the wait-queue being active + * will be guaranteed to see waitqueue addition _or_ subsequent + * tests in this thread will see the wakeup having taken place. + * + * The spin_unlock() itself is semi-permeable and only protects + * one way (it only protects stuff inside the critical region and + * stops them from bleeding out - it would still allow subsequent + * loads to move into the critical region). + */ +void +prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ +	unsigned long flags; + +	wait->flags &= ~WQ_FLAG_EXCLUSIVE; +	spin_lock_irqsave(&q->lock, flags); +	if (list_empty(&wait->task_list)) +		__add_wait_queue(q, wait); +	set_current_state(state); +	spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait); + +void +prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ +	unsigned long flags; + +	wait->flags |= WQ_FLAG_EXCLUSIVE; +	spin_lock_irqsave(&q->lock, flags); +	if (list_empty(&wait->task_list)) +		__add_wait_queue_tail(q, wait); +	set_current_state(state); +	spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait_exclusive); + +long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ +	unsigned long flags; + +	if (signal_pending_state(state, current)) +		return -ERESTARTSYS; + +	wait->private = current; +	wait->func = autoremove_wake_function; + +	spin_lock_irqsave(&q->lock, flags); +	if (list_empty(&wait->task_list)) { +		if (wait->flags & WQ_FLAG_EXCLUSIVE) +			__add_wait_queue_tail(q, wait); +		else +			__add_wait_queue(q, wait); +	} +	set_current_state(state); +	spin_unlock_irqrestore(&q->lock, flags); + +	return 0; +} +EXPORT_SYMBOL(prepare_to_wait_event); + +/** + * finish_wait - clean up after waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. 
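prepare_to_wait()/finish_wait() exist to support the classic open-coded wait loop, where the task state is set only after queueing so a concurrent wake-up cannot be missed. A generic example of that loop (the wait-queue head and condition flag are invented):

	#include <linux/wait.h>
	#include <linux/sched.h>

	static DECLARE_WAIT_QUEUE_HEAD(data_wq);
	static bool data_ready;

	static int wait_for_data(void)
	{
		DEFINE_WAIT(wait);
		int err = 0;

		for (;;) {
			/* queue ourselves and set the task state under data_wq.lock */
			prepare_to_wait(&data_wq, &wait, TASK_INTERRUPTIBLE);
			if (data_ready)
				break;
			if (signal_pending(current)) {
				err = -ERESTARTSYS;
				break;
			}
			schedule();
		}
		finish_wait(&data_wq, &wait);	/* TASK_RUNNING again, dequeued if needed */

		return err;
	}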
+ */ +void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +{ +	unsigned long flags; + +	__set_current_state(TASK_RUNNING); +	/* +	 * We can check for list emptiness outside the lock +	 * IFF: +	 *  - we use the "careful" check that verifies both +	 *    the next and prev pointers, so that there cannot +	 *    be any half-pending updates in progress on other +	 *    CPU's that we haven't seen yet (and that might +	 *    still change the stack area. +	 * and +	 *  - all other users take the lock (ie we can only +	 *    have _one_ other CPU that looks at or modifies +	 *    the list). +	 */ +	if (!list_empty_careful(&wait->task_list)) { +		spin_lock_irqsave(&q->lock, flags); +		list_del_init(&wait->task_list); +		spin_unlock_irqrestore(&q->lock, flags); +	} +} +EXPORT_SYMBOL(finish_wait); + +/** + * abort_exclusive_wait - abort exclusive waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * @mode: runstate of the waiter to be woken + * @key: key to identify a wait bit queue or %NULL + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + * + * Wakes up the next waiter if the caller is concurrently + * woken up through the queue. + * + * This prevents waiter starvation where an exclusive waiter + * aborts and is woken up concurrently and no one wakes up + * the next waiter. + */ +void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, +			unsigned int mode, void *key) +{ +	unsigned long flags; + +	__set_current_state(TASK_RUNNING); +	spin_lock_irqsave(&q->lock, flags); +	if (!list_empty(&wait->task_list)) +		list_del_init(&wait->task_list); +	else if (waitqueue_active(q)) +		__wake_up_locked_key(q, mode, key); +	spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(abort_exclusive_wait); + +int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ +	int ret = default_wake_function(wait, mode, sync, key); + +	if (ret) +		list_del_init(&wait->task_list); +	return ret; +} +EXPORT_SYMBOL(autoremove_wake_function); + +int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +{ +	struct wait_bit_key *key = arg; +	struct wait_bit_queue *wait_bit +		= container_of(wait, struct wait_bit_queue, wait); + +	if (wait_bit->key.flags != key->flags || +			wait_bit->key.bit_nr != key->bit_nr || +			test_bit(key->bit_nr, key->flags)) +		return 0; +	else +		return autoremove_wake_function(wait, mode, sync, key); +} +EXPORT_SYMBOL(wake_bit_function); + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) + * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are + * permitted return codes. Nonzero return codes halt waiting and return. 
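autoremove_wake_function() is what DEFINE_WAIT() installs by default: on a successful wake-up it also deletes the entry from the queue, which is why finish_wait() can usually take the lock-free list_empty_careful() fast path. A custom wake callback can wrap it; the example below is illustrative only and relies on the DEFINE_WAIT_FUNC() helper from linux/wait.h:

	#include <linux/wait.h>
	#include <linux/atomic.h>

	static atomic_t wakeups = ATOMIC_INIT(0);

	static int counting_wake_function(wait_queue_t *wait, unsigned mode,
					  int sync, void *key)
	{
		int ret = autoremove_wake_function(wait, mode, sync, key);

		if (ret)			/* woken and removed from the queue */
			atomic_inc(&wakeups);
		return ret;
	}

	/* in the waiter: DEFINE_WAIT_FUNC(wait, counting_wake_function); */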
+ */ +int __sched +__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, +			int (*action)(void *), unsigned mode) +{ +	int ret = 0; + +	do { +		prepare_to_wait(wq, &q->wait, mode); +		if (test_bit(q->key.bit_nr, q->key.flags)) +			ret = (*action)(q->key.flags); +	} while (test_bit(q->key.bit_nr, q->key.flags) && !ret); +	finish_wait(wq, &q->wait); +	return ret; +} +EXPORT_SYMBOL(__wait_on_bit); + +int __sched out_of_line_wait_on_bit(void *word, int bit, +					int (*action)(void *), unsigned mode) +{ +	wait_queue_head_t *wq = bit_waitqueue(word, bit); +	DEFINE_WAIT_BIT(wait, word, bit); + +	return __wait_on_bit(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit); + +int __sched +__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, +			int (*action)(void *), unsigned mode) +{ +	do { +		int ret; + +		prepare_to_wait_exclusive(wq, &q->wait, mode); +		if (!test_bit(q->key.bit_nr, q->key.flags)) +			continue; +		ret = action(q->key.flags); +		if (!ret) +			continue; +		abort_exclusive_wait(wq, &q->wait, mode, &q->key); +		return ret; +	} while (test_and_set_bit(q->key.bit_nr, q->key.flags)); +	finish_wait(wq, &q->wait); +	return 0; +} +EXPORT_SYMBOL(__wait_on_bit_lock); + +int __sched out_of_line_wait_on_bit_lock(void *word, int bit, +					int (*action)(void *), unsigned mode) +{ +	wait_queue_head_t *wq = bit_waitqueue(word, bit); +	DEFINE_WAIT_BIT(wait, word, bit); + +	return __wait_on_bit_lock(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); + +void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) +{ +	struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); +	if (waitqueue_active(wq)) +		__wake_up(wq, TASK_NORMAL, 1, &key); +} +EXPORT_SYMBOL(__wake_up_bit); + +/** + * wake_up_bit - wake up a waiter on a bit + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * + * There is a standard hashed waitqueue table for generic use. This + * is the part of the hashtable's accessor API that wakes up waiters + * on a bit. For instance, if one were to have waiters on a bitflag, + * one would call wake_up_bit() after clearing the bit. + * + * In order for this to function properly, as it uses waitqueue_active() + * internally, some kind of memory barrier must be done prior to calling + * this. Typically, this will be smp_mb__after_atomic(), but in some + * cases where bitflags are manipulated non-atomically under a lock, one + * may need to use a less regular barrier, such fs/inode.c's smp_mb(), + * because spin_unlock() does not guarantee a memory barrier. + */ +void wake_up_bit(void *word, int bit) +{ +	__wake_up_bit(bit_waitqueue(word, bit), word, bit); +} +EXPORT_SYMBOL(wake_up_bit); + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ +	const int shift = BITS_PER_LONG == 32 ? 5 : 6; +	const struct zone *zone = page_zone(virt_to_page(word)); +	unsigned long val = (unsigned long)word << shift | bit; + +	return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; +} +EXPORT_SYMBOL(bit_waitqueue); + +/* + * Manipulate the atomic_t address to produce a better bit waitqueue table hash + * index (we're keying off bit -1, but that would produce a horrible hash + * value). 
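The bit-wait helpers above pair with the hashed waitqueues returned by bit_waitqueue(); the usual pattern is wait_on_bit_lock() on one side and clear_bit() plus wake_up_bit() on the other. A sketch with an invented object and flag, following the barrier advice in the wake_up_bit() comment above:

	#include <linux/wait.h>
	#include <linux/sched.h>
	#include <linux/bitops.h>

	#define MY_OBJ_BUSY	0

	struct my_obj {
		unsigned long flags;
	};

	static int my_bit_wait(void *word)
	{
		schedule();			/* sleep until wake_up_bit() */
		return 0;
	}

	static int my_obj_lock(struct my_obj *obj)
	{
		/* sleeps until it can atomically set MY_OBJ_BUSY; 0 on success */
		return wait_on_bit_lock(&obj->flags, MY_OBJ_BUSY, my_bit_wait,
					TASK_UNINTERRUPTIBLE);
	}

	static void my_obj_unlock(struct my_obj *obj)
	{
		clear_bit(MY_OBJ_BUSY, &obj->flags);
		smp_mb__after_atomic();		/* order clear_bit() vs. waitqueue_active() */
		wake_up_bit(&obj->flags, MY_OBJ_BUSY);
	}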
+ */ +static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) +{ +	if (BITS_PER_LONG == 64) { +		unsigned long q = (unsigned long)p; +		return bit_waitqueue((void *)(q & ~1), q & 1); +	} +	return bit_waitqueue(p, 0); +} + +static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, +				  void *arg) +{ +	struct wait_bit_key *key = arg; +	struct wait_bit_queue *wait_bit +		= container_of(wait, struct wait_bit_queue, wait); +	atomic_t *val = key->flags; + +	if (wait_bit->key.flags != key->flags || +	    wait_bit->key.bit_nr != key->bit_nr || +	    atomic_read(val) != 0) +		return 0; +	return autoremove_wake_function(wait, mode, sync, key); +} + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, + * the actions of __wait_on_atomic_t() are permitted return codes.  Nonzero + * return codes halt waiting and return. + */ +static __sched +int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, +		       int (*action)(atomic_t *), unsigned mode) +{ +	atomic_t *val; +	int ret = 0; + +	do { +		prepare_to_wait(wq, &q->wait, mode); +		val = q->key.flags; +		if (atomic_read(val) == 0) +			break; +		ret = (*action)(val); +	} while (!ret && atomic_read(val) != 0); +	finish_wait(wq, &q->wait); +	return ret; +} + +#define DEFINE_WAIT_ATOMIC_T(name, p)					\ +	struct wait_bit_queue name = {					\ +		.key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p),		\ +		.wait	= {						\ +			.private	= current,			\ +			.func		= wake_atomic_t_function,	\ +			.task_list	=				\ +				LIST_HEAD_INIT((name).wait.task_list),	\ +		},							\ +	} + +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), +					 unsigned mode) +{ +	wait_queue_head_t *wq = atomic_t_waitqueue(p); +	DEFINE_WAIT_ATOMIC_T(wait, p); + +	return __wait_on_atomic_t(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); + +/** + * wake_up_atomic_t - Wake up a waiter on a atomic_t + * @p: The atomic_t being waited on, a kernel virtual address + * + * Wake up anyone waiting for the atomic_t to go to zero. + * + * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t + * check is done by the waiter's wake function, not the by the waker itself). + */ +void wake_up_atomic_t(atomic_t *p) +{ +	__wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); +} +EXPORT_SYMBOL(wake_up_atomic_t);  | 
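wait_on_atomic_t()/wake_up_atomic_t() reuse the same hashed table to let callers sleep until an atomic_t reaches zero, typically a reference count. An illustrative pairing, assuming the wait_on_atomic_t() wrapper from linux/wait.h (struct my_res and my_count_wait() are invented):

	#include <linux/wait.h>
	#include <linux/sched.h>
	#include <linux/atomic.h>

	struct my_res {
		atomic_t users;
	};

	static int my_count_wait(atomic_t *val)
	{
		schedule();			/* sleep until wake_up_atomic_t() */
		return 0;
	}

	static void my_res_put(struct my_res *res)
	{
		if (atomic_dec_and_test(&res->users))
			wake_up_atomic_t(&res->users);	/* last user gone, kick waiters */
	}

	static int my_res_wait_unused(struct my_res *res)
	{
		/* returns 0 once res->users has dropped to zero */
		return wait_on_atomic_t(&res->users, my_count_wait, TASK_UNINTERRUPTIBLE);
	}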
