Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--	kernel/sched/core.c	765
1 file changed, 124 insertions(+), 641 deletions(-)
| diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e8b335016c5..05c39f03031 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -370,13 +370,6 @@ static struct rq *this_rq_lock(void)  #ifdef CONFIG_SCHED_HRTICK  /*   * Use HR-timers to deliver accurate preemption points. - * - * Its all a bit involved since we cannot program an hrt while holding the - * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a - * reschedule event. - * - * When we get rescheduled we reprogram the hrtick_timer outside of the - * rq->lock.   */  static void hrtick_clear(struct rq *rq) @@ -404,6 +397,15 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)  }  #ifdef CONFIG_SMP + +static int __hrtick_restart(struct rq *rq) +{ +	struct hrtimer *timer = &rq->hrtick_timer; +	ktime_t time = hrtimer_get_softexpires(timer); + +	return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0); +} +  /*   * called from hardirq (IPI) context   */ @@ -412,7 +414,7 @@ static void __hrtick_start(void *arg)  	struct rq *rq = arg;  	raw_spin_lock(&rq->lock); -	hrtimer_restart(&rq->hrtick_timer); +	__hrtick_restart(rq);  	rq->hrtick_csd_pending = 0;  	raw_spin_unlock(&rq->lock);  } @@ -430,7 +432,7 @@ void hrtick_start(struct rq *rq, u64 delay)  	hrtimer_set_expires(timer, time);  	if (rq == this_rq()) { -		hrtimer_restart(timer); +		__hrtick_restart(rq);  	} else if (!rq->hrtick_csd_pending) {  		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);  		rq->hrtick_csd_pending = 1; @@ -679,7 +681,7 @@ void sched_avg_update(struct rq *rq)  {  	s64 period = sched_avg_period(); -	while ((s64)(rq->clock - rq->age_stamp) > period) { +	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {  		/*  		 * Inline assembly required to prevent the compiler  		 * optimising this loop into a divmod call. @@ -931,6 +933,8 @@ static int effective_prio(struct task_struct *p)  /**   * task_curr - is this task currently executing on a CPU?   * @p: the task in question. + * + * Return: 1 if the task is currently executing. 0 otherwise.   */  inline int task_curr(const struct task_struct *p)  { @@ -1340,7 +1344,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)  		p->sched_class->task_woken(rq, p);  	if (rq->idle_stamp) { -		u64 delta = rq->clock - rq->idle_stamp; +		u64 delta = rq_clock(rq) - rq->idle_stamp;  		u64 max = 2*sysctl_sched_migration_cost;  		if (delta > max) @@ -1377,6 +1381,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)  	rq = __task_rq_lock(p);  	if (p->on_rq) { +		/* check_preempt_curr() may use rq clock */ +		update_rq_clock(rq);  		ttwu_do_wakeup(rq, p, wake_flags);  		ret = 1;  	} @@ -1478,7 +1484,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)   * the simpler "current->state = TASK_RUNNING" to mark yourself   * runnable without the overhead of this.   * - * Returns %true if @p was woken up, %false if it was already running + * Return: %true if @p was woken up, %false if it was already running.   * or @state didn't match @p's state.   */  static int @@ -1487,7 +1493,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)  	unsigned long flags;  	int cpu, success = 0; -	smp_wmb(); +	/* +	 * If we are going to wake up a thread waiting for CONDITION we +	 * need to ensure that CONDITION=1 done by the caller can not be +	 * reordered with p->state check below. This pairs with mb() in +	 * set_current_state() the waiting thread does. 
+	 */ +	smp_mb__before_spinlock();  	raw_spin_lock_irqsave(&p->pi_lock, flags);  	if (!(p->state & state))  		goto out; @@ -1573,8 +1585,9 @@ out:   * @p: The process to be woken up.   *   * Attempt to wake up the nominated process and move it to the set of runnable - * processes.  Returns 1 if the process was woken up, 0 if it was already - * running. + * processes. + * + * Return: 1 if the process was woken up, 0 if it was already running.   *   * It may be assumed that this function implies a write memory barrier before   * changing the task state if and only if any tasks are woken up. @@ -1609,15 +1622,6 @@ static void __sched_fork(struct task_struct *p)  	p->se.vruntime			= 0;  	INIT_LIST_HEAD(&p->se.group_node); -/* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). - */ -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) -	p->se.avg.runnable_avg_period = 0; -	p->se.avg.runnable_avg_sum = 0; -#endif  #ifdef CONFIG_SCHEDSTATS  	memset(&p->se.statistics, 0, sizeof(p->se.statistics));  #endif @@ -1761,6 +1765,8 @@ void wake_up_new_task(struct task_struct *p)  	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));  #endif +	/* Initialize new task's runnable average */ +	init_task_runnable_average(p);  	rq = __task_rq_lock(p);  	activate_task(rq, p, 0);  	p->on_rq = 1; @@ -2069,575 +2075,6 @@ unsigned long nr_iowait_cpu(int cpu)  	return atomic_read(&this->nr_iowait);  } -unsigned long this_cpu_load(void) -{ -	struct rq *this = this_rq(); -	return this->cpu_load[0]; -} - - -/* - * Global load-average calculations - * - * We take a distributed and async approach to calculating the global load-avg - * in order to minimize overhead. - * - * The global load average is an exponentially decaying average of nr_running + - * nr_uninterruptible. - * - * Once every LOAD_FREQ: - * - *   nr_active = 0; - *   for_each_possible_cpu(cpu) - *   	nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; - * - *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) - * - * Due to a number of reasons the above turns in the mess below: - * - *  - for_each_possible_cpu() is prohibitively expensive on machines with - *    serious number of cpus, therefore we need to take a distributed approach - *    to calculating nr_active. - * - *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 - *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } - * - *    So assuming nr_active := 0 when we start out -- true per definition, we - *    can simply take per-cpu deltas and fold those into a global accumulate - *    to obtain the same result. See calc_load_fold_active(). - * - *    Furthermore, in order to avoid synchronizing all per-cpu delta folding - *    across the machine, we assume 10 ticks is sufficient time for every - *    cpu to have completed this task. - * - *    This places an upper-bound on the IRQ-off latency of the machine. Then - *    again, being late doesn't loose the delta, just wrecks the sample. - * - *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because - *    this would add another cross-cpu cacheline miss and atomic operation - *    to the wakeup path. Instead we increment on whatever cpu the task ran - *    when it went into uninterruptible state and decrement on whatever cpu - *    did the wakeup. This means that only the sum of nr_uninterruptible over - *    all cpus yields the correct result. 
- * - *  This covers the NO_HZ=n code, for extra head-aches, see the comment below. - */ - -/* Variables and functions for calc_load */ -static atomic_long_t calc_load_tasks; -static unsigned long calc_load_update; -unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); /* should be removed */ - -/** - * get_avenrun - get the load average array - * @loads:	pointer to dest load array - * @offset:	offset to add - * @shift:	shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ -	loads[0] = (avenrun[0] + offset) << shift; -	loads[1] = (avenrun[1] + offset) << shift; -	loads[2] = (avenrun[2] + offset) << shift; -} - -static long calc_load_fold_active(struct rq *this_rq) -{ -	long nr_active, delta = 0; - -	nr_active = this_rq->nr_running; -	nr_active += (long) this_rq->nr_uninterruptible; - -	if (nr_active != this_rq->calc_load_active) { -		delta = nr_active - this_rq->calc_load_active; -		this_rq->calc_load_active = nr_active; -	} - -	return delta; -} - -/* - * a1 = a0 * e + a * (1 - e) - */ -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ -	load *= exp; -	load += active * (FIXED_1 - exp); -	load += 1UL << (FSHIFT - 1); -	return load >> FSHIFT; -} - -#ifdef CONFIG_NO_HZ_COMMON -/* - * Handle NO_HZ for the global load-average. - * - * Since the above described distributed algorithm to compute the global - * load-average relies on per-cpu sampling from the tick, it is affected by - * NO_HZ. - * - * The basic idea is to fold the nr_active delta into a global idle-delta upon - * entering NO_HZ state such that we can include this as an 'extra' cpu delta - * when we read the global state. - * - * Obviously reality has to ruin such a delightfully simple scheme: - * - *  - When we go NO_HZ idle during the window, we can negate our sample - *    contribution, causing under-accounting. - * - *    We avoid this by keeping two idle-delta counters and flipping them - *    when the window starts, thus separating old and new NO_HZ load. - * - *    The only trick is the slight shift in index flip for read vs write. - * - *        0s            5s            10s           15s - *          +10           +10           +10           +10 - *        |-|-----------|-|-----------|-|-----------|-| - *    r:0 0 1           1 0           0 1           1 0 - *    w:0 1 1           0 0           1 1           0 0 - * - *    This ensures we'll fold the old idle contribution in this window while - *    accumlating the new one. - * - *  - When we wake up from NO_HZ idle during the window, we push up our - *    contribution, since we effectively move our sample point to a known - *    busy state. - * - *    This is solved by pushing the window forward, and thus skipping the - *    sample, for this cpu (effectively using the idle-delta for this cpu which - *    was in effect at the time the window opened). This also solves the issue - *    of having to deal with a cpu having been in NOHZ idle for multiple - *    LOAD_FREQ intervals. - * - * When making the ILB scale, we should try to pull this in as well. - */ -static atomic_long_t calc_load_idle[2]; -static int calc_load_idx; - -static inline int calc_load_write_idx(void) -{ -	int idx = calc_load_idx; - -	/* -	 * See calc_global_nohz(), if we observe the new index, we also -	 * need to observe the new update time. 
-	 */ -	smp_rmb(); - -	/* -	 * If the folding window started, make sure we start writing in the -	 * next idle-delta. -	 */ -	if (!time_before(jiffies, calc_load_update)) -		idx++; - -	return idx & 1; -} - -static inline int calc_load_read_idx(void) -{ -	return calc_load_idx & 1; -} - -void calc_load_enter_idle(void) -{ -	struct rq *this_rq = this_rq(); -	long delta; - -	/* -	 * We're going into NOHZ mode, if there's any pending delta, fold it -	 * into the pending idle delta. -	 */ -	delta = calc_load_fold_active(this_rq); -	if (delta) { -		int idx = calc_load_write_idx(); -		atomic_long_add(delta, &calc_load_idle[idx]); -	} -} - -void calc_load_exit_idle(void) -{ -	struct rq *this_rq = this_rq(); - -	/* -	 * If we're still before the sample window, we're done. -	 */ -	if (time_before(jiffies, this_rq->calc_load_update)) -		return; - -	/* -	 * We woke inside or after the sample window, this means we're already -	 * accounted through the nohz accounting, so skip the entire deal and -	 * sync up for the next window. -	 */ -	this_rq->calc_load_update = calc_load_update; -	if (time_before(jiffies, this_rq->calc_load_update + 10)) -		this_rq->calc_load_update += LOAD_FREQ; -} - -static long calc_load_fold_idle(void) -{ -	int idx = calc_load_read_idx(); -	long delta = 0; - -	if (atomic_long_read(&calc_load_idle[idx])) -		delta = atomic_long_xchg(&calc_load_idle[idx], 0); - -	return delta; -} - -/** - * fixed_power_int - compute: x^n, in O(log n) time - * - * @x:         base of the power - * @frac_bits: fractional bits of @x - * @n:         power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. - */ -static unsigned long -fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -{ -	unsigned long result = 1UL << frac_bits; - -	if (n) for (;;) { -		if (n & 1) { -			result *= x; -			result += 1UL << (frac_bits - 1); -			result >>= frac_bits; -		} -		n >>= 1; -		if (!n) -			break; -		x *= x; -		x += 1UL << (frac_bits - 1); -		x >>= frac_bits; -	} - -	return result; -} - -/* - * a1 = a0 * e + a * (1 - e) - * - * a2 = a1 * e + a * (1 - e) - *    = (a0 * e + a * (1 - e)) * e + a * (1 - e) - *    = a0 * e^2 + a * (1 - e) * (1 + e) - * - * a3 = a2 * e + a * (1 - e) - *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) - *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2) - * - *  ... - * - * an = a0 * e^n + a * (1 - e) * (1 + e + ... 
+ e^n-1) [1] - *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) - *    = a0 * e^n + a * (1 - e^n) - * - * [1] application of the geometric series: - * - *              n         1 - x^(n+1) - *     S_n := \Sum x^i = ------------- - *             i=0          1 - x - */ -static unsigned long -calc_load_n(unsigned long load, unsigned long exp, -	    unsigned long active, unsigned int n) -{ - -	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -} - -/* - * NO_HZ can leave us missing all per-cpu ticks calling - * calc_load_account_active(), but since an idle CPU folds its delta into - * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold - * in the pending idle delta if our idle period crossed a load cycle boundary. - * - * Once we've updated the global active value, we need to apply the exponential - * weights adjusted to the number of cycles missed. - */ -static void calc_global_nohz(void) -{ -	long delta, active, n; - -	if (!time_before(jiffies, calc_load_update + 10)) { -		/* -		 * Catch-up, fold however many we are behind still -		 */ -		delta = jiffies - calc_load_update - 10; -		n = 1 + (delta / LOAD_FREQ); - -		active = atomic_long_read(&calc_load_tasks); -		active = active > 0 ? active * FIXED_1 : 0; - -		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); -		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); -		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - -		calc_load_update += n * LOAD_FREQ; -	} - -	/* -	 * Flip the idle index... -	 * -	 * Make sure we first write the new time then flip the index, so that -	 * calc_load_write_idx() will see the new time when it reads the new -	 * index, this avoids a double flip messing things up. -	 */ -	smp_wmb(); -	calc_load_idx++; -} -#else /* !CONFIG_NO_HZ_COMMON */ - -static inline long calc_load_fold_idle(void) { return 0; } -static inline void calc_global_nohz(void) { } - -#endif /* CONFIG_NO_HZ_COMMON */ - -/* - * calc_load - update the avenrun load estimates 10 ticks after the - * CPUs have updated calc_load_tasks. - */ -void calc_global_load(unsigned long ticks) -{ -	long active, delta; - -	if (time_before(jiffies, calc_load_update + 10)) -		return; - -	/* -	 * Fold the 'old' idle-delta to include all NO_HZ cpus. -	 */ -	delta = calc_load_fold_idle(); -	if (delta) -		atomic_long_add(delta, &calc_load_tasks); - -	active = atomic_long_read(&calc_load_tasks); -	active = active > 0 ? active * FIXED_1 : 0; - -	avenrun[0] = calc_load(avenrun[0], EXP_1, active); -	avenrun[1] = calc_load(avenrun[1], EXP_5, active); -	avenrun[2] = calc_load(avenrun[2], EXP_15, active); - -	calc_load_update += LOAD_FREQ; - -	/* -	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. -	 */ -	calc_global_nohz(); -} - -/* - * Called from update_cpu_load() to periodically update this CPU's - * active count. 
- */ -static void calc_load_account_active(struct rq *this_rq) -{ -	long delta; - -	if (time_before(jiffies, this_rq->calc_load_update)) -		return; - -	delta  = calc_load_fold_active(this_rq); -	if (delta) -		atomic_long_add(delta, &calc_load_tasks); - -	this_rq->calc_load_update += LOAD_FREQ; -} - -/* - * End of global load-average stuff - */ - -/* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load - * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load - * - * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load - * - * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. - */ -#define DEGRADE_SHIFT		7 -static const unsigned char -		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char -		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { -					{0, 0, 0, 0, 0, 0, 0, 0}, -					{64, 32, 8, 0, 0, 0, 0, 0}, -					{96, 72, 40, 12, 1, 0, 0}, -					{112, 98, 75, 43, 15, 1, 0}, -					{120, 112, 98, 76, 45, 16, 2} }; - -/* - * Update cpu_load for any missed ticks, due to tickless idle. The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. - */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ -	int j = 0; - -	if (!missed_updates) -		return load; - -	if (missed_updates >= degrade_zero_ticks[idx]) -		return 0; - -	if (idx == 1) -		return load >> missed_updates; - -	while (missed_updates) { -		if (missed_updates % 2) -			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - -		missed_updates >>= 1; -		j++; -	} -	return load; -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. - */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, -			      unsigned long pending_updates) -{ -	int i, scale; - -	this_rq->nr_load_updates++; - -	/* Update our load: */ -	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ -	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { -		unsigned long old_load, new_load; - -		/* scale is effectively 1 << i now, and >> i divides by scale */ - -		old_load = this_rq->cpu_load[i]; -		old_load = decay_load_missed(old_load, pending_updates - 1, i); -		new_load = this_load; -		/* -		 * Round up the averaging division if load is increasing. 
This -		 * prevents us from getting stuck on 9 if the load is 10, for -		 * example. -		 */ -		if (new_load > old_load) -			new_load += scale - 1; - -		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; -	} - -	sched_avg_update(this_rq); -} - -#ifdef CONFIG_NO_HZ_COMMON -/* - * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we cannot use the delta approach from the regular tick since that - * would seriously skew the load calculation. However we'll make do for those - * updates happening while idle (nohz_idle_balance) or coming out of idle - * (tick_nohz_idle_exit). - * - * This means we might still be one tick off for nohz periods. - */ - -/* - * Called from nohz_idle_balance() to update the load ratings before doing the - * idle balance. - */ -void update_idle_cpu_load(struct rq *this_rq) -{ -	unsigned long curr_jiffies = ACCESS_ONCE(jiffies); -	unsigned long load = this_rq->load.weight; -	unsigned long pending_updates; - -	/* -	 * bail if there's load or we're actually up-to-date. -	 */ -	if (load || curr_jiffies == this_rq->last_load_update_tick) -		return; - -	pending_updates = curr_jiffies - this_rq->last_load_update_tick; -	this_rq->last_load_update_tick = curr_jiffies; - -	__update_cpu_load(this_rq, load, pending_updates); -} - -/* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. - */ -void update_cpu_load_nohz(void) -{ -	struct rq *this_rq = this_rq(); -	unsigned long curr_jiffies = ACCESS_ONCE(jiffies); -	unsigned long pending_updates; - -	if (curr_jiffies == this_rq->last_load_update_tick) -		return; - -	raw_spin_lock(&this_rq->lock); -	pending_updates = curr_jiffies - this_rq->last_load_update_tick; -	if (pending_updates) { -		this_rq->last_load_update_tick = curr_jiffies; -		/* -		 * We were idle, this means load 0, the current load might be -		 * !0 due to remote wakeups and the sort. -		 */ -		__update_cpu_load(this_rq, 0, pending_updates); -	} -	raw_spin_unlock(&this_rq->lock); -} -#endif /* CONFIG_NO_HZ_COMMON */ - -/* - * Called from scheduler_tick() - */ -static void update_cpu_load_active(struct rq *this_rq) -{ -	/* -	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). -	 */ -	this_rq->last_load_update_tick = jiffies; -	__update_cpu_load(this_rq, this_rq->load.weight, 1); - -	calc_load_account_active(this_rq); -} -  #ifdef CONFIG_SMP  /* @@ -2686,7 +2123,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)  	if (task_current(rq, p)) {  		update_rq_clock(rq); -		ns = rq->clock_task - p->se.exec_start; +		ns = rq_clock_task(rq) - p->se.exec_start;  		if ((s64)ns < 0)  			ns = 0;  	} @@ -2739,8 +2176,8 @@ void scheduler_tick(void)  	raw_spin_lock(&rq->lock);  	update_rq_clock(rq); -	update_cpu_load_active(rq);  	curr->sched_class->task_tick(rq, curr, 0); +	update_cpu_load_active(rq);  	raw_spin_unlock(&rq->lock);  	perf_event_task_tick(); @@ -2763,6 +2200,8 @@ void scheduler_tick(void)   * This makes sure that uptime, CFS vruntime, load   * balancing, etc... continue to move forward, even   * with a very low granularity. + * + * Return: Maximum deferment in nanoseconds.   
*/  u64 scheduler_tick_max_deferment(void)  { @@ -2966,6 +2405,12 @@ need_resched:  	if (sched_feat(HRTICK))  		hrtick_clear(rq); +	/* +	 * Make sure that signal_pending_state()->signal_pending() below +	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) +	 * done by the caller to avoid the race with signal_wake_up(). +	 */ +	smp_mb__before_spinlock();  	raw_spin_lock_irq(&rq->lock);  	switch_count = &prev->nivcsw; @@ -3368,8 +2813,8 @@ EXPORT_SYMBOL(wait_for_completion);   * specified timeout to expire. The timeout is in jiffies. It is not   * interruptible.   * - * The return value is 0 if timed out, and positive (at least 1, or number of - * jiffies left till timeout) if completed. + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed.   */  unsigned long __sched  wait_for_completion_timeout(struct completion *x, unsigned long timeout) @@ -3401,8 +2846,8 @@ EXPORT_SYMBOL(wait_for_completion_io);   * specified timeout to expire. The timeout is in jiffies. It is not   * interruptible. The caller is accounted as waiting for IO.   * - * The return value is 0 if timed out, and positive (at least 1, or number of - * jiffies left till timeout) if completed. + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed.   */  unsigned long __sched  wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) @@ -3418,7 +2863,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout);   * This waits for completion of a specific task to be signaled. It is   * interruptible.   * - * The return value is -ERESTARTSYS if interrupted, 0 if completed. + * Return: -ERESTARTSYS if interrupted, 0 if completed.   */  int __sched wait_for_completion_interruptible(struct completion *x)  { @@ -3437,8 +2882,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);   * This waits for either a completion of a specific task to be signaled or for a   * specified timeout to expire. It is interruptible. The timeout is in jiffies.   * - * The return value is -ERESTARTSYS if interrupted, 0 if timed out, - * positive (at least 1, or number of jiffies left till timeout) if completed. + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed.   */  long __sched  wait_for_completion_interruptible_timeout(struct completion *x, @@ -3455,7 +2900,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);   * This waits to be signaled for completion of a specific task. It can be   * interrupted by a kill signal.   * - * The return value is -ERESTARTSYS if interrupted, 0 if completed. + * Return: -ERESTARTSYS if interrupted, 0 if completed.   */  int __sched wait_for_completion_killable(struct completion *x)  { @@ -3475,8 +2920,8 @@ EXPORT_SYMBOL(wait_for_completion_killable);   * signaled or for a specified timeout to expire. It can be   * interrupted by a kill signal. The timeout is in jiffies.   * - * The return value is -ERESTARTSYS if interrupted, 0 if timed out, - * positive (at least 1, or number of jiffies left till timeout) if completed. + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed.   
*/  long __sched  wait_for_completion_killable_timeout(struct completion *x, @@ -3490,7 +2935,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout);   *	try_wait_for_completion - try to decrement a completion without blocking   *	@x:	completion structure   * - *	Returns: 0 if a decrement cannot be done without blocking + *	Return: 0 if a decrement cannot be done without blocking   *		 1 if a decrement succeeded.   *   *	If a completion is being used as a counting completion, @@ -3517,7 +2962,7 @@ EXPORT_SYMBOL(try_wait_for_completion);   *	completion_done - Test to see if a completion has any waiters   *	@x:	completion structure   * - *	Returns: 0 if there are waiters (wait_for_completion() in progress) + *	Return: 0 if there are waiters (wait_for_completion() in progress)   *		 1 if there are no waiters.   *   */ @@ -3754,7 +3199,7 @@ SYSCALL_DEFINE1(nice, int, increment)   * task_prio - return the priority value of a given task.   * @p: the task in question.   * - * This is the priority value as seen by users in /proc. + * Return: The priority value as seen by users in /proc.   * RT tasks are offset by -200. Normal tasks are centered   * around 0, value goes from -16 to +15.   */ @@ -3766,6 +3211,8 @@ int task_prio(const struct task_struct *p)  /**   * task_nice - return the nice value of a given task.   * @p: the task in question. + * + * Return: The nice value [ -20 ... 0 ... 19 ].   */  int task_nice(const struct task_struct *p)  { @@ -3776,6 +3223,8 @@ EXPORT_SYMBOL(task_nice);  /**   * idle_cpu - is a given cpu idle currently?   * @cpu: the processor in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise.   */  int idle_cpu(int cpu)  { @@ -3798,6 +3247,8 @@ int idle_cpu(int cpu)  /**   * idle_task - return the idle task for a given cpu.   * @cpu: the processor in question. + * + * Return: The idle task for the cpu @cpu.   */  struct task_struct *idle_task(int cpu)  { @@ -3807,6 +3258,8 @@ struct task_struct *idle_task(int cpu)  /**   * find_process_by_pid - find a process with a matching PID value.   * @pid: the pid in question. + * + * The task of @pid, if found. %NULL otherwise.   */  static struct task_struct *find_process_by_pid(pid_t pid)  { @@ -4004,6 +3457,8 @@ recheck:   * @policy: new policy.   * @param: structure containing the new RT priority.   * + * Return: 0 on success. An error code otherwise. + *   * NOTE that the task may be already dead.   */  int sched_setscheduler(struct task_struct *p, int policy, @@ -4023,6 +3478,8 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);   * current context has permission.  For example, this is needed in   * stop_machine(): we create temporary high priority worker threads,   * but our caller might not have that capability. + * + * Return: 0 on success. An error code otherwise.   */  int sched_setscheduler_nocheck(struct task_struct *p, int policy,  			       const struct sched_param *param) @@ -4057,6 +3514,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)   * @pid: the pid in question.   * @policy: new policy.   * @param: structure containing the new RT priority. + * + * Return: 0 on success. An error code otherwise.   */  SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,  		struct sched_param __user *, param) @@ -4072,6 +3531,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,   * sys_sched_setparam - set/change the RT priority of a thread   * @pid: the pid in question.   * @param: structure containing the new RT priority. + * + * Return: 0 on success. 
An error code otherwise.   */  SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)  { @@ -4081,6 +3542,9 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)  /**   * sys_sched_getscheduler - get the policy (scheduling class) of a thread   * @pid: the pid in question. + * + * Return: On success, the policy of the thread. Otherwise, a negative error + * code.   */  SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)  { @@ -4107,6 +3571,9 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)   * sys_sched_getparam - get the RT priority of a thread   * @pid: the pid in question.   * @param: structure containing the RT priority. + * + * Return: On success, 0 and the RT priority is in @param. Otherwise, an error + * code.   */  SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)  { @@ -4231,6 +3698,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,   * @pid: pid of the process   * @len: length in bytes of the bitmask pointed to by user_mask_ptr   * @user_mask_ptr: user-space pointer to the new cpu mask + * + * Return: 0 on success. An error code otherwise.   */  SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,  		unsigned long __user *, user_mask_ptr) @@ -4282,6 +3751,8 @@ out_unlock:   * @pid: pid of the process   * @len: length in bytes of the bitmask pointed to by user_mask_ptr   * @user_mask_ptr: user-space pointer to hold the current cpu mask + * + * Return: 0 on success. An error code otherwise.   */  SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,  		unsigned long __user *, user_mask_ptr) @@ -4316,6 +3787,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,   *   * This function yields the current CPU to other tasks. If there are no   * other threads running on this CPU then this function will return. + * + * Return: 0.   */  SYSCALL_DEFINE0(sched_yield)  { @@ -4441,7 +3914,7 @@ EXPORT_SYMBOL(yield);   * It's the caller's job to ensure that the target task struct   * can't go away on us before we can do any checks.   * - * Returns: + * Return:   *	true (>0) if we indeed boosted the target task.   *	false (0) if we failed to boost the target.   *	-ESRCH if there's no task to yield to. @@ -4544,8 +4017,9 @@ long __sched io_schedule_timeout(long timeout)   * sys_sched_get_priority_max - return maximum RT priority.   * @policy: scheduling class.   * - * this syscall returns the maximum rt_priority that can be used - * by a given scheduling class. + * Return: On success, this syscall returns the maximum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned.   */  SYSCALL_DEFINE1(sched_get_priority_max, int, policy)  { @@ -4569,8 +4043,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)   * sys_sched_get_priority_min - return minimum RT priority.   * @policy: scheduling class.   * - * this syscall returns the minimum rt_priority that can be used - * by a given scheduling class. + * Return: On success, this syscall returns the minimum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned.   */  SYSCALL_DEFINE1(sched_get_priority_min, int, policy)  { @@ -4596,6 +4071,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)   *   * this syscall writes the default timeslice value of a given process   * into the user-space timespec buffer. A value of '0' means infinity. 
+ * + * Return: On success, 0 and the timeslice is in @interval. Otherwise, + * an error code.   */  SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,  		struct timespec __user *, interval) @@ -4705,7 +4183,7 @@ void show_state_filter(unsigned long state_filter)  		debug_show_all_locks();  } -void __cpuinit init_idle_bootup_task(struct task_struct *idle) +void init_idle_bootup_task(struct task_struct *idle)  {  	idle->sched_class = &idle_sched_class;  } @@ -4718,7 +4196,7 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle)   * NOTE: this function does not set the idle thread's NEED_RESCHED   * flag, to make booting more robust.   */ -void __cpuinit init_idle(struct task_struct *idle, int cpu) +void init_idle(struct task_struct *idle, int cpu)  {  	struct rq *rq = cpu_rq(cpu);  	unsigned long flags; @@ -4960,6 +4438,13 @@ static void migrate_tasks(unsigned int dead_cpu)  	 */  	rq->stop = NULL; +	/* +	 * put_prev_task() and pick_next_task() sched +	 * class method both need to have an up-to-date +	 * value of rq->clock[_task] +	 */ +	update_rq_clock(rq); +  	for ( ; ; ) {  		/*  		 * There's this thread running, bail when that's the only @@ -5093,7 +4578,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)  	return table;  } -static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)  {  	struct ctl_table *entry, *table;  	struct sched_domain *sd; @@ -5195,7 +4680,7 @@ static void set_rq_offline(struct rq *rq)   * migration_call - callback that gets triggered when a CPU is added.   * Here we can start up the necessary migration thread for the new CPU.   */ -static int __cpuinit +static int  migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)  {  	int cpu = (long)hcpu; @@ -5249,12 +4734,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)   * happens before everything else.  This has to be lower priority than   * the notifier in the perf_event subsystem, though.   
*/ -static struct notifier_block __cpuinitdata migration_notifier = { +static struct notifier_block migration_notifier = {  	.notifier_call = migration_call,  	.priority = CPU_PRI_MIGRATION,  }; -static int __cpuinit sched_cpu_active(struct notifier_block *nfb, +static int sched_cpu_active(struct notifier_block *nfb,  				      unsigned long action, void *hcpu)  {  	switch (action & ~CPU_TASKS_FROZEN) { @@ -5267,7 +4752,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,  	}  } -static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, +static int sched_cpu_inactive(struct notifier_block *nfb,  					unsigned long action, void *hcpu)  {  	switch (action & ~CPU_TASKS_FROZEN) { @@ -5907,7 +5392,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)  	get_group(cpu, sdd, &sd->groups);  	atomic_inc(&sd->groups->ref); -	if (cpu != cpumask_first(sched_domain_span(sd))) +	if (cpu != cpumask_first(span))  		return 0;  	lockdep_assert_held(&sched_domains_mutex); @@ -5917,12 +5402,12 @@ build_sched_groups(struct sched_domain *sd, int cpu)  	for_each_cpu(i, span) {  		struct sched_group *sg; -		int group = get_group(i, sdd, &sg); -		int j; +		int group, j;  		if (cpumask_test_cpu(i, covered))  			continue; +		group = get_group(i, sdd, &sg);  		cpumask_clear(sched_group_cpus(sg));  		sg->sgp->power = 0;  		cpumask_setall(sched_group_mask(sg)); @@ -5960,7 +5445,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)  {  	struct sched_group *sg = sd->groups; -	WARN_ON(!sd || !sg); +	WARN_ON(!sg);  	do {  		sg->group_weight = cpumask_weight(sched_group_cpus(sg)); @@ -6125,6 +5610,9 @@ static struct sched_domain_topology_level default_topology[] = {  static struct sched_domain_topology_level *sched_domain_topology = default_topology; +#define for_each_sd_topology(tl)			\ +	for (tl = sched_domain_topology; tl->init; tl++) +  #ifdef CONFIG_NUMA  static int sched_domains_numa_levels; @@ -6422,7 +5910,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)  	struct sched_domain_topology_level *tl;  	int j; -	for (tl = sched_domain_topology; tl->init; tl++) { +	for_each_sd_topology(tl) {  		struct sd_data *sdd = &tl->data;  		sdd->sd = alloc_percpu(struct sched_domain *); @@ -6475,7 +5963,7 @@ static void __sdt_free(const struct cpumask *cpu_map)  	struct sched_domain_topology_level *tl;  	int j; -	for (tl = sched_domain_topology; tl->init; tl++) { +	for_each_sd_topology(tl) {  		struct sd_data *sdd = &tl->data;  		for_each_cpu(j, cpu_map) { @@ -6503,9 +5991,8 @@ static void __sdt_free(const struct cpumask *cpu_map)  }  struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, -		struct s_data *d, const struct cpumask *cpu_map, -		struct sched_domain_attr *attr, struct sched_domain *child, -		int cpu) +		const struct cpumask *cpu_map, struct sched_domain_attr *attr, +		struct sched_domain *child, int cpu)  {  	struct sched_domain *sd = tl->init(tl, cpu);  	if (!sd) @@ -6516,8 +6003,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,  		sd->level = child->level + 1;  		sched_domain_level_max = max(sched_domain_level_max, sd->level);  		child->parent = sd; +		sd->child = child;  	} -	sd->child = child;  	set_domain_attribute(sd, attr);  	return sd; @@ -6530,7 +6017,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,  static int build_sched_domains(const struct cpumask *cpu_map,  			       struct sched_domain_attr *attr)  { -	enum s_alloc alloc_state = sa_none; +	enum 
s_alloc alloc_state;  	struct sched_domain *sd;  	struct s_data d;  	int i, ret = -ENOMEM; @@ -6544,18 +6031,15 @@ static int build_sched_domains(const struct cpumask *cpu_map,  		struct sched_domain_topology_level *tl;  		sd = NULL; -		for (tl = sched_domain_topology; tl->init; tl++) { -			sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); +		for_each_sd_topology(tl) { +			sd = build_sched_domain(tl, cpu_map, attr, sd, i); +			if (tl == sched_domain_topology) +				*per_cpu_ptr(d.sd, i) = sd;  			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))  				sd->flags |= SD_OVERLAP;  			if (cpumask_equal(cpu_map, sched_domain_span(sd)))  				break;  		} - -		while (sd->child) -			sd = sd->child; - -		*per_cpu_ptr(d.sd, i) = sd;  	}  	/* Build the groups for the domains */ @@ -6867,9 +6351,6 @@ void __init sched_init_smp(void)  	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);  	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); -	/* RT runtime code needs to handle some hotplug events */ -	hotcpu_notifier(update_runtime, 0); -  	init_hrtick();  	/* Move init over to a non-isolated CPU */ @@ -7201,6 +6682,8 @@ void normalize_rt_tasks(void)   * @cpu: the processor in question.   *   * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + * + * Return: The current task for @cpu.   */  struct task_struct *curr_task(int cpu)  { | 
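
A few notes on the hunks above, with small illustrative sketches; none of the code below is part of the patch itself.

The smp_mb__before_spinlock() added to try_to_wake_up() (and, in a later hunk, before taking rq->lock in the scheduler core) exists so that the classic sleep/wakeup idiom cannot lose a wakeup. A minimal kernel-style sketch of that idiom, assuming a shared flag CONDITION and a task pointer sleeper as placeholders rather than identifiers from this patch:

	/* Sleeper: set_current_state() implies a full barrier, so the
	 * TASK_INTERRUPTIBLE store is ordered before the CONDITION load. */
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (CONDITION)
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);

	/* Waker: wake_up_process() calls into try_to_wake_up(), where the
	 * new smp_mb__before_spinlock() keeps this CONDITION store from
	 * being reordered past the p->state check done under pi_lock. */
	CONDITION = 1;
	wake_up_process(sleeper);

Without the barrier on the waker side, the p->state load could be satisfied before the CONDITION store became visible: the waker would see a non-matching state and skip the wakeup, while the sleeper had already sampled CONDITION as clear and blocked, so the wakeup would be lost.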
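
The bulk of the deletions are the global load-average (avenrun) and cpu_load machinery, which this series moves out of core.c rather than deleting outright (it ends up in kernel/sched/proc.c). The fixed-point math described by the removed comments is easy to check in isolation. Below is a stand-alone user-space rendering of calc_load() and fixed_power_int() as they appear in the hunks above; the constants FSHIFT = 11 and EXP_1 = 1884 (roughly 1/e^(5s/1min) in fixed point) follow the kernel's usual definitions and are assumptions here, not part of this diff.

	#include <stdio.h>

	#define FSHIFT	11			/* bits of fraction */
	#define FIXED_1	(1UL << FSHIFT)		/* 1.0 in fixed point */
	#define EXP_1	1884			/* ~1/exp(5sec/1min) */

	/* a1 = a0 * e + a * (1 - e), with rounding -- as in the hunk above */
	static unsigned long calc_load(unsigned long load, unsigned long exp,
				       unsigned long active)
	{
		load *= exp;
		load += active * (FIXED_1 - exp);
		load += 1UL << (FSHIFT - 1);
		return load >> FSHIFT;
	}

	/* x^n in O(log n) by binary exponentiation, as in fixed_power_int() */
	static unsigned long fixed_power_int(unsigned long x,
					     unsigned int frac_bits,
					     unsigned int n)
	{
		unsigned long result = 1UL << frac_bits;

		while (n) {
			if (n & 1) {
				result *= x;
				result += 1UL << (frac_bits - 1);
				result >>= frac_bits;
			}
			n >>= 1;
			x *= x;
			x += 1UL << (frac_bits - 1);
			x >>= frac_bits;
		}
		return result;
	}

	int main(void)
	{
		unsigned long active = 4 * FIXED_1;	/* 4 runnable tasks */
		unsigned long iter = 0, bulk;
		int i;

		for (i = 0; i < 12; i++)	/* twelve 5s ticks = 1 minute */
			iter = calc_load(iter, EXP_1, active);

		/* one bulk step over the same 12 ticks, calc_load_n() style */
		bulk = calc_load(0, fixed_power_int(EXP_1, FSHIFT, 12), active);

		printf("iterative %lu.%02lu  bulk %lu.%02lu\n",
		       iter >> FSHIFT, (iter & (FIXED_1 - 1)) * 100 / FIXED_1,
		       bulk >> FSHIFT, (bulk & (FIXED_1 - 1)) * 100 / FIXED_1);
		return 0;
	}

The point of fixed_power_int() is exactly the catch-up path in calc_global_nohz(): applying the one-step decay n times and applying a single decay by exp^n agree up to fixed-point rounding, so a machine that idled through several LOAD_FREQ windows can fold them all in one go.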
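
One more detail from the removed __update_cpu_load(): the "round up the averaging division if load is increasing" comment is straightforward to verify. With idx = 1 the average is new = (old + cur) / 2; chasing a constant load of 10 with plain truncation stalls at 9, whereas adding scale - 1 to a rising cur, as the kernel does, lets it reach 10. A tiny check of just that arithmetic, again not kernel code:

	#include <stdio.h>

	/* one cpu_load[idx] step, mirroring the removed __update_cpu_load() */
	static unsigned long step(unsigned long old, unsigned long cur,
				  int idx, int round_up)
	{
		unsigned long scale = 1UL << idx;

		if (round_up && cur > old)
			cur += scale - 1;	/* the rounding the comment documents */
		return (old * (scale - 1) + cur) >> idx;
	}

	int main(void)
	{
		unsigned long plain = 0, rounded = 0;
		int i;

		for (i = 0; i < 20; i++) {
			plain   = step(plain,   10, 1, 0);
			rounded = step(rounded, 10, 1, 1);
		}
		printf("truncating: %lu  rounded up: %lu\n", plain, rounded);
		/* prints "truncating: 9  rounded up: 10" */
		return 0;
	}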
