diff options
| author | Paul Mundt <lethal@linux-sh.org> | 2011-01-13 15:06:28 +0900 | 
|---|---|---|
| committer | Paul Mundt <lethal@linux-sh.org> | 2011-01-13 15:06:28 +0900 | 
| commit | f43dc23d5ea91fca257be02138a255f02d98e806 (patch) | |
| tree | b29722f6e965316e90ac97abf79923ced250dc21 /kernel/sched.c | |
| parent | f8e53553f452dcbf67cb89c8cba63a1cd6eb4cc0 (diff) | |
| parent | 4162cf64973df51fc885825bc9ca4d055891c49f (diff) | |
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6 into common/serial-rework
Conflicts:
	arch/sh/kernel/cpu/sh2/setup-sh7619.c
	arch/sh/kernel/cpu/sh2a/setup-mxg.c
	arch/sh/kernel/cpu/sh2a/setup-sh7201.c
	arch/sh/kernel/cpu/sh2a/setup-sh7203.c
	arch/sh/kernel/cpu/sh2a/setup-sh7206.c
	arch/sh/kernel/cpu/sh3/setup-sh7705.c
	arch/sh/kernel/cpu/sh3/setup-sh770x.c
	arch/sh/kernel/cpu/sh3/setup-sh7710.c
	arch/sh/kernel/cpu/sh3/setup-sh7720.c
	arch/sh/kernel/cpu/sh4/setup-sh4-202.c
	arch/sh/kernel/cpu/sh4/setup-sh7750.c
	arch/sh/kernel/cpu/sh4/setup-sh7760.c
	arch/sh/kernel/cpu/sh4a/setup-sh7343.c
	arch/sh/kernel/cpu/sh4a/setup-sh7366.c
	arch/sh/kernel/cpu/sh4a/setup-sh7722.c
	arch/sh/kernel/cpu/sh4a/setup-sh7723.c
	arch/sh/kernel/cpu/sh4a/setup-sh7724.c
	arch/sh/kernel/cpu/sh4a/setup-sh7763.c
	arch/sh/kernel/cpu/sh4a/setup-sh7770.c
	arch/sh/kernel/cpu/sh4a/setup-sh7780.c
	arch/sh/kernel/cpu/sh4a/setup-sh7785.c
	arch/sh/kernel/cpu/sh4a/setup-sh7786.c
	arch/sh/kernel/cpu/sh4a/setup-shx3.c
	arch/sh/kernel/cpu/sh5/setup-sh5.c
	drivers/serial/sh-sci.c
	drivers/serial/sh-sci.h
	include/linux/serial_sci.h
Diffstat (limited to 'kernel/sched.c')
| -rw-r--r-- | kernel/sched.c | 6016 | 
1 files changed, 2350 insertions, 3666 deletions
| diff --git a/kernel/sched.c b/kernel/sched.c index 7c9098d186e..a0eb0941fa8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,7 +39,7 @@  #include <linux/completion.h>  #include <linux/kernel_stat.h>  #include <linux/debug_locks.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h>  #include <linux/security.h>  #include <linux/notifier.h>  #include <linux/profile.h> @@ -55,16 +55,15 @@  #include <linux/cpu.h>  #include <linux/cpuset.h>  #include <linux/percpu.h> -#include <linux/kthread.h>  #include <linux/proc_fs.h>  #include <linux/seq_file.h> +#include <linux/stop_machine.h>  #include <linux/sysctl.h>  #include <linux/syscalls.h>  #include <linux/times.h>  #include <linux/tsacct_kern.h>  #include <linux/kprobes.h>  #include <linux/delayacct.h> -#include <linux/reciprocal_div.h>  #include <linux/unistd.h>  #include <linux/pagemap.h>  #include <linux/hrtimer.h> @@ -72,11 +71,15 @@  #include <linux/debugfs.h>  #include <linux/ctype.h>  #include <linux/ftrace.h> +#include <linux/slab.h>  #include <asm/tlb.h>  #include <asm/irq_regs.h> +#include <asm/mutex.h>  #include "sched_cpupri.h" +#include "workqueue_sched.h" +#include "sched_autogroup.h"  #define CREATE_TRACE_POINTS  #include <trace/events/sched.h> @@ -120,30 +123,6 @@   */  #define RUNTIME_INF	((u64)~0ULL) -#ifdef CONFIG_SMP - -static void double_rq_lock(struct rq *rq1, struct rq *rq2); - -/* - * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) - * Since cpu_power is a 'constant', we can use a reciprocal divide. 
- */ -static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) -{ -	return reciprocal_divide(load, sg->reciprocal_cpu_power); -} - -/* - * Each time a sched group cpu_power is changed, - * we must compute its reciprocal value - */ -static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) -{ -	sg->__cpu_power += val; -	sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); -} -#endif -  static inline int rt_policy(int policy)  {  	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) @@ -166,7 +145,7 @@ struct rt_prio_array {  struct rt_bandwidth {  	/* nests inside the rq lock: */ -	spinlock_t		rt_runtime_lock; +	raw_spinlock_t		rt_runtime_lock;  	ktime_t			rt_period;  	u64			rt_runtime;  	struct hrtimer		rt_period_timer; @@ -203,7 +182,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)  	rt_b->rt_period = ns_to_ktime(period);  	rt_b->rt_runtime = runtime; -	spin_lock_init(&rt_b->rt_runtime_lock); +	raw_spin_lock_init(&rt_b->rt_runtime_lock);  	hrtimer_init(&rt_b->rt_period_timer,  			CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -225,7 +204,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)  	if (hrtimer_active(&rt_b->rt_period_timer))  		return; -	spin_lock(&rt_b->rt_runtime_lock); +	raw_spin_lock(&rt_b->rt_runtime_lock);  	for (;;) {  		unsigned long delta;  		ktime_t soft, hard; @@ -242,7 +221,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)  		__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,  				HRTIMER_MODE_ABS_PINNED, 0);  	} -	spin_unlock(&rt_b->rt_runtime_lock); +	raw_spin_unlock(&rt_b->rt_runtime_lock);  }  #ifdef CONFIG_RT_GROUP_SCHED @@ -258,7 +237,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)   */  static DEFINE_MUTEX(sched_domains_mutex); -#ifdef CONFIG_GROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED  #include <linux/cgroup.h> @@ -268,13 +247,7 @@ static LIST_HEAD(task_groups);  /* task group related information */  struct task_group { 
-#ifdef CONFIG_CGROUP_SCHED  	struct cgroup_subsys_state css; -#endif - -#ifdef CONFIG_USER_SCHED -	uid_t uid; -#endif  #ifdef CONFIG_FAIR_GROUP_SCHED  	/* schedulable entities of this group on each cpu */ @@ -282,6 +255,8 @@ struct task_group {  	/* runqueue "owned" by this group on each cpu */  	struct cfs_rq **cfs_rq;  	unsigned long shares; + +	atomic_t load_weight;  #endif  #ifdef CONFIG_RT_GROUP_SCHED @@ -297,56 +272,18 @@ struct task_group {  	struct task_group *parent;  	struct list_head siblings;  	struct list_head children; -}; - -#ifdef CONFIG_USER_SCHED - -/* Helper function to pass uid information to create_sched_user() */ -void set_tg_uid(struct user_struct *user) -{ -	user->tg->uid = user->uid; -} -/* - * Root task group. - * 	Every UID task group (including init_task_group aka UID-0) will - * 	be a child to this group. - */ -struct task_group root_task_group; - -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Default task group's sched entity on each cpu */ -static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); -/* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; -#endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_USER_SCHED */ -#define root_task_group init_task_group -#endif /* CONFIG_USER_SCHED */ +#ifdef CONFIG_SCHED_AUTOGROUP +	struct autogroup *autogroup; +#endif +}; -/* task_group_lock serializes add/remove of task groups and also changes to - * a task group's cpu shares. 
- */ +/* task_group_lock serializes the addition/removal of task groups */  static DEFINE_SPINLOCK(task_group_lock); -#ifdef CONFIG_SMP -static int root_task_group_empty(void) -{ -	return list_empty(&root_task_group.children); -} -#endif -  #ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_USER_SCHED -# define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD) -#else /* !CONFIG_USER_SCHED */ -# define INIT_TASK_GROUP_LOAD	NICE_0_LOAD -#endif /* CONFIG_USER_SCHED */ + +# define ROOT_TASK_GROUP_LOAD	NICE_0_LOAD  /*   * A weight of 0 or 1 can cause arithmetics problems. @@ -359,62 +296,15 @@ static int root_task_group_empty(void)  #define MIN_SHARES	2  #define MAX_SHARES	(1UL << 18) -static int init_task_group_load = INIT_TASK_GROUP_LOAD; +static int root_task_group_load = ROOT_TASK_GROUP_LOAD;  #endif  /* Default task group.   *	Every task in system belong to this group at bootup.   */ -struct task_group init_task_group; - -/* return group to which a task belongs */ -static inline struct task_group *task_group(struct task_struct *p) -{ -	struct task_group *tg; - -#ifdef CONFIG_USER_SCHED -	rcu_read_lock(); -	tg = __task_cred(p)->user->tg; -	rcu_read_unlock(); -#elif defined(CONFIG_CGROUP_SCHED) -	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), -				struct task_group, css); -#else -	tg = &init_task_group; -#endif -	return tg; -} - -/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) -{ -#ifdef CONFIG_FAIR_GROUP_SCHED -	p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; -	p->se.parent = task_group(p)->se[cpu]; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED -	p->rt.rt_rq  = task_group(p)->rt_rq[cpu]; -	p->rt.parent = task_group(p)->rt_se[cpu]; -#endif -} - -#else - -#ifdef CONFIG_SMP -static int root_task_group_empty(void) -{ -	return 1; -} -#endif - -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline struct task_group *task_group(struct task_struct 
*p) -{ -	return NULL; -} +struct task_group root_task_group; -#endif	/* CONFIG_GROUP_SCHED */ +#endif	/* CONFIG_CGROUP_SCHED */  /* CFS-related fields in a runqueue */  struct cfs_rq { @@ -449,6 +339,7 @@ struct cfs_rq {  	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This  	 * list is used during load balance.  	 */ +	int on_list;  	struct list_head leaf_cfs_rq_list;  	struct task_group *tg;	/* group that "owns" this runqueue */ @@ -467,14 +358,17 @@ struct cfs_rq {  	unsigned long h_load;  	/* -	 * this cpu's part of tg->shares +	 * Maintaining per-cpu shares distribution for group scheduling +	 * +	 * load_stamp is the last time we updated the load average +	 * load_last is the last time we updated the load average and saw load +	 * load_unacc_exec_time is currently unaccounted execution time  	 */ -	unsigned long shares; +	u64 load_avg; +	u64 load_period; +	u64 load_stamp, load_last, load_unacc_exec_time; -	/* -	 * load.weight at the time we set shares -	 */ -	unsigned long rq_weight; +	unsigned long load_contribution;  #endif  #endif  }; @@ -493,6 +387,7 @@ struct rt_rq {  #endif  #ifdef CONFIG_SMP  	unsigned long rt_nr_migratory; +	unsigned long rt_nr_total;  	int overloaded;  	struct plist_head pushable_tasks;  #endif @@ -500,7 +395,7 @@ struct rt_rq {  	u64 rt_time;  	u64 rt_runtime;  	/* Nests inside the rq lock: */ -	spinlock_t rt_runtime_lock; +	raw_spinlock_t rt_runtime_lock;  #ifdef CONFIG_RT_GROUP_SCHED  	unsigned long rt_nr_boosted; @@ -508,7 +403,6 @@ struct rt_rq {  	struct rq *rq;  	struct list_head leaf_rt_rq_list;  	struct task_group *tg; -	struct sched_rt_entity *rt_se;  #endif  }; @@ -533,17 +427,7 @@ struct root_domain {  	 */  	cpumask_var_t rto_mask;  	atomic_t rto_count; -#ifdef CONFIG_SMP  	struct cpupri cpupri; -#endif -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -	/* -	 * Preferred wake up cpu nominated by sched_mc balance that will be -	 * used when most cpus are idle in the system indicating overall 
very -	 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) -	 */ -	unsigned int sched_mc_preferred_wakeup_cpu; -#endif  };  /* @@ -552,7 +436,7 @@ struct root_domain {   */  static struct root_domain def_root_domain; -#endif +#endif /* CONFIG_SMP */  /*   * This is the main, per-CPU runqueue data structure. @@ -563,7 +447,7 @@ static struct root_domain def_root_domain;   */  struct rq {  	/* runqueue lock: */ -	spinlock_t lock; +	raw_spinlock_t lock;  	/*  	 * nr_running and cpu_load should be in the same cacheline because @@ -572,15 +456,17 @@ struct rq {  	unsigned long nr_running;  	#define CPU_LOAD_IDX_MAX 5  	unsigned long cpu_load[CPU_LOAD_IDX_MAX]; +	unsigned long last_load_update_tick;  #ifdef CONFIG_NO_HZ -	unsigned long last_tick_seen; -	unsigned char in_nohz_recently; +	u64 nohz_stamp; +	unsigned char nohz_balance_kick;  #endif +	unsigned int skip_clock_update; +  	/* capture load from *all* tasks on this cpu: */  	struct load_weight load;  	unsigned long nr_load_updates;  	u64 nr_switches; -	u64 nr_migrations_in;  	struct cfs_rq cfs;  	struct rt_rq rt; @@ -601,11 +487,12 @@ struct rq {  	 */  	unsigned long nr_uninterruptible; -	struct task_struct *curr, *idle; +	struct task_struct *curr, *idle, *stop;  	unsigned long next_balance;  	struct mm_struct *prev_mm;  	u64 clock; +	u64 clock_task;  	atomic_t nr_iowait; @@ -613,18 +500,28 @@ struct rq {  	struct root_domain *rd;  	struct sched_domain *sd; +	unsigned long cpu_power; +  	unsigned char idle_at_tick;  	/* For active balancing */ +	int post_schedule;  	int active_balance;  	int push_cpu; +	struct cpu_stop_work active_balance_work;  	/* cpu of this runqueue: */  	int cpu;  	int online;  	unsigned long avg_load_per_task; -	struct task_struct *migration_thread; -	struct list_head migration_queue; +	u64 rt_avg; +	u64 age_stamp; +	u64 idle_stamp; +	u64 avg_idle; +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +	u64 prev_irq_time;  #endif  	/* calc_load related fields */ @@ -664,10 +561,8 
@@ struct rq {  static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) -{ -	rq->curr->sched_class->check_preempt_curr(rq, p, sync); -} + +static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);  static inline int cpu_of(struct rq *rq)  { @@ -678,6 +573,11 @@ static inline int cpu_of(struct rq *rq)  #endif  } +#define rcu_dereference_check_sched_domain(p) \ +	rcu_dereference_check((p), \ +			      rcu_read_lock_sched_held() || \ +			      lockdep_is_held(&sched_domains_mutex)) +  /*   * The domain tree (rq->sd) is protected by RCU's quiescent state transition.   * See detach_destroy_domains: synchronize_sched for details. @@ -686,16 +586,72 @@ static inline int cpu_of(struct rq *rq)   * preempt-disabled sections.   */  #define for_each_domain(cpu, __sd) \ -	for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) +	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)  #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))  #define this_rq()		(&__get_cpu_var(runqueues))  #define task_rq(p)		cpu_rq(task_cpu(p))  #define cpu_curr(cpu)		(cpu_rq(cpu)->curr) +#define raw_rq()		(&__raw_get_cpu_var(runqueues)) + +#ifdef CONFIG_CGROUP_SCHED -inline void update_rq_clock(struct rq *rq) +/* + * Return the group to which this tasks belongs. + * + * We use task_subsys_state_check() and extend the RCU verification + * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() + * holds that lock for each task it moves into the cgroup. Therefore + * by holding that lock, we pin the task to the current cgroup. 
+ */ +static inline struct task_group *task_group(struct task_struct *p)  { -	rq->clock = sched_clock_cpu(cpu_of(rq)); +	struct task_group *tg; +	struct cgroup_subsys_state *css; + +	css = task_subsys_state_check(p, cpu_cgroup_subsys_id, +			lockdep_is_held(&task_rq(p)->lock)); +	tg = container_of(css, struct task_group, css); + +	return autogroup_task_group(p, tg); +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED +	p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; +	p->se.parent = task_group(p)->se[cpu]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +	p->rt.rt_rq  = task_group(p)->rt_rq[cpu]; +	p->rt.parent = task_group(p)->rt_se[cpu]; +#endif +} + +#else /* CONFIG_CGROUP_SCHED */ + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ +	return NULL; +} + +#endif /* CONFIG_CGROUP_SCHED */ + +static void update_rq_clock_task(struct rq *rq, s64 delta); + +static void update_rq_clock(struct rq *rq) +{ +	s64 delta; + +	if (rq->skip_clock_update) +		return; + +	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; +	rq->clock += delta; +	update_rq_clock_task(rq, delta);  }  /* @@ -709,20 +665,15 @@ inline void update_rq_clock(struct rq *rq)  /**   * runqueue_is_locked + * @cpu: the processor in question.   *   * Returns true if the current cpu runqueue is locked.   * This interface allows printk to be called with the runqueue lock   * held and know whether or not it is OK to wake up the klogd.   
*/ -int runqueue_is_locked(void) +int runqueue_is_locked(int cpu)  { -	int cpu = get_cpu(); -	struct rq *rq = cpu_rq(cpu); -	int ret; - -	ret = spin_is_locked(&rq->lock); -	put_cpu(); -	return ret; +	return raw_spin_is_locked(&cpu_rq(cpu)->lock);  }  /* @@ -777,7 +728,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,  		size_t cnt, loff_t *ppos)  {  	char buf[64]; -	char *cmp = buf; +	char *cmp;  	int neg = 0;  	int i; @@ -788,16 +739,15 @@ sched_feat_write(struct file *filp, const char __user *ubuf,  		return -EFAULT;  	buf[cnt] = 0; +	cmp = strstrip(buf); -	if (strncmp(buf, "NO_", 3) == 0) { +	if (strncmp(cmp, "NO_", 3) == 0) {  		neg = 1;  		cmp += 3;  	}  	for (i = 0; sched_feat_names[i]; i++) { -		int len = strlen(sched_feat_names[i]); - -		if (strncmp(cmp, sched_feat_names[i], len) == 0) { +		if (strcmp(cmp, sched_feat_names[i]) == 0) {  			if (neg)  				sysctl_sched_features &= ~(1UL << i);  			else @@ -809,7 +759,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,  	if (!sched_feat_names[i])  		return -EINVAL; -	filp->f_pos += cnt; +	*ppos += cnt;  	return cnt;  } @@ -819,7 +769,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp)  	return single_open(filp, sched_feat_show, NULL);  } -static struct file_operations sched_feat_fops = { +static const struct file_operations sched_feat_fops = {  	.open		= sched_feat_open,  	.write		= sched_feat_write,  	.read		= seq_read, @@ -847,17 +797,12 @@ late_initcall(sched_init_debug);  const_debug unsigned int sysctl_sched_nr_migrate = 32;  /* - * ratelimit for updating the group shares. - * default: 0.25ms - */ -unsigned int sysctl_sched_shares_ratelimit = 250000; - -/* - * Inject some fuzzyness into changing the per-cpu group shares - * this avoids remote rq-locks at the expense of fairness. - * default: 4 + * period over which we average the RT time consumption, measured + * in ms. 
+ * + * default: 1s   */ -unsigned int sysctl_sched_shares_thresh = 4; +const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;  /*   * period over which we measure -rt task cpu usage in us. @@ -921,7 +866,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)  	 */  	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); -	spin_unlock_irq(&rq->lock); +	raw_spin_unlock_irq(&rq->lock);  }  #else /* __ARCH_WANT_UNLOCKED_CTXSW */ @@ -945,9 +890,9 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)  	next->oncpu = 1;  #endif  #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW -	spin_unlock_irq(&rq->lock); +	raw_spin_unlock_irq(&rq->lock);  #else -	spin_unlock(&rq->lock); +	raw_spin_unlock(&rq->lock);  #endif  } @@ -969,18 +914,29 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)  #endif /* __ARCH_WANT_UNLOCKED_CTXSW */  /* + * Check whether the task is waking, we use this to synchronize ->cpus_allowed + * against ttwu(). + */ +static inline int task_is_waking(struct task_struct *p) +{ +	return unlikely(p->state == TASK_WAKING); +} + +/*   * __task_rq_lock - lock the runqueue a given task resides on.   * Must be called interrupts disabled.   
*/  static inline struct rq *__task_rq_lock(struct task_struct *p)  	__acquires(rq->lock)  { +	struct rq *rq; +  	for (;;) { -		struct rq *rq = task_rq(p); -		spin_lock(&rq->lock); +		rq = task_rq(p); +		raw_spin_lock(&rq->lock);  		if (likely(rq == task_rq(p)))  			return rq; -		spin_unlock(&rq->lock); +		raw_spin_unlock(&rq->lock);  	}  } @@ -997,31 +953,23 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)  	for (;;) {  		local_irq_save(*flags);  		rq = task_rq(p); -		spin_lock(&rq->lock); +		raw_spin_lock(&rq->lock);  		if (likely(rq == task_rq(p)))  			return rq; -		spin_unlock_irqrestore(&rq->lock, *flags); +		raw_spin_unlock_irqrestore(&rq->lock, *flags);  	}  } -void task_rq_unlock_wait(struct task_struct *p) -{ -	struct rq *rq = task_rq(p); - -	smp_mb(); /* spin-unlock-wait is not a full memory barrier */ -	spin_unlock_wait(&rq->lock); -} -  static void __task_rq_unlock(struct rq *rq)  	__releases(rq->lock)  { -	spin_unlock(&rq->lock); +	raw_spin_unlock(&rq->lock);  }  static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)  	__releases(rq->lock)  { -	spin_unlock_irqrestore(&rq->lock, *flags); +	raw_spin_unlock_irqrestore(&rq->lock, *flags);  }  /* @@ -1034,7 +982,7 @@ static struct rq *this_rq_lock(void)  	local_irq_disable();  	rq = this_rq(); -	spin_lock(&rq->lock); +	raw_spin_lock(&rq->lock);  	return rq;  } @@ -1081,10 +1029,10 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)  	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); -	spin_lock(&rq->lock); +	raw_spin_lock(&rq->lock);  	update_rq_clock(rq);  	rq->curr->sched_class->task_tick(rq, rq->curr, 1); -	spin_unlock(&rq->lock); +	raw_spin_unlock(&rq->lock);  	return HRTIMER_NORESTART;  } @@ -1097,10 +1045,10 @@ static void __hrtick_start(void *arg)  {  	struct rq *rq = arg; -	spin_lock(&rq->lock); +	raw_spin_lock(&rq->lock);  	hrtimer_restart(&rq->hrtick_timer);  	rq->hrtick_csd_pending = 0; -	spin_unlock(&rq->lock); +	raw_spin_unlock(&rq->lock);  } 
 /* @@ -1207,7 +1155,7 @@ static void resched_task(struct task_struct *p)  {  	int cpu; -	assert_spin_locked(&task_rq(p)->lock); +	assert_raw_spin_locked(&task_rq(p)->lock);  	if (test_tsk_need_resched(p))  		return; @@ -1229,14 +1177,35 @@ static void resched_cpu(int cpu)  	struct rq *rq = cpu_rq(cpu);  	unsigned long flags; -	if (!spin_trylock_irqsave(&rq->lock, flags)) +	if (!raw_spin_trylock_irqsave(&rq->lock, flags))  		return;  	resched_task(cpu_curr(cpu)); -	spin_unlock_irqrestore(&rq->lock, flags); +	raw_spin_unlock_irqrestore(&rq->lock, flags);  }  #ifdef CONFIG_NO_HZ  /* + * In the semi idle case, use the nearest busy cpu for migrating timers + * from an idle cpu.  This is good for power-savings. + * + * We don't do similar optimization for completely idle system, as + * selecting an idle cpu will add more delays to the timers than intended + * (as that cpu's timer base may not be uptodate wrt jiffies etc). + */ +int get_nohz_timer_target(void) +{ +	int cpu = smp_processor_id(); +	int i; +	struct sched_domain *sd; + +	for_each_domain(cpu, sd) { +		for_each_cpu(i, sched_domain_span(sd)) +			if (!idle_cpu(i)) +				return i; +	} +	return cpu; +} +/*   * When add_timer_on() enqueues a timer into the timer wheel of an   * idle CPU then this timer might expire before the next timer event   * which is scheduled to wake up that CPU. In case of a completely @@ -1275,14 +1244,50 @@ void wake_up_idle_cpu(int cpu)  	if (!tsk_is_polling(rq->idle))  		smp_send_reschedule(cpu);  } +  #endif /* CONFIG_NO_HZ */ +static u64 sched_avg_period(void) +{ +	return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; +} + +static void sched_avg_update(struct rq *rq) +{ +	s64 period = sched_avg_period(); + +	while ((s64)(rq->clock - rq->age_stamp) > period) { +		/* +		 * Inline assembly required to prevent the compiler +		 * optimising this loop into a divmod call. +		 * See __iter_div_u64_rem() for another example of this. 
+		 */ +		asm("" : "+rm" (rq->age_stamp)); +		rq->age_stamp += period; +		rq->rt_avg /= 2; +	} +} + +static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ +	rq->rt_avg += rt_delta; +	sched_avg_update(rq); +} +  #else /* !CONFIG_SMP */  static void resched_task(struct task_struct *p)  { -	assert_spin_locked(&task_rq(p)->lock); +	assert_raw_spin_locked(&task_rq(p)->lock);  	set_tsk_need_resched(p);  } + +static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ +} + +static void sched_avg_update(struct rq *rq) +{ +}  #endif /* CONFIG_SMP */  #if BITS_PER_LONG == 32 @@ -1340,6 +1345,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)  	lw->inv_weight = 0;  } +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ +	lw->weight = w; +	lw->inv_weight = 0; +} +  /*   * To aid in avoiding the subversion of "niceness" due to uneven distribution   * of tasks with abnormal "nice" values across CPUs the contribution that @@ -1393,32 +1404,6 @@ static const u32 prio_to_wmult[40] = {   /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,  }; -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); - -/* - * runqueue iterator, to support SMP load-balancing between different - * scheduling classes, without having to expose their internal data - * structures to the load-balancing proper: - */ -struct rq_iterator { -	void *arg; -	struct task_struct *(*start)(void *); -	struct task_struct *(*next)(void *); -}; - -#ifdef CONFIG_SMP -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, -	      unsigned long max_load_move, struct sched_domain *sd, -	      enum cpu_idle_type idle, int *all_pinned, -	      int *this_best_prio, struct rq_iterator *iterator); - -static int -iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, -		   struct sched_domain *sd, enum cpu_idle_type idle, -		   struct rq_iterator *iterator); -#endif -  /* 
Time spent by the tasks of the cpu accounting group executing in ... */  enum cpuacct_stat_index {  	CPUACCT_STAT_USER,	/* ... user mode */ @@ -1493,103 +1478,67 @@ static int tg_nop(struct task_group *tg, void *data)  #endif  #ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -static unsigned long cpu_avg_load_per_task(int cpu) +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu)  { -	struct rq *rq = cpu_rq(cpu); -	unsigned long nr_running = ACCESS_ONCE(rq->nr_running); - -	if (nr_running) -		rq->avg_load_per_task = rq->load.weight / nr_running; -	else -		rq->avg_load_per_task = 0; - -	return rq->avg_load_per_task; +	return cpu_rq(cpu)->load.weight;  } -#ifdef CONFIG_FAIR_GROUP_SCHED - -static void __set_se_shares(struct sched_entity *se, unsigned long shares); -  /* - * Calculate and set the cpu's group shares. + * Return a low guess at the load of a migration-source cpu weighted + * according to the scheduling class and "nice" value. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively.   
*/ -static void -update_group_shares_cpu(struct task_group *tg, int cpu, -			unsigned long sd_shares, unsigned long sd_rq_weight) +static unsigned long source_load(int cpu, int type)  { -	unsigned long shares; -	unsigned long rq_weight; - -	if (!tg->se[cpu]) -		return; - -	rq_weight = tg->cfs_rq[cpu]->rq_weight; - -	/* -	 *           \Sum shares * rq_weight -	 * shares =  ----------------------- -	 *               \Sum rq_weight -	 * -	 */ -	shares = (sd_shares * rq_weight) / sd_rq_weight; -	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); - -	if (abs(shares - tg->se[cpu]->load.weight) > -			sysctl_sched_shares_thresh) { -		struct rq *rq = cpu_rq(cpu); -		unsigned long flags; +	struct rq *rq = cpu_rq(cpu); +	unsigned long total = weighted_cpuload(cpu); -		spin_lock_irqsave(&rq->lock, flags); -		tg->cfs_rq[cpu]->shares = shares; +	if (type == 0 || !sched_feat(LB_BIAS)) +		return total; -		__set_se_shares(tg->se[cpu], shares); -		spin_unlock_irqrestore(&rq->lock, flags); -	} +	return min(rq->cpu_load[type-1], total);  }  /* - * Re-compute the task group their per cpu shares over the given domain. - * This needs to be done in a bottom-up fashion because the rq weight of a - * parent group depends on the shares of its child groups. + * Return a high guess at the load of a migration-target cpu weighted + * according to the scheduling class and "nice" value.   */ -static int tg_shares_up(struct task_group *tg, void *data) +static unsigned long target_load(int cpu, int type)  { -	unsigned long weight, rq_weight = 0; -	unsigned long shares = 0; -	struct sched_domain *sd = data; -	int i; +	struct rq *rq = cpu_rq(cpu); +	unsigned long total = weighted_cpuload(cpu); -	for_each_cpu(i, sched_domain_span(sd)) { -		/* -		 * If there are currently no tasks on the cpu pretend there -		 * is one of average load so that when a new task gets to -		 * run here it will not get delayed by group starvation. 
-		 */ -		weight = tg->cfs_rq[i]->load.weight; -		if (!weight) -			weight = NICE_0_LOAD; +	if (type == 0 || !sched_feat(LB_BIAS)) +		return total; -		tg->cfs_rq[i]->rq_weight = weight; -		rq_weight += weight; -		shares += tg->cfs_rq[i]->shares; -	} +	return max(rq->cpu_load[type-1], total); +} -	if ((!shares && rq_weight) || shares > tg->shares) -		shares = tg->shares; +static unsigned long power_of(int cpu) +{ +	return cpu_rq(cpu)->cpu_power; +} -	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) -		shares = tg->shares; +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); -	for_each_cpu(i, sched_domain_span(sd)) -		update_group_shares_cpu(tg, i, shares, rq_weight); +static unsigned long cpu_avg_load_per_task(int cpu) +{ +	struct rq *rq = cpu_rq(cpu); +	unsigned long nr_running = ACCESS_ONCE(rq->nr_running); -	return 0; +	if (nr_running) +		rq->avg_load_per_task = rq->load.weight / nr_running; +	else +		rq->avg_load_per_task = 0; + +	return rq->avg_load_per_task;  } +#ifdef CONFIG_FAIR_GROUP_SCHED +  /*   * Compute the cpu's hierarchical load factor for each task group.   
* This needs to be done in a top-down fashion because the load of a child @@ -1604,7 +1553,7 @@ static int tg_load_down(struct task_group *tg, void *data)  		load = cpu_rq(cpu)->load.weight;  	} else {  		load = tg->parent->cfs_rq[cpu]->h_load; -		load *= tg->cfs_rq[cpu]->shares; +		load *= tg->se[cpu]->load.weight;  		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;  	} @@ -1613,43 +1562,17 @@ static int tg_load_down(struct task_group *tg, void *data)  	return 0;  } -static void update_shares(struct sched_domain *sd) -{ -	u64 now = cpu_clock(raw_smp_processor_id()); -	s64 elapsed = now - sd->last_update; - -	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { -		sd->last_update = now; -		walk_tg_tree(tg_nop, tg_shares_up, sd); -	} -} - -static void update_shares_locked(struct rq *rq, struct sched_domain *sd) -{ -	spin_unlock(&rq->lock); -	update_shares(sd); -	spin_lock(&rq->lock); -} -  static void update_h_load(long cpu)  {  	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);  } -#else - -static inline void update_shares(struct sched_domain *sd) -{ -} - -static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) -{ -} -  #endif  #ifdef CONFIG_PREEMPT +static void double_rq_lock(struct rq *rq1, struct rq *rq2); +  /*   * fair double_lock_balance: Safely acquires both rq->locks in a fair   * way at the expense of forcing extra atomic operations in all @@ -1663,7 +1586,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)  	__acquires(busiest->lock)  	__acquires(this_rq->lock)  { -	spin_unlock(&this_rq->lock); +	raw_spin_unlock(&this_rq->lock);  	double_rq_lock(this_rq, busiest);  	return 1; @@ -1684,14 +1607,16 @@ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)  {  	int ret = 0; -	if (unlikely(!spin_trylock(&busiest->lock))) { +	if (unlikely(!raw_spin_trylock(&busiest->lock))) {  		if (busiest < this_rq) { -			spin_unlock(&this_rq->lock); -			spin_lock(&busiest->lock); -			
spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); +			raw_spin_unlock(&this_rq->lock); +			raw_spin_lock(&busiest->lock); +			raw_spin_lock_nested(&this_rq->lock, +					      SINGLE_DEPTH_NESTING);  			ret = 1;  		} else -			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); +			raw_spin_lock_nested(&busiest->lock, +					      SINGLE_DEPTH_NESTING);  	}  	return ret;  } @@ -1705,7 +1630,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)  {  	if (unlikely(!irqs_disabled())) {  		/* printk() doesn't work good under rq->lock */ -		spin_unlock(&this_rq->lock); +		raw_spin_unlock(&this_rq->lock);  		BUG_ON(1);  	} @@ -1715,34 +1640,81 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)  static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)  	__releases(busiest->lock)  { -	spin_unlock(&busiest->lock); +	raw_spin_unlock(&busiest->lock);  	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);  } -#endif -#ifdef CONFIG_FAIR_GROUP_SCHED -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static void double_rq_lock(struct rq *rq1, struct rq *rq2) +	__acquires(rq1->lock) +	__acquires(rq2->lock)  { -#ifdef CONFIG_SMP -	cfs_rq->shares = shares; -#endif +	BUG_ON(!irqs_disabled()); +	if (rq1 == rq2) { +		raw_spin_lock(&rq1->lock); +		__acquire(rq2->lock);	/* Fake it out ;) */ +	} else { +		if (rq1 < rq2) { +			raw_spin_lock(&rq1->lock); +			raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); +		} else { +			raw_spin_lock(&rq2->lock); +			raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); +		} +	} +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. 
+ */ +static void double_rq_unlock(struct rq *rq1, struct rq *rq2) +	__releases(rq1->lock) +	__releases(rq2->lock) +{ +	raw_spin_unlock(&rq1->lock); +	if (rq1 != rq2) +		raw_spin_unlock(&rq2->lock); +	else +		__release(rq2->lock);  } +  #endif -static void calc_load_account_active(struct rq *this_rq); +static void calc_load_account_idle(struct rq *this_rq); +static void update_sysctl(void); +static int get_update_sysctl_factor(void); +static void update_cpu_load(struct rq *this_rq); -#include "sched_stats.h" -#include "sched_idletask.c" -#include "sched_fair.c" -#include "sched_rt.c" -#ifdef CONFIG_SCHED_DEBUG -# include "sched_debug.c" +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ +	set_task_rq(p, cpu); +#ifdef CONFIG_SMP +	/* +	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be +	 * successfuly executed on another CPU. We must ensure that updates of +	 * per-task data have been completed by this moment. +	 */ +	smp_wmb(); +	task_thread_info(p)->cpu = cpu;  #endif +} -#define sched_class_highest (&rt_sched_class) +static const struct sched_class rt_sched_class; + +#define sched_class_highest (&stop_sched_class)  #define for_each_class(class) \     for (class = sched_class_highest; class; class = class->next) +#include "sched_stats.h" +  static void inc_nr_running(struct rq *rq)  {  	rq->nr_running++; @@ -1755,12 +1727,6 @@ static void dec_nr_running(struct rq *rq)  static void set_load_weight(struct task_struct *p)  { -	if (task_has_rt_policy(p)) { -		p->se.load.weight = prio_to_weight[0] * 2; -		p->se.load.inv_weight = prio_to_wmult[0] >> 1; -		return; -	} -  	/*  	 * SCHED_IDLE tasks get minimal weight:  	 */ @@ -1774,38 +1740,232 @@ static void set_load_weight(struct task_struct *p)  	p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];  } -static void update_avg(u64 *avg, u64 sample) +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)  { -	s64 diff = sample - *avg; -	*avg 
+= diff >> 3; +	update_rq_clock(rq); +	sched_info_queued(p); +	p->sched_class->enqueue_task(rq, p, flags); +	p->se.on_rq = 1;  } -static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) +static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)  { -	if (wakeup) -		p->se.start_runtime = p->se.sum_exec_runtime; +	update_rq_clock(rq); +	sched_info_dequeued(p); +	p->sched_class->dequeue_task(rq, p, flags); +	p->se.on_rq = 0; +} -	sched_info_queued(p); -	p->sched_class->enqueue_task(rq, p, wakeup); -	p->se.on_rq = 1; +/* + * activate_task - move a task to the runqueue. + */ +static void activate_task(struct rq *rq, struct task_struct *p, int flags) +{ +	if (task_contributes_to_load(p)) +		rq->nr_uninterruptible--; + +	enqueue_task(rq, p, flags); +	inc_nr_running(rq);  } -static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) +/* + * deactivate_task - remove a task from the runqueue. + */ +static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)  { -	if (sleep) { -		if (p->se.last_wakeup) { -			update_avg(&p->se.avg_overlap, -				p->se.sum_exec_runtime - p->se.last_wakeup); -			p->se.last_wakeup = 0; -		} else { -			update_avg(&p->se.avg_wakeup, -				sysctl_sched_wakeup_granularity); -		} +	if (task_contributes_to_load(p)) +		rq->nr_uninterruptible++; + +	dequeue_task(rq, p, flags); +	dec_nr_running(rq); +} + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in account_system_vtime, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/account_system_vtime on this CPU. We would either get old + * or new value with a side effect of accounting a slice of irq time to wrong + * task when irq is in progress while we read rq->clock. 
That is a worthy + * compromise in place of having locks on each irq in account_system_time. + */ +static DEFINE_PER_CPU(u64, cpu_hardirq_time); +static DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ +	sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ +	sched_clock_irqtime = 0; +} + +#ifndef CONFIG_64BIT +static DEFINE_PER_CPU(seqcount_t, irq_time_seq); + +static inline void irq_time_write_begin(void) +{ +	__this_cpu_inc(irq_time_seq.sequence); +	smp_wmb(); +} + +static inline void irq_time_write_end(void) +{ +	smp_wmb(); +	__this_cpu_inc(irq_time_seq.sequence); +} + +static inline u64 irq_time_read(int cpu) +{ +	u64 irq_time; +	unsigned seq; + +	do { +		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); +		irq_time = per_cpu(cpu_softirq_time, cpu) + +			   per_cpu(cpu_hardirq_time, cpu); +	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + +	return irq_time; +} +#else /* CONFIG_64BIT */ +static inline void irq_time_write_begin(void) +{ +} + +static inline void irq_time_write_end(void) +{ +} + +static inline u64 irq_time_read(int cpu) +{ +	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} +#endif /* CONFIG_64BIT */ + +/* + * Called before incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. + */ +void account_system_vtime(struct task_struct *curr) +{ +	unsigned long flags; +	s64 delta; +	int cpu; + +	if (!sched_clock_irqtime) +		return; + +	local_irq_save(flags); + +	cpu = smp_processor_id(); +	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); +	__this_cpu_add(irq_start_time, delta); + +	irq_time_write_begin(); +	/* +	 * We do not account for softirq time from ksoftirqd here. 
+	 * We want to continue accounting softirq time to ksoftirqd thread +	 * in that case, so as not to confuse scheduler with a special task +	 * that do not consume any time, but still wants to run. +	 */ +	if (hardirq_count()) +		__this_cpu_add(cpu_hardirq_time, delta); +	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) +		__this_cpu_add(cpu_softirq_time, delta); + +	irq_time_write_end(); +	local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(account_system_vtime); + +static void update_rq_clock_task(struct rq *rq, s64 delta) +{ +	s64 irq_delta; + +	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + +	/* +	 * Since irq_time is only updated on {soft,}irq_exit, we might run into +	 * this case when a previous update_rq_clock() happened inside a +	 * {soft,}irq region. +	 * +	 * When this happens, we stop ->clock_task and only update the +	 * prev_irq_time stamp to account for the part that fit, so that a next +	 * update will consume the rest. This ensures ->clock_task is +	 * monotonic. +	 * +	 * It does however cause some slight miss-attribution of {soft,}irq +	 * time, a more accurate solution would be to update the irq_time using +	 * the current rq->clock timestamp, except that would require using +	 * atomic ops. 
+	 */ +	if (irq_delta > delta) +		irq_delta = delta; + +	rq->prev_irq_time += irq_delta; +	delta -= irq_delta; +	rq->clock_task += delta; + +	if (irq_delta && sched_feat(NONIRQ_POWER)) +		sched_rt_avg_update(rq, irq_delta); +} + +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ + +static void update_rq_clock_task(struct rq *rq, s64 delta) +{ +	rq->clock_task += delta; +} + +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#include "sched_idletask.c" +#include "sched_fair.c" +#include "sched_rt.c" +#include "sched_autogroup.c" +#include "sched_stoptask.c" +#ifdef CONFIG_SCHED_DEBUG +# include "sched_debug.c" +#endif + +void sched_set_stop_task(int cpu, struct task_struct *stop) +{ +	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; +	struct task_struct *old_stop = cpu_rq(cpu)->stop; + +	if (stop) { +		/* +		 * Make it appear like a SCHED_FIFO task, its something +		 * userspace knows about and won't get confused about. +		 * +		 * Also, it will make PI more or less work without too +		 * much confusion -- but then, stop work should not +		 * rely on PI working anyway. +		 */ +		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param); + +		stop->sched_class = &stop_sched_class;  	} -	sched_info_dequeued(p); -	p->sched_class->dequeue_task(rq, p, sleep); -	p->se.on_rq = 0; +	cpu_rq(cpu)->stop = stop; + +	if (old_stop) { +		/* +		 * Reset it back to a normal scheduling class so that +		 * it can die in pieces. +		 */ +		old_stop->sched_class = &rt_sched_class; +	}  }  /* @@ -1854,30 +2014,6 @@ static int effective_prio(struct task_struct *p)  	return p->prio;  } -/* - * activate_task - move a task to the runqueue. - */ -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) -{ -	if (task_contributes_to_load(p)) -		rq->nr_uninterruptible--; - -	enqueue_task(rq, p, wakeup); -	inc_nr_running(rq); -} - -/* - * deactivate_task - remove a task from the runqueue. 
- */ -static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) -{ -	if (task_contributes_to_load(p)) -		rq->nr_uninterruptible++; - -	dequeue_task(rq, p, sleep); -	dec_nr_running(rq); -} -  /**   * task_curr - is this task currently executing on a CPU?   * @p: the task in question. @@ -1887,20 +2023,6 @@ inline int task_curr(const struct task_struct *p)  	return cpu_curr(task_cpu(p)) == p;  } -static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -{ -	set_task_rq(p, cpu); -#ifdef CONFIG_SMP -	/* -	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be -	 * successfuly executed on another CPU. We must ensure that updates of -	 * per-task data have been completed by this moment. -	 */ -	smp_wmb(); -	task_thread_info(p)->cpu = cpu; -#endif -} -  static inline void check_class_changed(struct rq *rq, struct task_struct *p,  				       const struct sched_class *prev_class,  				       int oldprio, int running) @@ -1913,14 +2035,32 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,  		p->sched_class->prio_changed(rq, p, oldprio, running);  } -#ifdef CONFIG_SMP - -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) +static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)  { -	return cpu_rq(cpu)->load.weight; +	const struct sched_class *class; + +	if (p->sched_class == rq->curr->sched_class) { +		rq->curr->sched_class->check_preempt_curr(rq, p, flags); +	} else { +		for_each_class(class) { +			if (class == rq->curr->sched_class) +				break; +			if (class == p->sched_class) { +				resched_task(rq->curr); +				break; +			} +		} +	} + +	/* +	 * A queue event has occurred, and we're going to schedule.  In +	 * this case, we can save a useless back to back clock update. 
+	 */ +	if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) +		rq->skip_clock_update = 1;  } +#ifdef CONFIG_SMP  /*   * Is this task likely cache-hot:   */ @@ -1929,17 +2069,20 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)  {  	s64 delta; +	if (p->sched_class != &fair_sched_class) +		return 0; + +	if (unlikely(p->policy == SCHED_IDLE)) +		return 0; +  	/*  	 * Buddy candidates are cache hot:  	 */ -	if (sched_feat(CACHE_HOT_BUDDY) && +	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&  			(&p->se == cfs_rq_of(&p->se)->next ||  			 &p->se == cfs_rq_of(&p->se)->last))  		return 1; -	if (p->sched_class != &fair_sched_class) -		return 0; -  	if (sysctl_sched_migration_cost == -1)  		return 1;  	if (sysctl_sched_migration_cost == 0) @@ -1950,119 +2093,45 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)  	return delta < (s64)sysctl_sched_migration_cost;  } -  void set_task_cpu(struct task_struct *p, unsigned int new_cpu)  { -	int old_cpu = task_cpu(p); -	struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); -	struct cfs_rq *old_cfsrq = task_cfs_rq(p), -		      *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); -	u64 clock_offset; - -	clock_offset = old_rq->clock - new_rq->clock; +#ifdef CONFIG_SCHED_DEBUG +	/* +	 * We should never call set_task_cpu() on a blocked task, +	 * ttwu() will sort out the placement. 
+	 */ +	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && +			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); +#endif  	trace_sched_migrate_task(p, new_cpu); -#ifdef CONFIG_SCHEDSTATS -	if (p->se.wait_start) -		p->se.wait_start -= clock_offset; -	if (p->se.sleep_start) -		p->se.sleep_start -= clock_offset; -	if (p->se.block_start) -		p->se.block_start -= clock_offset; -#endif -	if (old_cpu != new_cpu) { +	if (task_cpu(p) != new_cpu) {  		p->se.nr_migrations++; -		new_rq->nr_migrations_in++; -#ifdef CONFIG_SCHEDSTATS -		if (task_hot(p, old_rq->clock, NULL)) -			schedstat_inc(p, se.nr_forced2_migrations); -#endif -		perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, -				     1, 1, NULL, 0); +		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);  	} -	p->se.vruntime -= old_cfsrq->min_vruntime - -					 new_cfsrq->min_vruntime;  	__set_task_cpu(p, new_cpu);  } -struct migration_req { -	struct list_head list; - +struct migration_arg {  	struct task_struct *task;  	int dest_cpu; - -	struct completion done;  }; +static int migration_cpu_stop(void *data); +  /*   * The task's runqueue lock must be held.   * Returns true if you have to wait for migration thread.   */ -static int -migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) +static bool migrate_task(struct task_struct *p, struct rq *rq)  { -	struct rq *rq = task_rq(p); -  	/*  	 * If the task is not on a runqueue (and not running), then -	 * it is sufficient to simply update the task's cpu field. +	 * the next wake-up will properly place the task.  	 */ -	if (!p->se.on_rq && !task_running(rq, p)) { -		set_task_cpu(p, dest_cpu); -		return 0; -	} - -	init_completion(&req->done); -	req->task = p; -	req->dest_cpu = dest_cpu; -	list_add(&req->list, &rq->migration_queue); - -	return 1; -} - -/* - * wait_task_context_switch -	wait for a thread to complete at least one - *				context switch. - * - * @p must not be current. 
- */ -void wait_task_context_switch(struct task_struct *p) -{ -	unsigned long nvcsw, nivcsw, flags; -	int running; -	struct rq *rq; - -	nvcsw	= p->nvcsw; -	nivcsw	= p->nivcsw; -	for (;;) { -		/* -		 * The runqueue is assigned before the actual context -		 * switch. We need to take the runqueue lock. -		 * -		 * We could check initially without the lock but it is -		 * very likely that we need to take the lock in every -		 * iteration. -		 */ -		rq = task_rq_lock(p, &flags); -		running = task_running(rq, p); -		task_rq_unlock(rq, &flags); - -		if (likely(!running)) -			break; -		/* -		 * The switch count is incremented before the actual -		 * context switch. We thus wait for two switches to be -		 * sure at least one completed. -		 */ -		if ((p->nvcsw - nvcsw) > 1) -			break; -		if ((p->nivcsw - nivcsw) > 1) -			break; - -		cpu_relax(); -	} +	return p->se.on_rq || task_running(rq, p);  }  /* @@ -2120,7 +2189,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)  		 * just go back and repeat.  		 */  		rq = task_rq_lock(p, &flags); -		trace_sched_wait_task(rq, p); +		trace_sched_wait_task(p);  		running = task_running(rq, p);  		on_rq = p->se.on_rq;  		ncsw = 0; @@ -2194,214 +2263,144 @@ void kick_process(struct task_struct *p)  	preempt_enable();  }  EXPORT_SYMBOL_GPL(kick_process); +#endif /* CONFIG_SMP */ -/* - * Return a low guess at the load of a migration-source cpu weighted - * according to the scheduling class and "nice" value. +/** + * task_oncpu_function_call - call a function on the cpu on which a task runs + * @p:		the task to evaluate + * @func:	the function to be called + * @info:	the function call argument   * - * We want to under-estimate the load of migration sources, to - * balance conservatively. 
- */ -static unsigned long source_load(int cpu, int type) -{ -	struct rq *rq = cpu_rq(cpu); -	unsigned long total = weighted_cpuload(cpu); - -	if (type == 0 || !sched_feat(LB_BIAS)) -		return total; - -	return min(rq->cpu_load[type-1], total); -} - -/* - * Return a high guess at the load of a migration-target cpu weighted - * according to the scheduling class and "nice" value. + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly   */ -static unsigned long target_load(int cpu, int type) +void task_oncpu_function_call(struct task_struct *p, +			      void (*func) (void *info), void *info)  { -	struct rq *rq = cpu_rq(cpu); -	unsigned long total = weighted_cpuload(cpu); - -	if (type == 0 || !sched_feat(LB_BIAS)) -		return total; +	int cpu; -	return max(rq->cpu_load[type-1], total); +	preempt_disable(); +	cpu = task_cpu(p); +	if (task_curr(p)) +		smp_call_function_single(cpu, func, info, 1); +	preempt_enable();  } +#ifdef CONFIG_SMP  /* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. + * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.   
*/ -static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +static int select_fallback_rq(int cpu, struct task_struct *p)  { -	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; -	unsigned long min_load = ULONG_MAX, this_load = 0; -	int load_idx = sd->forkexec_idx; -	int imbalance = 100 + (sd->imbalance_pct-100)/2; - -	do { -		unsigned long load, avg_load; -		int local_group; -		int i; - -		/* Skip over this group if it has no CPUs allowed */ -		if (!cpumask_intersects(sched_group_cpus(group), -					&p->cpus_allowed)) -			continue; - -		local_group = cpumask_test_cpu(this_cpu, -					       sched_group_cpus(group)); - -		/* Tally up the load of all CPUs in the group */ -		avg_load = 0; - -		for_each_cpu(i, sched_group_cpus(group)) { -			/* Bias balancing toward cpus of our domain */ -			if (local_group) -				load = source_load(i, load_idx); -			else -				load = target_load(i, load_idx); +	int dest_cpu; +	const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); -			avg_load += load; -		} +	/* Look for allowed, online CPU in same node. */ +	for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) +		if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) +			return dest_cpu; -		/* Adjust by relative CPU power of the group */ -		avg_load = sg_div_cpu_power(group, -				avg_load * SCHED_LOAD_SCALE); +	/* Any allowed, online CPU? */ +	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); +	if (dest_cpu < nr_cpu_ids) +		return dest_cpu; -		if (local_group) { -			this_load = avg_load; -			this = group; -		} else if (avg_load < min_load) { -			min_load = avg_load; -			idlest = group; -		} -	} while (group = group->next, group != sd->groups); +	/* No more Mr. Nice Guy. */ +	dest_cpu = cpuset_cpus_allowed_fallback(p); +	/* +	 * Don't tell them about moving exiting tasks or +	 * kernel threads (both mm NULL), since they never +	 * leave kernel. 
+	 */ +	if (p->mm && printk_ratelimit()) { +		printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", +				task_pid_nr(p), p->comm, cpu); +	} -	if (!idlest || 100*this_load < imbalance*min_load) -		return NULL; -	return idlest; +	return dest_cpu;  }  /* - * find_idlest_cpu - find the idlest cpu among the cpus in group. + * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.   */ -static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +static inline +int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)  { -	unsigned long load, min_load = ULONG_MAX; -	int idlest = -1; -	int i; - -	/* Traverse only the allowed CPUs */ -	for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { -		load = weighted_cpuload(i); +	int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); -		if (load < min_load || (load == min_load && i == this_cpu)) { -			min_load = load; -			idlest = i; -		} -	} +	/* +	 * In order not to call set_task_cpu() on a blocking task we need +	 * to rely on ttwu() to place the task on a valid ->cpus_allowed +	 * cpu. +	 * +	 * Since this is common to all placement strategies, this lives here. +	 * +	 * [ this allows ->select_task() to simply return task_cpu(p) and +	 *   not worry about this generic constraint ] +	 */ +	if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || +		     !cpu_online(cpu))) +		cpu = select_fallback_rq(task_cpu(p), p); -	return idlest; +	return cpu;  } -/* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. - * - * Balance, ie. select the least loaded group. - * - * Returns the target CPU number, or the same CPU if no balancing is needed. - * - * preempt must be disabled. 
- */ -static int sched_balance_self(int cpu, int flag) +static void update_avg(u64 *avg, u64 sample)  { -	struct task_struct *t = current; -	struct sched_domain *tmp, *sd = NULL; - -	for_each_domain(cpu, tmp) { -		/* -		 * If power savings logic is enabled for a domain, stop there. -		 */ -		if (tmp->flags & SD_POWERSAVINGS_BALANCE) -			break; -		if (tmp->flags & flag) -			sd = tmp; -	} +	s64 diff = sample - *avg; +	*avg += diff >> 3; +} +#endif -	if (sd) -		update_shares(sd); +static inline void ttwu_activate(struct task_struct *p, struct rq *rq, +				 bool is_sync, bool is_migrate, bool is_local, +				 unsigned long en_flags) +{ +	schedstat_inc(p, se.statistics.nr_wakeups); +	if (is_sync) +		schedstat_inc(p, se.statistics.nr_wakeups_sync); +	if (is_migrate) +		schedstat_inc(p, se.statistics.nr_wakeups_migrate); +	if (is_local) +		schedstat_inc(p, se.statistics.nr_wakeups_local); +	else +		schedstat_inc(p, se.statistics.nr_wakeups_remote); -	while (sd) { -		struct sched_group *group; -		int new_cpu, weight; +	activate_task(rq, p, en_flags); +} -		if (!(sd->flags & flag)) { -			sd = sd->child; -			continue; -		} +static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, +					int wake_flags, bool success) +{ +	trace_sched_wakeup(p, success); +	check_preempt_curr(rq, p, wake_flags); -		group = find_idlest_group(sd, t, cpu); -		if (!group) { -			sd = sd->child; -			continue; -		} +	p->state = TASK_RUNNING; +#ifdef CONFIG_SMP +	if (p->sched_class->task_woken) +		p->sched_class->task_woken(rq, p); -		new_cpu = find_idlest_cpu(group, t, cpu); -		if (new_cpu == -1 || new_cpu == cpu) { -			/* Now try balancing at a lower domain level of cpu */ -			sd = sd->child; -			continue; -		} +	if (unlikely(rq->idle_stamp)) { +		u64 delta = rq->clock - rq->idle_stamp; +		u64 max = 2*sysctl_sched_migration_cost; -		/* Now try balancing at a lower domain level of new_cpu */ -		cpu = new_cpu; -		weight = cpumask_weight(sched_domain_span(sd)); -		sd = NULL; -		
for_each_domain(cpu, tmp) { -			if (weight <= cpumask_weight(sched_domain_span(tmp))) -				break; -			if (tmp->flags & flag) -				sd = tmp; -		} -		/* while loop will break here if sd == NULL */ +		if (delta > max) +			rq->avg_idle = max; +		else +			update_avg(&rq->avg_idle, delta); +		rq->idle_stamp = 0;  	} - -	return cpu; +#endif +	/* if a worker is waking up, notify workqueue */ +	if ((p->flags & PF_WQ_WORKER) && success) +		wq_worker_waking_up(p, cpu_of(rq));  } -#endif /* CONFIG_SMP */ -  /** - * task_oncpu_function_call - call a function on the cpu on which a task runs - * @p:		the task to evaluate - * @func:	the function to be called - * @info:	the function call argument - * - * Calls the function @func when the task is currently running. This might - * be on the current CPU, which just calls the function directly - */ -void task_oncpu_function_call(struct task_struct *p, -			      void (*func) (void *info), void *info) -{ -	int cpu; - -	preempt_disable(); -	cpu = task_cpu(p); -	if (task_curr(p)) -		smp_call_function_single(cpu, func, info, 1); -	preempt_enable(); -} - -/***   * try_to_wake_up - wake up a thread - * @p: the to-be-woken-up thread + * @p: the thread to be awakened   * @state: the mask of task states that can be woken - * @sync: do a synchronous wakeup? + * @wake_flags: wake modifier flags (WF_*)   *   * Put it on the run-queue if it's not already there. The "current"   * thread is always on the run-queue (except when the actual @@ -2409,39 +2408,22 @@ void task_oncpu_function_call(struct task_struct *p,   * the simpler "current->state = TASK_RUNNING" to mark yourself   * runnable without the overhead of this.   * - * returns failure only if the task is already active. + * Returns %true if @p was woken up, %false if it was already running + * or @state didn't match @p's state.   
*/ -static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) +static int try_to_wake_up(struct task_struct *p, unsigned int state, +			  int wake_flags)  {  	int cpu, orig_cpu, this_cpu, success = 0;  	unsigned long flags; -	long old_state; +	unsigned long en_flags = ENQUEUE_WAKEUP;  	struct rq *rq; -	if (!sched_feat(SYNC_WAKEUPS)) -		sync = 0; - -#ifdef CONFIG_SMP -	if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { -		struct sched_domain *sd; - -		this_cpu = raw_smp_processor_id(); -		cpu = task_cpu(p); - -		for_each_domain(this_cpu, sd) { -			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { -				update_shares(sd); -				break; -			} -		} -	} -#endif +	this_cpu = get_cpu();  	smp_wmb();  	rq = task_rq_lock(p, &flags); -	update_rq_clock(rq); -	old_state = p->state; -	if (!(old_state & state)) +	if (!(p->state & state))  		goto out;  	if (p->se.on_rq) @@ -2449,28 +2431,47 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)  	cpu = task_cpu(p);  	orig_cpu = cpu; -	this_cpu = smp_processor_id();  #ifdef CONFIG_SMP  	if (unlikely(task_running(rq, p)))  		goto out_activate; -	cpu = p->sched_class->select_task_rq(p, sync); -	if (cpu != orig_cpu) { -		set_task_cpu(p, cpu); -		task_rq_unlock(rq, &flags); -		/* might preempt at this point */ -		rq = task_rq_lock(p, &flags); -		old_state = p->state; -		if (!(old_state & state)) -			goto out; -		if (p->se.on_rq) -			goto out_running; +	/* +	 * In order to handle concurrent wakeups and release the rq->lock +	 * we put the task in TASK_WAKING state. 
+	 * +	 * First fix up the nr_uninterruptible count: +	 */ +	if (task_contributes_to_load(p)) { +		if (likely(cpu_online(orig_cpu))) +			rq->nr_uninterruptible--; +		else +			this_rq()->nr_uninterruptible--; +	} +	p->state = TASK_WAKING; -		this_cpu = smp_processor_id(); -		cpu = task_cpu(p); +	if (p->sched_class->task_waking) { +		p->sched_class->task_waking(rq, p); +		en_flags |= ENQUEUE_WAKING;  	} +	cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); +	if (cpu != orig_cpu) +		set_task_cpu(p, cpu); +	__task_rq_unlock(rq); + +	rq = cpu_rq(cpu); +	raw_spin_lock(&rq->lock); + +	/* +	 * We migrated the task without holding either rq->lock, however +	 * since the task is not on the task list itself, nobody else +	 * will try and migrate the task, hence the rq should match the +	 * cpu we just moved it to. +	 */ +	WARN_ON(task_cpu(p) != cpu); +	WARN_ON(p->state != TASK_WAKING); +  #ifdef CONFIG_SCHEDSTATS  	schedstat_inc(rq, ttwu_count);  	if (cpu == this_cpu) @@ -2488,50 +2489,50 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)  out_activate:  #endif /* CONFIG_SMP */ -	schedstat_inc(p, se.nr_wakeups); -	if (sync) -		schedstat_inc(p, se.nr_wakeups_sync); -	if (orig_cpu != cpu) -		schedstat_inc(p, se.nr_wakeups_migrate); -	if (cpu == this_cpu) -		schedstat_inc(p, se.nr_wakeups_local); -	else -		schedstat_inc(p, se.nr_wakeups_remote); -	activate_task(rq, p, 1); +	ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, +		      cpu == this_cpu, en_flags);  	success = 1; - -	/* -	 * Only attribute actual wakeups done by this task. 
-	 */ -	if (!in_interrupt()) { -		struct sched_entity *se = &current->se; -		u64 sample = se->sum_exec_runtime; - -		if (se->last_wakeup) -			sample -= se->last_wakeup; -		else -			sample -= se->start_runtime; -		update_avg(&se->avg_wakeup, sample); - -		se->last_wakeup = se->sum_exec_runtime; -	} -  out_running: -	trace_sched_wakeup(rq, p, success); -	check_preempt_curr(rq, p, sync); - -	p->state = TASK_RUNNING; -#ifdef CONFIG_SMP -	if (p->sched_class->task_wake_up) -		p->sched_class->task_wake_up(rq, p); -#endif +	ttwu_post_activation(p, rq, wake_flags, success);  out:  	task_rq_unlock(rq, &flags); +	put_cpu();  	return success;  }  /** + * try_to_wake_up_local - try to wake up a local task with rq lock held + * @p: the thread to be awakened + * + * Put @p on the run-queue if it's not alredy there.  The caller must + * ensure that this_rq() is locked, @p is bound to this_rq() and not + * the current task.  this_rq() stays locked over invocation. + */ +static void try_to_wake_up_local(struct task_struct *p) +{ +	struct rq *rq = task_rq(p); +	bool success = false; + +	BUG_ON(rq != this_rq()); +	BUG_ON(p == current); +	lockdep_assert_held(&rq->lock); + +	if (!(p->state & TASK_NORMAL)) +		return; + +	if (!p->se.on_rq) { +		if (likely(!task_running(rq, p))) { +			schedstat_inc(rq, ttwu_count); +			schedstat_inc(rq, ttwu_local); +		} +		ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); +		success = true; +	} +	ttwu_post_activation(p, rq, 0, success); +} + +/**   * wake_up_process - Wake up a specific process   * @p: The process to be woken up.   
* @@ -2565,21 +2566,9 @@ static void __sched_fork(struct task_struct *p)  	p->se.sum_exec_runtime		= 0;  	p->se.prev_sum_exec_runtime	= 0;  	p->se.nr_migrations		= 0; -	p->se.last_wakeup		= 0; -	p->se.avg_overlap		= 0; -	p->se.start_runtime		= 0; -	p->se.avg_wakeup		= sysctl_sched_wakeup_granularity;  #ifdef CONFIG_SCHEDSTATS -	p->se.wait_start		= 0; -	p->se.sum_sleep_runtime		= 0; -	p->se.sleep_start		= 0; -	p->se.block_start		= 0; -	p->se.sleep_max			= 0; -	p->se.block_max			= 0; -	p->se.exec_max			= 0; -	p->se.slice_max			= 0; -	p->se.wait_max			= 0; +	memset(&p->se.statistics, 0, sizeof(p->se.statistics));  #endif  	INIT_LIST_HEAD(&p->rt.run_list); @@ -2589,14 +2578,6 @@ static void __sched_fork(struct task_struct *p)  #ifdef CONFIG_PREEMPT_NOTIFIERS  	INIT_HLIST_HEAD(&p->preempt_notifiers);  #endif - -	/* -	 * We mark the process as running here, but have not actually -	 * inserted it onto the runqueue yet. This guarantees that -	 * nobody will actually run it, and a signal or other external -	 * event cannot wake it up and insert it on the runqueue either. -	 */ -	p->state = TASK_RUNNING;  }  /* @@ -2607,19 +2588,57 @@ void sched_fork(struct task_struct *p, int clone_flags)  	int cpu = get_cpu();  	__sched_fork(p); +	/* +	 * We mark the process as running here. This guarantees that +	 * nobody will actually run it, and a signal or other external +	 * event cannot wake it up and insert it on the runqueue either. +	 */ +	p->state = TASK_RUNNING; -#ifdef CONFIG_SMP -	cpu = sched_balance_self(cpu, SD_BALANCE_FORK); -#endif -	set_task_cpu(p, cpu); +	/* +	 * Revert to default priority/policy on fork if requested. 
+	 */ +	if (unlikely(p->sched_reset_on_fork)) { +		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { +			p->policy = SCHED_NORMAL; +			p->normal_prio = p->static_prio; +		} + +		if (PRIO_TO_NICE(p->static_prio) < 0) { +			p->static_prio = NICE_TO_PRIO(0); +			p->normal_prio = p->static_prio; +			set_load_weight(p); +		} + +		/* +		 * We don't need the reset flag anymore after the fork. It has +		 * fulfilled its duty: +		 */ +		p->sched_reset_on_fork = 0; +	}  	/* -	 * Make sure we do not leak PI boosting priority to the child: +	 * Make sure we do not leak PI boosting priority to the child.  	 */  	p->prio = current->normal_prio; +  	if (!rt_prio(p->prio))  		p->sched_class = &fair_sched_class; +	if (p->sched_class->task_fork) +		p->sched_class->task_fork(p); + +	/* +	 * The child is not yet in the pid-hash so no cgroup attach races, +	 * and the cgroup is pinned to this child due to cgroup_fork() +	 * is ran before sched_fork(). +	 * +	 * Silence PROVE_RCU. +	 */ +	rcu_read_lock(); +	set_task_cpu(p, cpu); +	rcu_read_unlock(); +  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)  	if (likely(sched_info_on()))  		memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -2631,7 +2650,9 @@ void sched_fork(struct task_struct *p, int clone_flags)  	/* Want to start with kernel preemption disabled. 
*/  	task_thread_info(p)->preempt_count = 1;  #endif +#ifdef CONFIG_SMP  	plist_node_init(&p->pushable_tasks, MAX_PRIO); +#endif  	put_cpu();  } @@ -2647,30 +2668,37 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)  {  	unsigned long flags;  	struct rq *rq; +	int cpu __maybe_unused = get_cpu(); +#ifdef CONFIG_SMP  	rq = task_rq_lock(p, &flags); -	BUG_ON(p->state != TASK_RUNNING); -	update_rq_clock(rq); +	p->state = TASK_WAKING; -	p->prio = effective_prio(p); +	/* +	 * Fork balancing, do it here and not earlier because: +	 *  - cpus_allowed can change in the fork path +	 *  - any previously selected cpu might disappear through hotplug +	 * +	 * We set TASK_WAKING so that select_task_rq() can drop rq->lock +	 * without people poking at ->cpus_allowed. +	 */ +	cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); +	set_task_cpu(p, cpu); -	if (!p->sched_class->task_new || !current->se.on_rq) { -		activate_task(rq, p, 0); -	} else { -		/* -		 * Let the scheduling class do new task startup -		 * management (if any): -		 */ -		p->sched_class->task_new(rq, p); -		inc_nr_running(rq); -	} -	trace_sched_wakeup_new(rq, p, 1); -	check_preempt_curr(rq, p, 0); +	p->state = TASK_RUNNING; +	task_rq_unlock(rq, &flags); +#endif + +	rq = task_rq_lock(p, &flags); +	activate_task(rq, p, 0); +	trace_sched_wakeup_new(p, 1); +	check_preempt_curr(rq, p, WF_FORK);  #ifdef CONFIG_SMP -	if (p->sched_class->task_wake_up) -		p->sched_class->task_wake_up(rq, p); +	if (p->sched_class->task_woken) +		p->sched_class->task_woken(rq, p);  #endif  	task_rq_unlock(rq, &flags); +	put_cpu();  }  #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -2773,12 +2801,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  {  	struct mm_struct *mm = rq->prev_mm;  	long prev_state; -#ifdef CONFIG_SMP -	int post_schedule = 0; - -	if (current->sched_class->needs_post_schedule) -		post_schedule = current->sched_class->needs_post_schedule(rq); -#endif  	rq->prev_mm = NULL; @@ -2795,12 
+2817,14 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  	 */  	prev_state = prev->state;  	finish_arch_switch(prev); -	perf_counter_task_sched_in(current, cpu_of(rq)); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +	local_irq_disable(); +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ +	perf_event_task_sched_in(current); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +	local_irq_enable(); +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */  	finish_lock_switch(rq, prev); -#ifdef CONFIG_SMP -	if (post_schedule) -		current->sched_class->post_schedule(rq); -#endif  	fire_sched_in_preempt_notifiers(current);  	if (mm) @@ -2815,6 +2839,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  	}  } +#ifdef CONFIG_SMP + +/* assumes rq->lock is held */ +static inline void pre_schedule(struct rq *rq, struct task_struct *prev) +{ +	if (prev->sched_class->pre_schedule) +		prev->sched_class->pre_schedule(rq, prev); +} + +/* rq->lock is NOT held, but preemption is disabled */ +static inline void post_schedule(struct rq *rq) +{ +	if (rq->post_schedule) { +		unsigned long flags; + +		raw_spin_lock_irqsave(&rq->lock, flags); +		if (rq->curr->sched_class->post_schedule) +			rq->curr->sched_class->post_schedule(rq); +		raw_spin_unlock_irqrestore(&rq->lock, flags); + +		rq->post_schedule = 0; +	} +} + +#else + +static inline void pre_schedule(struct rq *rq, struct task_struct *p) +{ +} + +static inline void post_schedule(struct rq *rq) +{ +} + +#endif +  /**   * schedule_tail - first thing a freshly forked thread must call.   * @prev: the thread we just switched away from. @@ -2825,6 +2885,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)  	struct rq *rq = this_rq();  	finish_task_switch(rq, prev); + +	/* +	 * FIXME: do we need to worry about rq being invalidated by the +	 * task_switch? 
+	 */ +	post_schedule(rq); +  #ifdef __ARCH_WANT_UNLOCKED_CTXSW  	/* In this case, finish_task_switch does not reenable preemption */  	preempt_enable(); @@ -2844,7 +2911,7 @@ context_switch(struct rq *rq, struct task_struct *prev,  	struct mm_struct *mm, *oldmm;  	prepare_task_switch(rq, prev, next); -	trace_sched_switch(rq, prev, next); +	trace_sched_switch(prev, next);  	mm = next->mm;  	oldmm = prev->active_mm;  	/* @@ -2854,14 +2921,14 @@ context_switch(struct rq *rq, struct task_struct *prev,  	 */  	arch_start_context_switch(prev); -	if (unlikely(!mm)) { +	if (!mm) {  		next->active_mm = oldmm;  		atomic_inc(&oldmm->mm_count);  		enter_lazy_tlb(oldmm, next);  	} else  		switch_mm(oldmm, mm, next); -	if (unlikely(!prev->mm)) { +	if (!prev->mm) {  		prev->active_mm = NULL;  		rq->prev_mm = oldmm;  	} @@ -2942,63 +3009,28 @@ unsigned long nr_iowait(void)  	return sum;  } -/* Variables and functions for calc_load */ -static atomic_long_t calc_load_tasks; -static unsigned long calc_load_update; -unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); - -/** - * get_avenrun - get the load average array - * @loads:	pointer to dest load array - * @offset:	offset to add - * @shift:	shift count to shift the result left - * - * These values are estimates at best, so no need for locking. 
- */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +unsigned long nr_iowait_cpu(int cpu)  { -	loads[0] = (avenrun[0] + offset) << shift; -	loads[1] = (avenrun[1] + offset) << shift; -	loads[2] = (avenrun[2] + offset) << shift; +	struct rq *this = cpu_rq(cpu); +	return atomic_read(&this->nr_iowait);  } -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) +unsigned long this_cpu_load(void)  { -	load *= exp; -	load += active * (FIXED_1 - exp); -	return load >> FSHIFT; +	struct rq *this = this_rq(); +	return this->cpu_load[0];  } -/* - * calc_load - update the avenrun load estimates 10 ticks after the - * CPUs have updated calc_load_tasks. - */ -void calc_global_load(void) -{ -	unsigned long upd = calc_load_update + 10; -	long active; - -	if (time_before(jiffies, upd)) -		return; - -	active = atomic_long_read(&calc_load_tasks); -	active = active > 0 ? active * FIXED_1 : 0; - -	avenrun[0] = calc_load(avenrun[0], EXP_1, active); -	avenrun[1] = calc_load(avenrun[1], EXP_5, active); -	avenrun[2] = calc_load(avenrun[2], EXP_15, active); -	calc_load_update += LOAD_FREQ; -} +/* Variables and functions for calc_load */ +static atomic_long_t calc_load_tasks; +static unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); -/* - * Either called from update_cpu_load() or from a cpu going idle - */ -static void calc_load_account_active(struct rq *this_rq) +static long calc_load_fold_active(struct rq *this_rq)  { -	long nr_active, delta; +	long nr_active, delta = 0;  	nr_active = this_rq->nr_running;  	nr_active += (long) this_rq->nr_uninterruptible; @@ -3006,1796 +3038,393 @@ static void calc_load_account_active(struct rq *this_rq)  	if (nr_active != this_rq->calc_load_active) {  		delta = nr_active - this_rq->calc_load_active;  		this_rq->calc_load_active = nr_active; -		atomic_long_add(delta, &calc_load_tasks); -	} -} - -/* - * Externally visible per-cpu scheduler statistics: - * 
cpu_nr_migrations(cpu) - number of migrations into that cpu - */ -u64 cpu_nr_migrations(int cpu) -{ -	return cpu_rq(cpu)->nr_migrations_in; -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). - */ -static void update_cpu_load(struct rq *this_rq) -{ -	unsigned long this_load = this_rq->load.weight; -	int i, scale; - -	this_rq->nr_load_updates++; - -	/* Update our load: */ -	for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { -		unsigned long old_load, new_load; - -		/* scale is effectively 1 << i now, and >> i divides by scale */ - -		old_load = this_rq->cpu_load[i]; -		new_load = this_load; -		/* -		 * Round up the averaging division if load is increasing. This -		 * prevents us from getting stuck on 9 if the load is 10, for -		 * example. -		 */ -		if (new_load > old_load) -			new_load += scale-1; -		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;  	} -	if (time_after_eq(jiffies, this_rq->calc_load_update)) { -		this_rq->calc_load_update += LOAD_FREQ; -		calc_load_account_active(this_rq); -	} +	return delta;  } -#ifdef CONFIG_SMP - -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. 
- */ -static void double_rq_lock(struct rq *rq1, struct rq *rq2) -	__acquires(rq1->lock) -	__acquires(rq2->lock) +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active)  { -	BUG_ON(!irqs_disabled()); -	if (rq1 == rq2) { -		spin_lock(&rq1->lock); -		__acquire(rq2->lock);	/* Fake it out ;) */ -	} else { -		if (rq1 < rq2) { -			spin_lock(&rq1->lock); -			spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); -		} else { -			spin_lock(&rq2->lock); -			spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); -		} -	} -	update_rq_clock(rq1); -	update_rq_clock(rq2); +	load *= exp; +	load += active * (FIXED_1 - exp); +	load += 1UL << (FSHIFT - 1); +	return load >> FSHIFT;  } +#ifdef CONFIG_NO_HZ  /* - * double_rq_unlock - safely unlock two runqueues + * For NO_HZ we delay the active fold to the next LOAD_FREQ update.   * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. + * When making the ILB scale, we should try to pull this in as well.   */ -static void double_rq_unlock(struct rq *rq1, struct rq *rq2) -	__releases(rq1->lock) -	__releases(rq2->lock) -{ -	spin_unlock(&rq1->lock); -	if (rq1 != rq2) -		spin_unlock(&rq2->lock); -	else -		__release(rq2->lock); -} +static atomic_long_t calc_load_tasks_idle; -/* - * If dest_cpu is allowed for this process, migrate the task to it. - * This is accomplished by forcing the cpu_allowed mask to only - * allow dest_cpu, which will force the cpu onto dest_cpu. Then - * the cpu_allowed mask is restored. 
- */ -static void sched_migrate_task(struct task_struct *p, int dest_cpu) +static void calc_load_account_idle(struct rq *this_rq)  { -	struct migration_req req; -	unsigned long flags; -	struct rq *rq; - -	rq = task_rq_lock(p, &flags); -	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) -	    || unlikely(!cpu_active(dest_cpu))) -		goto out; +	long delta; -	/* force the process onto the specified CPU */ -	if (migrate_task(p, dest_cpu, &req)) { -		/* Need to wait for migration thread (might exit: take ref). */ -		struct task_struct *mt = rq->migration_thread; - -		get_task_struct(mt); -		task_rq_unlock(rq, &flags); -		wake_up_process(mt); -		put_task_struct(mt); -		wait_for_completion(&req.done); - -		return; -	} -out: -	task_rq_unlock(rq, &flags); +	delta = calc_load_fold_active(this_rq); +	if (delta) +		atomic_long_add(delta, &calc_load_tasks_idle);  } -/* - * sched_exec - execve() is a valuable balancing opportunity, because at - * this point the task has the smallest effective memory and cache footprint. - */ -void sched_exec(void) +static long calc_load_fold_idle(void)  { -	int new_cpu, this_cpu = get_cpu(); -	new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); -	put_cpu(); -	if (new_cpu != this_cpu) -		sched_migrate_task(current, new_cpu); -} +	long delta = 0; -/* - * pull_task - move a task from a remote runqueue to the local runqueue. - * Both runqueues must be locked. - */ -static void pull_task(struct rq *src_rq, struct task_struct *p, -		      struct rq *this_rq, int this_cpu) -{ -	deactivate_task(src_rq, p, 0); -	set_task_cpu(p, this_cpu); -	activate_task(this_rq, p, 0);  	/* -	 * Note that idle threads have a prio of MAX_PRIO, for this test -	 * to be always true for them. +	 * Its got a race, we don't care...  	 */ -	check_preempt_curr(this_rq, p, 0); -} +	if (atomic_long_read(&calc_load_tasks_idle)) +		delta = atomic_long_xchg(&calc_load_tasks_idle, 0); -/* - * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 
- */ -static -int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, -		     struct sched_domain *sd, enum cpu_idle_type idle, -		     int *all_pinned) -{ -	int tsk_cache_hot = 0; -	/* -	 * We do not migrate tasks that are: -	 * 1) running (obviously), or -	 * 2) cannot be migrated to this CPU due to cpus_allowed, or -	 * 3) are cache-hot on their current CPU. -	 */ -	if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { -		schedstat_inc(p, se.nr_failed_migrations_affine); -		return 0; -	} -	*all_pinned = 0; - -	if (task_running(rq, p)) { -		schedstat_inc(p, se.nr_failed_migrations_running); -		return 0; -	} - -	/* -	 * Aggressive migration if: -	 * 1) task is cache cold, or -	 * 2) too many balance attempts have failed. -	 */ - -	tsk_cache_hot = task_hot(p, rq->clock, sd); -	if (!tsk_cache_hot || -		sd->nr_balance_failed > sd->cache_nice_tries) { -#ifdef CONFIG_SCHEDSTATS -		if (tsk_cache_hot) { -			schedstat_inc(sd, lb_hot_gained[idle]); -			schedstat_inc(p, se.nr_forced_migrations); -		} -#endif -		return 1; -	} - -	if (tsk_cache_hot) { -		schedstat_inc(p, se.nr_failed_migrations_hot); -		return 0; -	} -	return 1; +	return delta;  } -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, -	      unsigned long max_load_move, struct sched_domain *sd, -	      enum cpu_idle_type idle, int *all_pinned, -	      int *this_best_prio, struct rq_iterator *iterator) -{ -	int loops = 0, pulled = 0, pinned = 0; -	struct task_struct *p; -	long rem_load_move = max_load_move; - -	if (max_load_move == 0) -		goto out; - -	pinned = 1; - -	/* -	 * Start the load-balancing iterator: -	 */ -	p = iterator->start(iterator->arg); -next: -	if (!p || loops++ > sysctl_sched_nr_migrate) -		goto out; - -	if ((p->se.load.weight >> 1) > rem_load_move || -	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { -		p = iterator->next(iterator->arg); -		goto next; -	} - -	pull_task(busiest, p, this_rq, this_cpu); -	pulled++; -	
rem_load_move -= p->se.load.weight; - -#ifdef CONFIG_PREEMPT -	/* -	 * NEWIDLE balancing is a source of latency, so preemptible kernels -	 * will stop after the first task is pulled to minimize the critical -	 * section. -	 */ -	if (idle == CPU_NEWLY_IDLE) -		goto out; -#endif - -	/* -	 * We only want to steal up to the prescribed amount of weighted load. -	 */ -	if (rem_load_move > 0) { -		if (p->prio < *this_best_prio) -			*this_best_prio = p->prio; -		p = iterator->next(iterator->arg); -		goto next; -	} -out: -	/* -	 * Right now, this is one of only two places pull_task() is called, -	 * so we can safely collect pull_task() stats here rather than -	 * inside pull_task(). -	 */ -	schedstat_add(sd, lb_gained[idle], pulled); - -	if (all_pinned) -		*all_pinned = pinned; - -	return max_load_move - rem_load_move; -} - -/* - * move_tasks tries to move up to max_load_move weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x:         base of the power + * @frac_bits: fractional bits of @x + * @n:         power to raise @x to.   * - * Called with both runqueues locked. + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector.   
*/ -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, -		      unsigned long max_load_move, -		      struct sched_domain *sd, enum cpu_idle_type idle, -		      int *all_pinned) -{ -	const struct sched_class *class = sched_class_highest; -	unsigned long total_load_moved = 0; -	int this_best_prio = this_rq->curr->prio; - -	do { -		total_load_moved += -			class->load_balance(this_rq, this_cpu, busiest, -				max_load_move - total_load_moved, -				sd, idle, all_pinned, &this_best_prio); -		class = class->next; - -#ifdef CONFIG_PREEMPT -		/* -		 * NEWIDLE balancing is a source of latency, so preemptible -		 * kernels will stop after the first task is pulled to minimize -		 * the critical section. -		 */ -		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) -			break; -#endif -	} while (class && max_load_move > total_load_moved); - -	return total_load_moved > 0; -} - -static int -iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, -		   struct sched_domain *sd, enum cpu_idle_type idle, -		   struct rq_iterator *iterator) +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)  { -	struct task_struct *p = iterator->start(iterator->arg); -	int pinned = 0; +	unsigned long result = 1UL << frac_bits; -	while (p) { -		if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { -			pull_task(busiest, p, this_rq, this_cpu); -			/* -			 * Right now, this is only the second place pull_task() -			 * is called, so we can safely collect pull_task() -			 * stats here rather than inside pull_task(). 
-			 */ -			schedstat_inc(sd, lb_gained[idle]); - -			return 1; +	if (n) for (;;) { +		if (n & 1) { +			result *= x; +			result += 1UL << (frac_bits - 1); +			result >>= frac_bits;  		} -		p = iterator->next(iterator->arg); +		n >>= 1; +		if (!n) +			break; +		x *= x; +		x += 1UL << (frac_bits - 1); +		x >>= frac_bits;  	} -	return 0; +	return result;  }  /* - * move_one_task tries to move exactly one task from busiest to this_rq, as - * part of active balancing operations within "domain". - * Returns 1 if successful and 0 otherwise. + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + *    = (a0 * e + a * (1 - e)) * e + a * (1 - e) + *    = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + *  ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + *    = a0 * e^n + a * (1 - e^n)   * - * Called with both runqueues locked. + * [1] application of the geometric series: + * + *              n         1 - x^(n+1) + *     S_n := \Sum x^i = ------------- + *             i=0          1 - x   */ -static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, -			 struct sched_domain *sd, enum cpu_idle_type idle) +static unsigned long +calc_load_n(unsigned long load, unsigned long exp, +	    unsigned long active, unsigned int n)  { -	const struct sched_class *class; - -	for (class = sched_class_highest; class; class = class->next) -		if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) -			return 1; -	return 0; +	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);  } -/********** Helpers for find_busiest_group ************************/ -/* - * sd_lb_stats - Structure to store the statistics of a sched_domain - * 		during load balancing. 
- */ -struct sd_lb_stats { -	struct sched_group *busiest; /* Busiest group in this sd */ -	struct sched_group *this;  /* Local group in this sd */ -	unsigned long total_load;  /* Total load of all groups in sd */ -	unsigned long total_pwr;   /*	Total power of all groups in sd */ -	unsigned long avg_load;	   /* Average load across all groups in sd */ - -	/** Statistics of this group */ -	unsigned long this_load; -	unsigned long this_load_per_task; -	unsigned long this_nr_running; - -	/* Statistics of the busiest group */ -	unsigned long max_load; -	unsigned long busiest_load_per_task; -	unsigned long busiest_nr_running; - -	int group_imb; /* Is there imbalance in this sd */ -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -	int power_savings_balance; /* Is powersave balance needed for this sd */ -	struct sched_group *group_min; /* Least loaded group in sd */ -	struct sched_group *group_leader; /* Group which relieves group_min */ -	unsigned long min_load_per_task; /* load_per_task in group_min */ -	unsigned long leader_nr_running; /* Nr running of group_leader */ -	unsigned long min_nr_running; /* Nr running of group_min */ -#endif -};  /* - * sg_lb_stats - stats of a sched_group required for load_balancing - */ -struct sg_lb_stats { -	unsigned long avg_load; /*Avg load across the CPUs of the group */ -	unsigned long group_load; /* Total load over the CPUs of the group */ -	unsigned long sum_nr_running; /* Nr tasks running in the group */ -	unsigned long sum_weighted_load; /* Weighted load of group's tasks */ -	unsigned long group_capacity; -	int group_imb; /* Is there an imbalance in the group ? */ -}; - -/** - * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. - * @group: The group whose first cpu is to be returned. - */ -static inline unsigned int group_first_cpu(struct sched_group *group) -{ -	return cpumask_first(sched_group_cpus(group)); -} - -/** - * get_sd_load_idx - Obtain the load index for a given sched domain. 
- * @sd: The sched_domain whose load_idx is to be obtained. - * @idle: The Idle status of the CPU for whose sd load_icx is obtained. - */ -static inline int get_sd_load_idx(struct sched_domain *sd, -					enum cpu_idle_type idle) -{ -	int load_idx; - -	switch (idle) { -	case CPU_NOT_IDLE: -		load_idx = sd->busy_idx; -		break; - -	case CPU_NEWLY_IDLE: -		load_idx = sd->newidle_idx; -		break; -	default: -		load_idx = sd->idle_idx; -		break; -	} - -	return load_idx; -} - - -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * init_sd_power_savings_stats - Initialize power savings statistics for - * the given sched_domain, during load balancing. + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary.   * - * @sd: Sched domain whose power-savings statistics are to be initialized. - * @sds: Variable containing the statistics for sd. - * @idle: Idle status of the CPU at which we're performing load-balancing. + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed.   */ -static inline void init_sd_power_savings_stats(struct sched_domain *sd, -	struct sd_lb_stats *sds, enum cpu_idle_type idle) +static void calc_global_nohz(unsigned long ticks)  { -	/* -	 * Busy processors will not participate in power savings -	 * balance. -	 */ -	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) -		sds->power_savings_balance = 0; -	else { -		sds->power_savings_balance = 1; -		sds->min_nr_running = ULONG_MAX; -		sds->leader_nr_running = 0; -	} -} - -/** - * update_sd_power_savings_stats - Update the power saving stats for a - * sched_domain while performing load balancing. - * - * @group: sched_group belonging to the sched_domain under consideration. 
- * @sds: Variable containing the statistics of the sched_domain - * @local_group: Does group contain the CPU for which we're performing - * 		load balancing ? - * @sgs: Variable containing the statistics of the group. - */ -static inline void update_sd_power_savings_stats(struct sched_group *group, -	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) -{ - -	if (!sds->power_savings_balance) -		return; - -	/* -	 * If the local group is idle or completely loaded -	 * no need to do power savings balance at this domain -	 */ -	if (local_group && (sds->this_nr_running >= sgs->group_capacity || -				!sds->this_nr_running)) -		sds->power_savings_balance = 0; +	long delta, active, n; -	/* -	 * If a group is already running at full capacity or idle, -	 * don't include that group in power savings calculations -	 */ -	if (!sds->power_savings_balance || -		sgs->sum_nr_running >= sgs->group_capacity || -		!sgs->sum_nr_running) +	if (time_before(jiffies, calc_load_update))  		return;  	/* -	 * Calculate the group which has the least non-idle load. -	 * This is the group from where we need to pick up the load -	 * for saving power +	 * If we crossed a calc_load_update boundary, make sure to fold +	 * any pending idle changes, the respective CPUs might have +	 * missed the tick driven calc_load_account_active() update +	 * due to NO_HZ.  	 
*/ -	if ((sgs->sum_nr_running < sds->min_nr_running) || -	    (sgs->sum_nr_running == sds->min_nr_running && -	     group_first_cpu(group) > group_first_cpu(sds->group_min))) { -		sds->group_min = group; -		sds->min_nr_running = sgs->sum_nr_running; -		sds->min_load_per_task = sgs->sum_weighted_load / -						sgs->sum_nr_running; -	} +	delta = calc_load_fold_idle(); +	if (delta) +		atomic_long_add(delta, &calc_load_tasks);  	/* -	 * Calculate the group which is almost near its -	 * capacity but still has some space to pick up some load -	 * from other group and save more power +	 * If we were idle for multiple load cycles, apply them.  	 */ -	if (sgs->sum_nr_running > sgs->group_capacity - 1) -		return; +	if (ticks >= LOAD_FREQ) { +		n = ticks / LOAD_FREQ; -	if (sgs->sum_nr_running > sds->leader_nr_running || -	    (sgs->sum_nr_running == sds->leader_nr_running && -	     group_first_cpu(group) < group_first_cpu(sds->group_leader))) { -		sds->group_leader = group; -		sds->leader_nr_running = sgs->sum_nr_running; -	} -} +		active = atomic_long_read(&calc_load_tasks); +		active = active > 0 ? active * FIXED_1 : 0; -/** - * check_power_save_busiest_group - see if there is potential for some power-savings balance - * @sds: Variable containing the statistics of the sched_domain - *	under consideration. - * @this_cpu: Cpu at which we're currently performing load-balancing. - * @imbalance: Variable to store the imbalance. - * - * Description: - * Check if we have potential to perform some power-savings balance. - * If yes, set the busiest group to be the least loaded group in the - * sched_domain, so that it's CPUs can be put to idle. - * - * Returns 1 if there is potential to perform power-savings balance. - * Else returns 0. 
- */ -static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, -					int this_cpu, unsigned long *imbalance) -{ -	if (!sds->power_savings_balance) -		return 0; +		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); +		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); +		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); -	if (sds->this != sds->group_leader || -			sds->group_leader == sds->group_min) -		return 0; - -	*imbalance = sds->min_load_per_task; -	sds->busiest = sds->group_min; - -	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { -		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = -			group_first_cpu(sds->group_leader); +		calc_load_update += n * LOAD_FREQ;  	} -	return 1; - -} -#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -static inline void init_sd_power_savings_stats(struct sched_domain *sd, -	struct sd_lb_stats *sds, enum cpu_idle_type idle) -{ -	return; -} - -static inline void update_sd_power_savings_stats(struct sched_group *group, -	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) -{ -	return; -} - -static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, -					int this_cpu, unsigned long *imbalance) -{ -	return 0; -} -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ - - -/** - * update_sg_lb_stats - Update sched_group's statistics for load balancing. - * @group: sched_group whose statistics are to be updated. - * @this_cpu: Cpu for which load balance is currently performed. - * @idle: Idle status of this_cpu - * @load_idx: Load index of sched_domain of this_cpu for load calc. - * @sd_idle: Idle status of the sched_domain containing group. - * @local_group: Does group contain this_cpu. - * @cpus: Set of cpus considered for load balancing. - * @balance: Should we balance. - * @sgs: variable to hold the statistics for this group. 
- */ -static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, -			enum cpu_idle_type idle, int load_idx, int *sd_idle, -			int local_group, const struct cpumask *cpus, -			int *balance, struct sg_lb_stats *sgs) -{ -	unsigned long load, max_cpu_load, min_cpu_load; -	int i; -	unsigned int balance_cpu = -1, first_idle_cpu = 0; -	unsigned long sum_avg_load_per_task; -	unsigned long avg_load_per_task; - -	if (local_group) -		balance_cpu = group_first_cpu(group); - -	/* Tally up the load of all CPUs in the group */ -	sum_avg_load_per_task = avg_load_per_task = 0; -	max_cpu_load = 0; -	min_cpu_load = ~0UL; - -	for_each_cpu_and(i, sched_group_cpus(group), cpus) { -		struct rq *rq = cpu_rq(i); - -		if (*sd_idle && rq->nr_running) -			*sd_idle = 0; - -		/* Bias balancing toward cpus of our domain */ -		if (local_group) { -			if (idle_cpu(i) && !first_idle_cpu) { -				first_idle_cpu = 1; -				balance_cpu = i; -			} - -			load = target_load(i, load_idx); -		} else { -			load = source_load(i, load_idx); -			if (load > max_cpu_load) -				max_cpu_load = load; -			if (min_cpu_load > load) -				min_cpu_load = load; -		} - -		sgs->group_load += load; -		sgs->sum_nr_running += rq->nr_running; -		sgs->sum_weighted_load += weighted_cpuload(i); - -		sum_avg_load_per_task += cpu_avg_load_per_task(i); -	} - -	/* -	 * First idle cpu or the first cpu(busiest) in this sched group -	 * is eligible for doing load balancing at this and above -	 * domains. In the newly idle case, we will allow all the cpu's -	 * to do the newly idle load balance. -	 */ -	if (idle != CPU_NEWLY_IDLE && local_group && -	    balance_cpu != this_cpu && balance) { -		*balance = 0; -		return; -	} - -	/* Adjust by relative CPU power of the group */ -	sgs->avg_load = sg_div_cpu_power(group, -			sgs->group_load * SCHED_LOAD_SCALE); - -  	/* -	 * Consider the group unbalanced when the imbalance is larger -	 * than the average weight of two tasks. 
+	 * Its possible the remainder of the above division also crosses +	 * a LOAD_FREQ period, the regular check in calc_global_load() +	 * which comes after this will take care of that.  	 * -	 * APZ: with cgroup the avg task weight can vary wildly and -	 *      might not be a suitable number - should we keep a -	 *      normalized nr_running number somewhere that negates -	 *      the hierarchy? +	 * Consider us being 11 ticks before a cycle completion, and us +	 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will +	 * age us 4 cycles, and the test in calc_global_load() will +	 * pick up the final one.  	 */ -	avg_load_per_task = sg_div_cpu_power(group, -			sum_avg_load_per_task * SCHED_LOAD_SCALE); - -	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) -		sgs->group_imb = 1; - -	sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; -  } - -/** - * update_sd_lb_stats - Update sched_group's statistics for load balancing. - * @sd: sched_domain whose statistics are to be updated. - * @this_cpu: Cpu for which load balance is currently performed. - * @idle: Idle status of this_cpu - * @sd_idle: Idle status of the sched_domain containing group. - * @cpus: Set of cpus considered for load balancing. - * @balance: Should we balance. - * @sds: variable to hold the statistics for this sched_domain. 
- */ -static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, -			enum cpu_idle_type idle, int *sd_idle, -			const struct cpumask *cpus, int *balance, -			struct sd_lb_stats *sds) +#else +static void calc_load_account_idle(struct rq *this_rq)  { -	struct sched_group *group = sd->groups; -	struct sg_lb_stats sgs; -	int load_idx; - -	init_sd_power_savings_stats(sd, sds, idle); -	load_idx = get_sd_load_idx(sd, idle); - -	do { -		int local_group; - -		local_group = cpumask_test_cpu(this_cpu, -					       sched_group_cpus(group)); -		memset(&sgs, 0, sizeof(sgs)); -		update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, -				local_group, cpus, balance, &sgs); - -		if (local_group && balance && !(*balance)) -			return; - -		sds->total_load += sgs.group_load; -		sds->total_pwr += group->__cpu_power; - -		if (local_group) { -			sds->this_load = sgs.avg_load; -			sds->this = group; -			sds->this_nr_running = sgs.sum_nr_running; -			sds->this_load_per_task = sgs.sum_weighted_load; -		} else if (sgs.avg_load > sds->max_load && -			   (sgs.sum_nr_running > sgs.group_capacity || -				sgs.group_imb)) { -			sds->max_load = sgs.avg_load; -			sds->busiest = group; -			sds->busiest_nr_running = sgs.sum_nr_running; -			sds->busiest_load_per_task = sgs.sum_weighted_load; -			sds->group_imb = sgs.group_imb; -		} - -		update_sd_power_savings_stats(group, sds, local_group, &sgs); -		group = group->next; -	} while (group != sd->groups); -  } -/** - * fix_small_imbalance - Calculate the minor imbalance that exists - *			amongst the groups of a sched_domain, during - *			load balancing. - * @sds: Statistics of the sched_domain whose imbalance is to be calculated. - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: Variable to store the imbalance. 
- */ -static inline void fix_small_imbalance(struct sd_lb_stats *sds, -				int this_cpu, unsigned long *imbalance) -{ -	unsigned long tmp, pwr_now = 0, pwr_move = 0; -	unsigned int imbn = 2; - -	if (sds->this_nr_running) { -		sds->this_load_per_task /= sds->this_nr_running; -		if (sds->busiest_load_per_task > -				sds->this_load_per_task) -			imbn = 1; -	} else -		sds->this_load_per_task = -			cpu_avg_load_per_task(this_cpu); - -	if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= -			sds->busiest_load_per_task * imbn) { -		*imbalance = sds->busiest_load_per_task; -		return; -	} - -	/* -	 * OK, we don't have enough imbalance to justify moving tasks, -	 * however we may be able to increase total CPU power used by -	 * moving them. -	 */ - -	pwr_now += sds->busiest->__cpu_power * -			min(sds->busiest_load_per_task, sds->max_load); -	pwr_now += sds->this->__cpu_power * -			min(sds->this_load_per_task, sds->this_load); -	pwr_now /= SCHED_LOAD_SCALE; - -	/* Amount of load we'd subtract */ -	tmp = sg_div_cpu_power(sds->busiest, -			sds->busiest_load_per_task * SCHED_LOAD_SCALE); -	if (sds->max_load > tmp) -		pwr_move += sds->busiest->__cpu_power * -			min(sds->busiest_load_per_task, sds->max_load - tmp); - -	/* Amount of load we'd add */ -	if (sds->max_load * sds->busiest->__cpu_power < -		sds->busiest_load_per_task * SCHED_LOAD_SCALE) -		tmp = sg_div_cpu_power(sds->this, -			sds->max_load * sds->busiest->__cpu_power); -	else -		tmp = sg_div_cpu_power(sds->this, -			sds->busiest_load_per_task * SCHED_LOAD_SCALE); -	pwr_move += sds->this->__cpu_power * -			min(sds->this_load_per_task, sds->this_load + tmp); -	pwr_move /= SCHED_LOAD_SCALE; - -	/* Move if we gain throughput */ -	if (pwr_move > pwr_now) -		*imbalance = sds->busiest_load_per_task; +static inline long calc_load_fold_idle(void) +{ +	return 0;  } -/** - * calculate_imbalance - Calculate the amount of imbalance present within the - *			 groups of a given sched_domain during load balance. 
- * @sds: statistics of the sched_domain whose imbalance is to be calculated. - * @this_cpu: Cpu for which currently load balance is being performed. - * @imbalance: The variable to store the imbalance. - */ -static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, -		unsigned long *imbalance) +static void calc_global_nohz(unsigned long ticks)  { -	unsigned long max_pull; -	/* -	 * In the presence of smp nice balancing, certain scenarios can have -	 * max load less than avg load(as we skip the groups at or below -	 * its cpu_power, while calculating max_load..) -	 */ -	if (sds->max_load < sds->avg_load) { -		*imbalance = 0; -		return fix_small_imbalance(sds, this_cpu, imbalance); -	} - -	/* Don't want to pull so many tasks that a group would go idle */ -	max_pull = min(sds->max_load - sds->avg_load, -			sds->max_load - sds->busiest_load_per_task); - -	/* How much load to actually move to equalise the imbalance */ -	*imbalance = min(max_pull * sds->busiest->__cpu_power, -		(sds->avg_load - sds->this_load) * sds->this->__cpu_power) -			/ SCHED_LOAD_SCALE; - -	/* -	 * if *imbalance is less than the average load per runnable task -	 * there is no gaurantee that any tasks will be moved so we'll have -	 * a think about bumping its value to force at least one task to be -	 * moved -	 */ -	if (*imbalance < sds->busiest_load_per_task) -		return fix_small_imbalance(sds, this_cpu, imbalance); -  } -/******* find_busiest_group() helpers end here *********************/ +#endif  /** - * find_busiest_group - Returns the busiest group within the sched_domain - * if there is an imbalance. If there isn't an imbalance, and - * the user has opted for power-savings, it returns a group whose - * CPUs can be put to idle by rebalancing those tasks elsewhere, if - * such a group exists. - * - * Also calculates the amount of weighted load which should be moved - * to restore balance. - * - * @sd: The sched_domain whose busiest group is to be returned. 
- * @this_cpu: The cpu for which load balancing is currently being performed. - * @imbalance: Variable which stores amount of weighted load which should - *		be moved to restore balance/put a group to idle. - * @idle: The idle status of this_cpu. - * @sd_idle: The idleness of sd - * @cpus: The set of CPUs under consideration for load-balancing. - * @balance: Pointer to a variable indicating if this_cpu - *	is the appropriate cpu to perform load balancing at this_level. + * get_avenrun - get the load average array + * @loads:	pointer to dest load array + * @offset:	offset to add + * @shift:	shift count to shift the result left   * - * Returns:	- the busiest group if imbalance exists. - *		- If no imbalance and user has opted for power-savings balance, - *		   return the least loaded group whose CPUs can be - *		   put to idle by rebalancing its tasks onto our group. - */ -static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, -		   unsigned long *imbalance, enum cpu_idle_type idle, -		   int *sd_idle, const struct cpumask *cpus, int *balance) -{ -	struct sd_lb_stats sds; - -	memset(&sds, 0, sizeof(sds)); - -	/* -	 * Compute the various statistics relavent for load balancing at -	 * this level. -	 */ -	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, -					balance, &sds); - -	/* Cases where imbalance does not exist from POV of this_cpu */ -	/* 1) this_cpu is not the appropriate cpu to perform load balancing -	 *    at this level. -	 * 2) There is no busy sibling group to pull from. -	 * 3) This group is the busiest group. -	 * 4) This group is more busy than the avg busieness at this -	 *    sched_domain. -	 * 5) The imbalance is within the specified limit. 
-	 * 6) Any rebalance would lead to ping-pong -	 */ -	if (balance && !(*balance)) -		goto ret; - -	if (!sds.busiest || sds.busiest_nr_running == 0) -		goto out_balanced; - -	if (sds.this_load >= sds.max_load) -		goto out_balanced; - -	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; - -	if (sds.this_load >= sds.avg_load) -		goto out_balanced; - -	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) -		goto out_balanced; - -	sds.busiest_load_per_task /= sds.busiest_nr_running; -	if (sds.group_imb) -		sds.busiest_load_per_task = -			min(sds.busiest_load_per_task, sds.avg_load); - -	/* -	 * We're trying to get all the cpus to the average_load, so we don't -	 * want to push ourselves above the average load, nor do we wish to -	 * reduce the max loaded cpu below the average load, as either of these -	 * actions would just result in more rebalancing later, and ping-pong -	 * tasks around. Thus we look for the minimum possible imbalance. -	 * Negative imbalances (*we* are more loaded than anyone else) will -	 * be counted as no imbalance for these purposes -- we can't fix that -	 * by pulling tasks to us. Be careful of negative numbers as they'll -	 * appear as very large values with unsigned longs. -	 */ -	if (sds.max_load <= sds.busiest_load_per_task) -		goto out_balanced; - -	/* Looks like there is an imbalance. Compute it */ -	calculate_imbalance(&sds, this_cpu, imbalance); -	return sds.busiest; - -out_balanced: -	/* -	 * There is no obvious imbalance. But check if we can do some balancing -	 * to save power. -	 */ -	if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) -		return sds.busiest; -ret: -	*imbalance = 0; -	return NULL; -} - -/* - * find_busiest_queue - find the busiest runqueue among the cpus in group. 
- */ -static struct rq * -find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, -		   unsigned long imbalance, const struct cpumask *cpus) -{ -	struct rq *busiest = NULL, *rq; -	unsigned long max_load = 0; -	int i; - -	for_each_cpu(i, sched_group_cpus(group)) { -		unsigned long wl; - -		if (!cpumask_test_cpu(i, cpus)) -			continue; - -		rq = cpu_rq(i); -		wl = weighted_cpuload(i); - -		if (rq->nr_running == 1 && wl > imbalance) -			continue; - -		if (wl > max_load) { -			max_load = wl; -			busiest = rq; -		} -	} - -	return busiest; -} - -/* - * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but - * so long as it is large enough. - */ -#define MAX_PINNED_INTERVAL	512 - -/* Working cpumask for load_balance and load_balance_newidle. */ -static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); - -/* - * Check this_cpu to ensure it is balanced within domain. Attempt to move - * tasks if there is an imbalance. + * These values are estimates at best, so no need for locking.   */ -static int load_balance(int this_cpu, struct rq *this_rq, -			struct sched_domain *sd, enum cpu_idle_type idle, -			int *balance) +void get_avenrun(unsigned long *loads, unsigned long offset, int shift)  { -	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; -	struct sched_group *group; -	unsigned long imbalance; -	struct rq *busiest; -	unsigned long flags; -	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - -	cpumask_setall(cpus); - -	/* -	 * When power savings policy is enabled for the parent domain, idle -	 * sibling can pick up load irrespective of busy siblings. In this case, -	 * let the state of idle sibling percolate up as CPU_IDLE, instead of -	 * portraying it as CPU_NOT_IDLE. 
-	 */ -	if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && -	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) -		sd_idle = 1; - -	schedstat_inc(sd, lb_count[idle]); - -redo: -	update_shares(sd); -	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, -				   cpus, balance); - -	if (*balance == 0) -		goto out_balanced; - -	if (!group) { -		schedstat_inc(sd, lb_nobusyg[idle]); -		goto out_balanced; -	} - -	busiest = find_busiest_queue(group, idle, imbalance, cpus); -	if (!busiest) { -		schedstat_inc(sd, lb_nobusyq[idle]); -		goto out_balanced; -	} - -	BUG_ON(busiest == this_rq); - -	schedstat_add(sd, lb_imbalance[idle], imbalance); - -	ld_moved = 0; -	if (busiest->nr_running > 1) { -		/* -		 * Attempt to move tasks. If find_busiest_group has found -		 * an imbalance but busiest->nr_running <= 1, the group is -		 * still unbalanced. ld_moved simply stays zero, so it is -		 * correctly treated as an imbalance. -		 */ -		local_irq_save(flags); -		double_rq_lock(this_rq, busiest); -		ld_moved = move_tasks(this_rq, this_cpu, busiest, -				      imbalance, sd, idle, &all_pinned); -		double_rq_unlock(this_rq, busiest); -		local_irq_restore(flags); - -		/* -		 * some other cpu did the load balance for us. 
-		 */ -		if (ld_moved && this_cpu != smp_processor_id()) -			resched_cpu(this_cpu); - -		/* All tasks on this runqueue were pinned by CPU affinity */ -		if (unlikely(all_pinned)) { -			cpumask_clear_cpu(cpu_of(busiest), cpus); -			if (!cpumask_empty(cpus)) -				goto redo; -			goto out_balanced; -		} -	} - -	if (!ld_moved) { -		schedstat_inc(sd, lb_failed[idle]); -		sd->nr_balance_failed++; - -		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - -			spin_lock_irqsave(&busiest->lock, flags); - -			/* don't kick the migration_thread, if the curr -			 * task on busiest cpu can't be moved to this_cpu -			 */ -			if (!cpumask_test_cpu(this_cpu, -					      &busiest->curr->cpus_allowed)) { -				spin_unlock_irqrestore(&busiest->lock, flags); -				all_pinned = 1; -				goto out_one_pinned; -			} - -			if (!busiest->active_balance) { -				busiest->active_balance = 1; -				busiest->push_cpu = this_cpu; -				active_balance = 1; -			} -			spin_unlock_irqrestore(&busiest->lock, flags); -			if (active_balance) -				wake_up_process(busiest->migration_thread); - -			/* -			 * We've kicked active balancing, reset the failure -			 * counter. -			 */ -			sd->nr_balance_failed = sd->cache_nice_tries+1; -		} -	} else -		sd->nr_balance_failed = 0; - -	if (likely(!active_balance)) { -		/* We were unbalanced, so reset the balancing interval */ -		sd->balance_interval = sd->min_interval; -	} else { -		/* -		 * If we've begun active balancing, start to back off. This -		 * case may not be covered by the all_pinned logic if there -		 * is only 1 task on the busy runqueue (because we don't call -		 * move_tasks). 
-		 */ -		if (sd->balance_interval < sd->max_interval) -			sd->balance_interval *= 2; -	} - -	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && -	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) -		ld_moved = -1; - -	goto out; - -out_balanced: -	schedstat_inc(sd, lb_balanced[idle]); - -	sd->nr_balance_failed = 0; - -out_one_pinned: -	/* tune up the balancing interval */ -	if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || -			(sd->balance_interval < sd->max_interval)) -		sd->balance_interval *= 2; - -	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && -	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) -		ld_moved = -1; -	else -		ld_moved = 0; -out: -	if (ld_moved) -		update_shares(sd); -	return ld_moved; +	loads[0] = (avenrun[0] + offset) << shift; +	loads[1] = (avenrun[1] + offset) << shift; +	loads[2] = (avenrun[2] + offset) << shift;  }  /* - * Check this_cpu to ensure it is balanced within domain. Attempt to move - * tasks if there is an imbalance. - * - * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). - * this_rq is locked. + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks.   */ -static int -load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) +void calc_global_load(unsigned long ticks)  { -	struct sched_group *group; -	struct rq *busiest = NULL; -	unsigned long imbalance; -	int ld_moved = 0; -	int sd_idle = 0; -	int all_pinned = 0; -	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - -	cpumask_setall(cpus); - -	/* -	 * When power savings policy is enabled for the parent domain, idle -	 * sibling can pick up load irrespective of busy siblings. In this case, -	 * let the state of idle sibling percolate up as IDLE, instead of -	 * portraying it as CPU_NOT_IDLE. 
-	 */ -	if (sd->flags & SD_SHARE_CPUPOWER && -	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) -		sd_idle = 1; - -	schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); -redo: -	update_shares_locked(this_rq, sd); -	group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, -				   &sd_idle, cpus, NULL); -	if (!group) { -		schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); -		goto out_balanced; -	} - -	busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); -	if (!busiest) { -		schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); -		goto out_balanced; -	} - -	BUG_ON(busiest == this_rq); - -	schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); - -	ld_moved = 0; -	if (busiest->nr_running > 1) { -		/* Attempt to move tasks */ -		double_lock_balance(this_rq, busiest); -		/* this_rq->clock is already updated */ -		update_rq_clock(busiest); -		ld_moved = move_tasks(this_rq, this_cpu, busiest, -					imbalance, sd, CPU_NEWLY_IDLE, -					&all_pinned); -		double_unlock_balance(this_rq, busiest); - -		if (unlikely(all_pinned)) { -			cpumask_clear_cpu(cpu_of(busiest), cpus); -			if (!cpumask_empty(cpus)) -				goto redo; -		} -	} - -	if (!ld_moved) { -		int active_balance = 0; - -		schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); -		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && -		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) -			return -1; - -		if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) -			return -1; - -		if (sd->nr_balance_failed++ < 2) -			return -1; - -		/* -		 * The only task running in a non-idle cpu can be moved to this -		 * cpu in an attempt to completely freeup the other CPU -		 * package. The same method used to move task in load_balance() -		 * have been extended for load_balance_newidle() to speedup -		 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) -		 * -		 * The package power saving logic comes from -		 * find_busiest_group().  If there are no imbalance, then -		 * f_b_g() will return NULL.  
However when sched_mc={1,2} then -		 * f_b_g() will select a group from which a running task may be -		 * pulled to this cpu in order to make the other package idle. -		 * If there is no opportunity to make a package idle and if -		 * there are no imbalance, then f_b_g() will return NULL and no -		 * action will be taken in load_balance_newidle(). -		 * -		 * Under normal task pull operation due to imbalance, there -		 * will be more than one task in the source run queue and -		 * move_tasks() will succeed.  ld_moved will be true and this -		 * active balance code will not be triggered. -		 */ - -		/* Lock busiest in correct order while this_rq is held */ -		double_lock_balance(this_rq, busiest); - -		/* -		 * don't kick the migration_thread, if the curr -		 * task on busiest cpu can't be moved to this_cpu -		 */ -		if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { -			double_unlock_balance(this_rq, busiest); -			all_pinned = 1; -			return ld_moved; -		} - -		if (!busiest->active_balance) { -			busiest->active_balance = 1; -			busiest->push_cpu = this_cpu; -			active_balance = 1; -		} - -		double_unlock_balance(this_rq, busiest); -		/* -		 * Should not call ttwu while holding a rq->lock -		 */ -		spin_unlock(&this_rq->lock); -		if (active_balance) -			wake_up_process(busiest->migration_thread); -		spin_lock(&this_rq->lock); - -	} else -		sd->nr_balance_failed = 0; - -	update_shares_locked(this_rq, sd); -	return ld_moved; - -out_balanced: -	schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); -	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && -	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) -		return -1; -	sd->nr_balance_failed = 0; +	long active; -	return 0; -} +	calc_global_nohz(ticks); -/* - * idle_balance is called by schedule() if this_cpu is about to become - * idle. Attempts to pull tasks from other CPUs. 
- */ -static void idle_balance(int this_cpu, struct rq *this_rq) -{ -	struct sched_domain *sd; -	int pulled_task = 0; -	unsigned long next_balance = jiffies + HZ; - -	for_each_domain(this_cpu, sd) { -		unsigned long interval; +	if (time_before(jiffies, calc_load_update + 10)) +		return; -		if (!(sd->flags & SD_LOAD_BALANCE)) -			continue; +	active = atomic_long_read(&calc_load_tasks); +	active = active > 0 ? active * FIXED_1 : 0; -		if (sd->flags & SD_BALANCE_NEWIDLE) -			/* If we've pulled tasks over stop searching: */ -			pulled_task = load_balance_newidle(this_cpu, this_rq, -							   sd); +	avenrun[0] = calc_load(avenrun[0], EXP_1, active); +	avenrun[1] = calc_load(avenrun[1], EXP_5, active); +	avenrun[2] = calc_load(avenrun[2], EXP_15, active); -		interval = msecs_to_jiffies(sd->balance_interval); -		if (time_after(next_balance, sd->last_balance + interval)) -			next_balance = sd->last_balance + interval; -		if (pulled_task) -			break; -	} -	if (pulled_task || time_after(jiffies, this_rq->next_balance)) { -		/* -		 * We are going idle. next_balance may be set based on -		 * a busy processor. So reset next_balance. -		 */ -		this_rq->next_balance = next_balance; -	} +	calc_load_update += LOAD_FREQ;  }  /* - * active_load_balance is run by migration threads. It pushes running tasks - * off the busiest CPU onto idle CPUs. It requires at least 1 task to be - * running on each physical CPU where possible, and avoids physical / - * logical imbalances. - * - * Called with busiest_rq locked. + * Called from update_cpu_load() to periodically update this CPU's + * active count.   */ -static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) +static void calc_load_account_active(struct rq *this_rq)  { -	int target_cpu = busiest_rq->push_cpu; -	struct sched_domain *sd; -	struct rq *target_rq; +	long delta; -	/* Is there any task to move? 
*/ -	if (busiest_rq->nr_running <= 1) +	if (time_before(jiffies, this_rq->calc_load_update))  		return; -	target_rq = cpu_rq(target_cpu); - -	/* -	 * This condition is "impossible", if it occurs -	 * we need to fix it. Originally reported by -	 * Bjorn Helgaas on a 128-cpu setup. -	 */ -	BUG_ON(busiest_rq == target_rq); - -	/* move a task from busiest_rq to target_rq */ -	double_lock_balance(busiest_rq, target_rq); -	update_rq_clock(busiest_rq); -	update_rq_clock(target_rq); - -	/* Search for an sd spanning us and the target CPU. */ -	for_each_domain(target_cpu, sd) { -		if ((sd->flags & SD_LOAD_BALANCE) && -		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) -				break; -	} - -	if (likely(sd)) { -		schedstat_inc(sd, alb_count); - -		if (move_one_task(target_rq, target_cpu, busiest_rq, -				  sd, CPU_IDLE)) -			schedstat_inc(sd, alb_pushed); -		else -			schedstat_inc(sd, alb_failed); -	} -	double_unlock_balance(busiest_rq, target_rq); -} - -#ifdef CONFIG_NO_HZ -static struct { -	atomic_t load_balancer; -	cpumask_var_t cpu_mask; -	cpumask_var_t ilb_grp_nohz_mask; -} nohz ____cacheline_aligned = { -	.load_balancer = ATOMIC_INIT(-1), -}; +	delta  = calc_load_fold_active(this_rq); +	delta += calc_load_fold_idle(); +	if (delta) +		atomic_long_add(delta, &calc_load_tasks); -int get_nohz_load_balancer(void) -{ -	return atomic_read(&nohz.load_balancer); +	this_rq->calc_load_update += LOAD_FREQ;  } -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * lowest_flag_domain - Return lowest sched_domain containing flag. - * @cpu:	The cpu whose lowest level of sched domain is to - *		be returned. - * @flag:	The flag to check for the lowest sched_domain - *		for the given cpu. +/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load   * - * Returns the lowest sched_domain of a cpu which contains the given flag. 
- */ -static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) -{ -	struct sched_domain *sd; - -	for_each_domain(cpu, sd) -		if (sd && (sd->flags & flag)) -			break; - -	return sd; -} - -/** - * for_each_flag_domain - Iterates over sched_domains containing the flag. - * @cpu:	The cpu whose domains we're iterating over. - * @sd:		variable holding the value of the power_savings_sd - *		for cpu. - * @flag:	The flag to filter the sched_domains to be iterated. + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load   * - * Iterates over all the scheduler domains for a given cpu that has the 'flag' - * set, starting from the lowest sched_domain to the highest. - */ -#define for_each_flag_domain(cpu, sd, flag) \ -	for (sd = lowest_flag_domain(cpu, flag); \ -		(sd && (sd->flags & flag)); sd = sd->parent) - -/** - * is_semi_idle_group - Checks if the given sched_group is semi-idle. - * @ilb_group:	group to be checked for semi-idleness + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load   * - * Returns:	1 if the group is semi-idle. 0 otherwise. + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).   * - * We define a sched_group to be semi idle if it has atleast one idle-CPU - * and atleast one non-idle CPU. 
This helper function checks if the given - * sched_group is semi-idle or not. + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT		7 +static const unsigned char +		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char +		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { +					{0, 0, 0, 0, 0, 0, 0, 0}, +					{64, 32, 8, 0, 0, 0, 0, 0}, +					{96, 72, 40, 12, 1, 0, 0}, +					{112, 98, 75, 43, 15, 1, 0}, +					{120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load.   */ -static inline int is_semi_idle_group(struct sched_group *ilb_group) +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)  { -	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, -					sched_group_cpus(ilb_group)); +	int j = 0; -	/* -	 * A sched_group is semi-idle when it has atleast one busy cpu -	 * and atleast one idle cpu. -	 */ -	if (cpumask_empty(nohz.ilb_grp_nohz_mask)) -		return 0; +	if (!missed_updates) +		return load; -	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) +	if (missed_updates >= degrade_zero_ticks[idx])  		return 0; -	return 1; -} -/** - * find_new_ilb - Finds the optimum idle load balancer for nomination. - * @cpu:	The cpu which is nominating a new idle_load_balancer. - * - * Returns:	Returns the id of the idle load balancer if it exists, - *		Else, returns >= nr_cpu_ids. - * - * This algorithm picks the idle load balancer such that it belongs to a - * semi-idle powersavings sched_domain. 
The idea is to try and avoid - * completely idle packages/cores just for the purpose of idle load balancing - * when there are other idle cpu's which are better suited for that job. - */ -static int find_new_ilb(int cpu) -{ -	struct sched_domain *sd; -	struct sched_group *ilb_group; - -	/* -	 * Have idle load balancer selection from semi-idle packages only -	 * when power-aware load balancing is enabled -	 */ -	if (!(sched_smt_power_savings || sched_mc_power_savings)) -		goto out_done; - -	/* -	 * Optimize for the case when we have no idle CPUs or only one -	 * idle CPU. Don't walk the sched_domain hierarchy in such cases -	 */ -	if (cpumask_weight(nohz.cpu_mask) < 2) -		goto out_done; - -	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { -		ilb_group = sd->groups; - -		do { -			if (is_semi_idle_group(ilb_group)) -				return cpumask_first(nohz.ilb_grp_nohz_mask); - -			ilb_group = ilb_group->next; - -		} while (ilb_group != sd->groups); -	} - -out_done: -	return cpumask_first(nohz.cpu_mask); -} -#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ -static inline int find_new_ilb(int call_cpu) -{ -	return cpumask_first(nohz.cpu_mask); -} -#endif - -/* - * This routine will try to nominate the ilb (idle load balancing) - * owner among the cpus whose ticks are stopped. ilb owner will do the idle - * load balancing on behalf of all those cpus. If all the cpus in the system - * go into this tickless mode, then there will be no ilb owner (as there is - * no need for one) and all the cpus will sleep till the next wakeup event - * arrives... - * - * For the ilb owner, tick is not stopped. And this tick will be used - * for idle load balancing. ilb owner will still be part of - * nohz.cpu_mask.. - * - * While stopping the tick, this cpu will become the ilb owner if there - * is no other owner. And will be the owner till that cpu becomes busy - * or if all cpus in the system stop their ticks at which point - * there is no need for ilb owner. 
- * - * When the ilb owner becomes busy, it nominates another owner, during the - * next busy scheduler_tick() - */ -int select_nohz_load_balancer(int stop_tick) -{ -	int cpu = smp_processor_id(); - -	if (stop_tick) { -		cpu_rq(cpu)->in_nohz_recently = 1; - -		if (!cpu_active(cpu)) { -			if (atomic_read(&nohz.load_balancer) != cpu) -				return 0; - -			/* -			 * If we are going offline and still the leader, -			 * give up! -			 */ -			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) -				BUG(); - -			return 0; -		} - -		cpumask_set_cpu(cpu, nohz.cpu_mask); - -		/* time for ilb owner also to sleep */ -		if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { -			if (atomic_read(&nohz.load_balancer) == cpu) -				atomic_set(&nohz.load_balancer, -1); -			return 0; -		} - -		if (atomic_read(&nohz.load_balancer) == -1) { -			/* make me the ilb owner */ -			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) -				return 1; -		} else if (atomic_read(&nohz.load_balancer) == cpu) { -			int new_ilb; - -			if (!(sched_smt_power_savings || -						sched_mc_power_savings)) -				return 1; -			/* -			 * Check to see if there is a more power-efficient -			 * ilb. 
-			 */ -			new_ilb = find_new_ilb(cpu); -			if (new_ilb < nr_cpu_ids && new_ilb != cpu) { -				atomic_set(&nohz.load_balancer, -1); -				resched_cpu(new_ilb); -				return 0; -			} -			return 1; -		} -	} else { -		if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) -			return 0; +	if (idx == 1) +		return load >> missed_updates; -		cpumask_clear_cpu(cpu, nohz.cpu_mask); +	while (missed_updates) { +		if (missed_updates % 2) +			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; -		if (atomic_read(&nohz.load_balancer) == cpu) -			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) -				BUG(); +		missed_updates >>= 1; +		j++;  	} -	return 0; +	return load;  } -#endif - -static DEFINE_SPINLOCK(balancing);  /* - * It checks each scheduling domain to see if it is due to be balanced, - * and initiates a balancing operation if so. - * - * Balancing parameters are set up in arch_init_sched_domains. + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies.   
*/ -static void rebalance_domains(int cpu, enum cpu_idle_type idle) +static void update_cpu_load(struct rq *this_rq)  { -	int balance = 1; -	struct rq *rq = cpu_rq(cpu); -	unsigned long interval; -	struct sched_domain *sd; -	/* Earliest time when we have to do rebalance again */ -	unsigned long next_balance = jiffies + 60*HZ; -	int update_next_balance = 0; -	int need_serialize; - -	for_each_domain(cpu, sd) { -		if (!(sd->flags & SD_LOAD_BALANCE)) -			continue; +	unsigned long this_load = this_rq->load.weight; +	unsigned long curr_jiffies = jiffies; +	unsigned long pending_updates; +	int i, scale; -		interval = sd->balance_interval; -		if (idle != CPU_IDLE) -			interval *= sd->busy_factor; +	this_rq->nr_load_updates++; -		/* scale ms to jiffies */ -		interval = msecs_to_jiffies(interval); -		if (unlikely(!interval)) -			interval = 1; -		if (interval > HZ*NR_CPUS/10) -			interval = HZ*NR_CPUS/10; +	/* Avoid repeated calls on same jiffy, when moving in and out of idle */ +	if (curr_jiffies == this_rq->last_load_update_tick) +		return; -		need_serialize = sd->flags & SD_SERIALIZE; +	pending_updates = curr_jiffies - this_rq->last_load_update_tick; +	this_rq->last_load_update_tick = curr_jiffies; -		if (need_serialize) { -			if (!spin_trylock(&balancing)) -				goto out; -		} +	/* Update our load: */ +	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ +	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { +		unsigned long old_load, new_load; -		if (time_after_eq(jiffies, sd->last_balance + interval)) { -			if (load_balance(cpu, rq, sd, idle, &balance)) { -				/* -				 * We've pulled tasks over so either we're no -				 * longer idle, or one of our SMT siblings is -				 * not idle. 
-				 */ -				idle = CPU_NOT_IDLE; -			} -			sd->last_balance = jiffies; -		} -		if (need_serialize) -			spin_unlock(&balancing); -out: -		if (time_after(next_balance, sd->last_balance + interval)) { -			next_balance = sd->last_balance + interval; -			update_next_balance = 1; -		} +		/* scale is effectively 1 << i now, and >> i divides by scale */ +		old_load = this_rq->cpu_load[i]; +		old_load = decay_load_missed(old_load, pending_updates - 1, i); +		new_load = this_load;  		/* -		 * Stop the load balance at this level. There is another -		 * CPU in our sched group which is doing load balancing more -		 * actively. +		 * Round up the averaging division if load is increasing. This +		 * prevents us from getting stuck on 9 if the load is 10, for +		 * example.  		 */ -		if (!balance) -			break; +		if (new_load > old_load) +			new_load += scale - 1; + +		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;  	} -	/* -	 * next_balance will be updated only when there is a need. -	 * When the cpu is attached to null domain for ex, it will not be -	 * updated. -	 */ -	if (likely(update_next_balance)) -		rq->next_balance = next_balance; +	sched_avg_update(this_rq);  } -/* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * In CONFIG_NO_HZ case, the idle load balance owner will do the - * rebalancing for all the cpus for whom scheduler ticks are stopped. - */ -static void run_rebalance_domains(struct softirq_action *h) +static void update_cpu_load_active(struct rq *this_rq)  { -	int this_cpu = smp_processor_id(); -	struct rq *this_rq = cpu_rq(this_cpu); -	enum cpu_idle_type idle = this_rq->idle_at_tick ? -						CPU_IDLE : CPU_NOT_IDLE; - -	rebalance_domains(this_cpu, idle); - -#ifdef CONFIG_NO_HZ -	/* -	 * If this cpu is the owner for idle load balancing, then do the -	 * balancing on behalf of the other idle cpus whose ticks are -	 * stopped. 
-	 */ -	if (this_rq->idle_at_tick && -	    atomic_read(&nohz.load_balancer) == this_cpu) { -		struct rq *rq; -		int balance_cpu; +	update_cpu_load(this_rq); -		for_each_cpu(balance_cpu, nohz.cpu_mask) { -			if (balance_cpu == this_cpu) -				continue; - -			/* -			 * If this cpu gets work to do, stop the load balancing -			 * work being done for other cpus. Next load -			 * balancing owner will pick it up. -			 */ -			if (need_resched()) -				break; - -			rebalance_domains(balance_cpu, CPU_IDLE); - -			rq = cpu_rq(balance_cpu); -			if (time_after(this_rq->next_balance, rq->next_balance)) -				this_rq->next_balance = rq->next_balance; -		} -	} -#endif +	calc_load_account_active(this_rq);  } -static inline int on_null_domain(int cpu) -{ -	return !rcu_dereference(cpu_rq(cpu)->sd); -} +#ifdef CONFIG_SMP  /* - * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. - * - * In case of CONFIG_NO_HZ, this is the place where we nominate a new - * idle load balancing owner or decide to stop the periodic load balancing, - * if the whole system is idle. + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint.   */ -static inline void trigger_load_balance(struct rq *rq, int cpu) +void sched_exec(void)  { -#ifdef CONFIG_NO_HZ -	/* -	 * If we were in the nohz mode recently and busy at the current -	 * scheduler tick, then check if we need to nominate new idle -	 * load balancer. 
-	 */ -	if (rq->in_nohz_recently && !rq->idle_at_tick) { -		rq->in_nohz_recently = 0; - -		if (atomic_read(&nohz.load_balancer) == cpu) { -			cpumask_clear_cpu(cpu, nohz.cpu_mask); -			atomic_set(&nohz.load_balancer, -1); -		} - -		if (atomic_read(&nohz.load_balancer) == -1) { -			int ilb = find_new_ilb(cpu); +	struct task_struct *p = current; +	unsigned long flags; +	struct rq *rq; +	int dest_cpu; -			if (ilb < nr_cpu_ids) -				resched_cpu(ilb); -		} -	} +	rq = task_rq_lock(p, &flags); +	dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); +	if (dest_cpu == smp_processor_id()) +		goto unlock;  	/* -	 * If this cpu is idle and doing idle load balancing for all the -	 * cpus with ticks stopped, is it time for that to stop? +	 * select_task_rq() can race against ->cpus_allowed  	 */ -	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && -	    cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { -		resched_cpu(cpu); -		return; -	} +	if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && +	    likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { +		struct migration_arg arg = { p, dest_cpu }; -	/* -	 * If this cpu is idle and the idle load balancing is done by -	 * someone else, then no need raise the SCHED_SOFTIRQ -	 */ -	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && -	    cpumask_test_cpu(cpu, nohz.cpu_mask)) +		task_rq_unlock(rq, &flags); +		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);  		return; -#endif -	/* Don't need to rebalance while attached to NULL domain */ -	if (time_after_eq(jiffies, rq->next_balance) && -	    likely(!on_null_domain(cpu))) -		raise_softirq(SCHED_SOFTIRQ); -} - -#else	/* CONFIG_SMP */ - -/* - * on UP we do not need to balance between CPUs: - */ -static inline void idle_balance(int cpu, struct rq *rq) -{ +	} +unlock: +	task_rq_unlock(rq, &flags);  }  #endif @@ -4816,7 +3445,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)  	if (task_current(rq, p)) {  		
update_rq_clock(rq); -		ns = rq->clock - p->se.exec_start; +		ns = rq->clock_task - p->se.exec_start;  		if ((s64)ns < 0)  			ns = 0;  	} @@ -4929,8 +3558,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,  	p->gtime = cputime_add(p->gtime, cputime);  	/* Add guest time to cpustat. */ -	cpustat->user = cputime64_add(cpustat->user, tmp); -	cpustat->guest = cputime64_add(cpustat->guest, tmp); +	if (TASK_NICE(p) > 0) { +		cpustat->nice = cputime64_add(cpustat->nice, tmp); +		cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); +	} else { +		cpustat->user = cputime64_add(cpustat->user, tmp); +		cpustat->guest = cputime64_add(cpustat->guest, tmp); +	}  }  /* @@ -4960,7 +3594,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,  	tmp = cputime_to_cputime64(cputime);  	if (hardirq_count() - hardirq_offset)  		cpustat->irq = cputime64_add(cpustat->irq, tmp); -	else if (softirq_count()) +	else if (in_serving_softirq())  		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);  	else  		cpustat->system = cputime64_add(cpustat->system, tmp); @@ -5008,17 +3642,16 @@ void account_idle_time(cputime_t cputime)   */  void account_process_tick(struct task_struct *p, int user_tick)  { -	cputime_t one_jiffy = jiffies_to_cputime(1); -	cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); +	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);  	struct rq *rq = this_rq();  	if (user_tick) -		account_user_time(p, one_jiffy, one_jiffy_scaled); +		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);  	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) -		account_system_time(p, HARDIRQ_OFFSET, one_jiffy, +		account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,  				    one_jiffy_scaled);  	else -		account_idle_time(one_jiffy); +		account_idle_time(cputime_one_jiffy);  }  /* @@ -5046,60 +3679,86 @@ void account_idle_ticks(unsigned long ticks)   * Use precise platform statistics if available:   
*/  #ifdef CONFIG_VIRT_CPU_ACCOUNTING -cputime_t task_utime(struct task_struct *p) +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)  { -	return p->utime; +	*ut = p->utime; +	*st = p->stime;  } -cputime_t task_stime(struct task_struct *p) +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)  { -	return p->stime; +	struct task_cputime cputime; + +	thread_group_cputime(p, &cputime); + +	*ut = cputime.utime; +	*st = cputime.stime;  }  #else -cputime_t task_utime(struct task_struct *p) + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs) +#endif + +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)  { -	clock_t utime = cputime_to_clock_t(p->utime), -		total = utime + cputime_to_clock_t(p->stime); -	u64 temp; +	cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);  	/*  	 * Use CFS's precise accounting:  	 */ -	temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); +	rtime = nsecs_to_cputime(p->se.sum_exec_runtime);  	if (total) { +		u64 temp = rtime; +  		temp *= utime;  		do_div(temp, total); -	} -	utime = (clock_t)temp; +		utime = (cputime_t)temp; +	} else +		utime = rtime; -	p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); -	return p->prev_utime; +	/* +	 * Compare with previous values, to keep monotonicity: +	 */ +	p->prev_utime = max(p->prev_utime, utime); +	p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); + +	*ut = p->prev_utime; +	*st = p->prev_stime;  } -cputime_t task_stime(struct task_struct *p) +/* + * Must be called with siglock held. + */ +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)  { -	clock_t stime; +	struct signal_struct *sig = p->signal; +	struct task_cputime cputime; +	cputime_t rtime, utime, total; -	/* -	 * Use CFS's precise accounting. 
(we subtract utime from -	 * the total, to make sure the total observed by userspace -	 * grows monotonically - apps rely on that): -	 */ -	stime = nsec_to_clock_t(p->se.sum_exec_runtime) - -			cputime_to_clock_t(task_utime(p)); +	thread_group_cputime(p, &cputime); -	if (stime >= 0) -		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); +	total = cputime_add(cputime.utime, cputime.stime); +	rtime = nsecs_to_cputime(cputime.sum_exec_runtime); -	return p->prev_stime; -} -#endif +	if (total) { +		u64 temp = rtime; -inline cputime_t task_gtime(struct task_struct *p) -{ -	return p->gtime; +		temp *= cputime.utime; +		do_div(temp, total); +		utime = (cputime_t)temp; +	} else +		utime = rtime; + +	sig->prev_utime = max(sig->prev_utime, utime); +	sig->prev_stime = max(sig->prev_stime, +			      cputime_sub(rtime, sig->prev_utime)); + +	*ut = sig->prev_utime; +	*st = sig->prev_stime;  } +#endif  /*   * This function gets called by the timer code, with HZ frequency. @@ -5116,13 +3775,13 @@ void scheduler_tick(void)  	sched_clock_tick(); -	spin_lock(&rq->lock); +	raw_spin_lock(&rq->lock);  	update_rq_clock(rq); -	update_cpu_load(rq); +	update_cpu_load_active(rq);  	curr->sched_class->task_tick(rq, curr, 0); -	spin_unlock(&rq->lock); +	raw_spin_unlock(&rq->lock); -	perf_counter_task_tick(curr, cpu); +	perf_event_task_tick();  #ifdef CONFIG_SMP  	rq->idle_at_tick = idle_cpu(cpu); @@ -5236,23 +3895,8 @@ static inline void schedule_debug(struct task_struct *prev)  static void put_prev_task(struct rq *rq, struct task_struct *prev)  { -	if (prev->state == TASK_RUNNING) { -		u64 runtime = prev->se.sum_exec_runtime; - -		runtime -= prev->se.prev_sum_exec_runtime; -		runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); - -		/* -		 * In order to avoid avg_overlap growing stale when we are -		 * indeed overlapping and hence not getting put to sleep, grow -		 * the avg_overlap on preemption. 
-		 * -		 * We use the average preemption runtime because that -		 * correlates to the amount of cache footprint a task can -		 * build up. -		 */ -		update_avg(&prev->se.avg_overlap, runtime); -	} +	if (prev->se.on_rq) +		update_rq_clock(rq);  	prev->sched_class->put_prev_task(rq, prev);  } @@ -5275,17 +3919,13 @@ pick_next_task(struct rq *rq)  			return p;  	} -	class = sched_class_highest; -	for ( ; ; ) { +	for_each_class(class) {  		p = class->pick_next_task(rq);  		if (p)  			return p; -		/* -		 * Will never be NULL as the idle class always -		 * returns a non-NULL p: -		 */ -		class = class->next;  	} + +	BUG(); /* the idle class will always have a runnable task */  }  /* @@ -5302,9 +3942,8 @@ need_resched:  	preempt_disable();  	cpu = smp_processor_id();  	rq = cpu_rq(cpu); -	rcu_qsctr_inc(cpu); +	rcu_note_context_switch(cpu);  	prev = rq->curr; -	switch_count = &prev->nivcsw;  	release_kernel_lock(prev);  need_resched_nonpreemptible: @@ -5314,32 +3953,44 @@ need_resched_nonpreemptible:  	if (sched_feat(HRTICK))  		hrtick_clear(rq); -	spin_lock_irq(&rq->lock); -	update_rq_clock(rq); -	clear_tsk_need_resched(prev); +	raw_spin_lock_irq(&rq->lock); +	switch_count = &prev->nivcsw;  	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { -		if (unlikely(signal_pending_state(prev->state, prev))) +		if (unlikely(signal_pending_state(prev->state, prev))) {  			prev->state = TASK_RUNNING; -		else -			deactivate_task(rq, prev, 1); +		} else { +			/* +			 * If a worker is going to sleep, notify and +			 * ask workqueue whether it wants to wake up a +			 * task to maintain concurrency.  If so, wake +			 * up the task. 
+			 */ +			if (prev->flags & PF_WQ_WORKER) { +				struct task_struct *to_wakeup; + +				to_wakeup = wq_worker_sleeping(prev, cpu); +				if (to_wakeup) +					try_to_wake_up_local(to_wakeup); +			} +			deactivate_task(rq, prev, DEQUEUE_SLEEP); +		}  		switch_count = &prev->nvcsw;  	} -#ifdef CONFIG_SMP -	if (prev->sched_class->pre_schedule) -		prev->sched_class->pre_schedule(rq, prev); -#endif +	pre_schedule(rq, prev);  	if (unlikely(!rq->nr_running))  		idle_balance(cpu, rq);  	put_prev_task(rq, prev);  	next = pick_next_task(rq); +	clear_tsk_need_resched(prev); +	rq->skip_clock_update = 0;  	if (likely(prev != next)) {  		sched_info_switch(prev, next); -		perf_counter_task_sched_out(prev, next, cpu); +		perf_event_task_sched_out(prev, next);  		rq->nr_switches++;  		rq->curr = next; @@ -5347,15 +3998,19 @@ need_resched_nonpreemptible:  		context_switch(rq, prev, next); /* unlocks the rq */  		/* -		 * the context switch might have flipped the stack from under -		 * us, hence refresh the local variables. +		 * The context switch have flipped the stack from under us +		 * and restored the local variables which were saved when +		 * this task called schedule() in the past. prev == current +		 * is still correct, but it can be moved to another cpu/rq.  		 */  		cpu = smp_processor_id();  		rq = cpu_rq(cpu);  	} else -		spin_unlock_irq(&rq->lock); +		raw_spin_unlock_irq(&rq->lock); + +	post_schedule(rq); -	if (unlikely(reacquire_kernel_lock(current) < 0)) +	if (unlikely(reacquire_kernel_lock(prev)))  		goto need_resched_nonpreemptible;  	preempt_enable_no_resched(); @@ -5364,7 +4019,7 @@ need_resched_nonpreemptible:  }  EXPORT_SYMBOL(schedule); -#ifdef CONFIG_SMP +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER  /*   * Look out! "owner" is an entirely speculative pointer   * access and not reliable. @@ -5384,7 +4039,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)  	 * the mutex owner just released it and exited.  	 
*/  	if (probe_kernel_address(&owner->cpu, cpu)) -		goto out; +		return 0;  #else  	cpu = owner->cpu;  #endif @@ -5394,14 +4049,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)  	 * the cpu field may no longer be valid.  	 */  	if (cpu >= nr_cpumask_bits) -		goto out; +		return 0;  	/*  	 * We need to validate that we can do a  	 * get_cpu() and that we have the percpu area.  	 */  	if (!cpu_online(cpu)) -		goto out; +		return 0;  	rq = cpu_rq(cpu); @@ -5409,8 +4064,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)  		/*  		 * Owner changed, break to re-assess state.  		 */ -		if (lock->owner != owner) +		if (lock->owner != owner) { +			/* +			 * If the lock has switched to a different owner, +			 * we likely have heavy contention. Return 0 to quit +			 * optimistic spinning and not contend further: +			 */ +			if (lock->owner) +				return 0;  			break; +		}  		/*  		 * Is that owner really running on that cpu? @@ -5418,9 +4081,9 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)  		if (task_thread_info(rq->curr) != owner || need_resched())  			return 0; -		cpu_relax(); +		arch_mutex_cpu_relax();  	} -out: +  	return 1;  }  #endif @@ -5431,7 +4094,7 @@ out:   * off of preempt_enable. Kernel preemptions off return from interrupt   * occur there and call schedule directly.   
*/ -asmlinkage void __sched preempt_schedule(void) +asmlinkage void __sched notrace preempt_schedule(void)  {  	struct thread_info *ti = current_thread_info(); @@ -5443,9 +4106,9 @@ asmlinkage void __sched preempt_schedule(void)  		return;  	do { -		add_preempt_count(PREEMPT_ACTIVE); +		add_preempt_count_notrace(PREEMPT_ACTIVE);  		schedule(); -		sub_preempt_count(PREEMPT_ACTIVE); +		sub_preempt_count_notrace(PREEMPT_ACTIVE);  		/*  		 * Check again in case we missed a preemption opportunity @@ -5486,10 +4149,10 @@ asmlinkage void __sched preempt_schedule_irq(void)  #endif /* CONFIG_PREEMPT */ -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, +int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,  			  void *key)  { -	return try_to_wake_up(curr->private, mode, sync); +	return try_to_wake_up(curr->private, mode, wake_flags);  }  EXPORT_SYMBOL(default_wake_function); @@ -5503,14 +4166,14 @@ EXPORT_SYMBOL(default_wake_function);   * zero in this (rare) case, and we handle it by continuing to scan the queue.   
*/  static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, -			int nr_exclusive, int sync, void *key) +			int nr_exclusive, int wake_flags, void *key)  {  	wait_queue_t *curr, *next;  	list_for_each_entry_safe(curr, next, &q->task_list, task_list) {  		unsigned flags = curr->flags; -		if (curr->func(curr, mode, sync, key) && +		if (curr->func(curr, mode, wake_flags, key) &&  				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)  			break;  	} @@ -5544,6 +4207,7 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)  {  	__wake_up_common(q, mode, 1, 0, NULL);  } +EXPORT_SYMBOL_GPL(__wake_up_locked);  void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)  { @@ -5571,16 +4235,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,  			int nr_exclusive, void *key)  {  	unsigned long flags; -	int sync = 1; +	int wake_flags = WF_SYNC;  	if (unlikely(!q))  		return;  	if (unlikely(!nr_exclusive)) -		sync = 0; +		wake_flags = 0;  	spin_lock_irqsave(&q->lock, flags); -	__wake_up_common(q, mode, nr_exclusive, sync, key); +	__wake_up_common(q, mode, nr_exclusive, wake_flags, key);  	spin_unlock_irqrestore(&q->lock, flags);  }  EXPORT_SYMBOL_GPL(__wake_up_sync_key); @@ -5643,8 +4307,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)  	if (!x->done) {  		DECLARE_WAITQUEUE(wait, current); -		wait.flags |= WQ_FLAG_EXCLUSIVE; -		__add_wait_queue_tail(&x->wait, &wait); +		__add_wait_queue_tail_exclusive(&x->wait, &wait);  		do {  			if (signal_pending_state(state, current)) {  				timeout = -ERESTARTSYS; @@ -5730,7 +4393,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);   * This waits for either a completion of a specific task to be signaled or for a   * specified timeout to expire. It is interruptible. The timeout is in jiffies.   
*/ -unsigned long __sched +long __sched  wait_for_completion_interruptible_timeout(struct completion *x,  					  unsigned long timeout)  { @@ -5755,6 +4418,23 @@ int __sched wait_for_completion_killable(struct completion *x)  EXPORT_SYMBOL(wait_for_completion_killable);  /** + * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) + * @x:  holds the state of this particular completion + * @timeout:  timeout value in jiffies + * + * This waits for either a completion of a specific task to be + * signaled or for a specified timeout to expire. It can be + * interrupted by a kill signal. The timeout is in jiffies. + */ +long __sched +wait_for_completion_killable_timeout(struct completion *x, +				     unsigned long timeout) +{ +	return wait_for_common(x, timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL(wait_for_completion_killable_timeout); + +/**   *	try_wait_for_completion - try to decrement a completion without blocking   *	@x:	completion structure   * @@ -5768,14 +4448,15 @@ EXPORT_SYMBOL(wait_for_completion_killable);   */  bool try_wait_for_completion(struct completion *x)  { +	unsigned long flags;  	int ret = 1; -	spin_lock_irq(&x->wait.lock); +	spin_lock_irqsave(&x->wait.lock, flags);  	if (!x->done)  		ret = 0;  	else  		x->done--; -	spin_unlock_irq(&x->wait.lock); +	spin_unlock_irqrestore(&x->wait.lock, flags);  	return ret;  }  EXPORT_SYMBOL(try_wait_for_completion); @@ -5790,12 +4471,13 @@ EXPORT_SYMBOL(try_wait_for_completion);   */  bool completion_done(struct completion *x)  { +	unsigned long flags;  	int ret = 1; -	spin_lock_irq(&x->wait.lock); +	spin_lock_irqsave(&x->wait.lock, flags);  	if (!x->done)  		ret = 0; -	spin_unlock_irq(&x->wait.lock); +	spin_unlock_irqrestore(&x->wait.lock, flags);  	return ret;  }  EXPORT_SYMBOL(completion_done); @@ -5863,14 +4545,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)  	unsigned long flags;  	int oldprio, on_rq, running;  	struct rq *rq; -	const struct sched_class 
*prev_class = p->sched_class; +	const struct sched_class *prev_class;  	BUG_ON(prio < 0 || prio > MAX_PRIO);  	rq = task_rq_lock(p, &flags); -	update_rq_clock(rq); +	trace_sched_pi_setprio(p, prio);  	oldprio = p->prio; +	prev_class = p->sched_class;  	on_rq = p->se.on_rq;  	running = task_current(rq, p);  	if (on_rq) @@ -5888,7 +4571,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)  	if (running)  		p->sched_class->set_curr_task(rq);  	if (on_rq) { -		enqueue_task(rq, p, 0); +		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);  		check_class_changed(rq, p, prev_class, oldprio, running);  	} @@ -5910,7 +4593,6 @@ void set_user_nice(struct task_struct *p, long nice)  	 * the task might be in the middle of scheduling on another CPU.  	 */  	rq = task_rq_lock(p, &flags); -	update_rq_clock(rq);  	/*  	 * The RT priorities are set via sched_setscheduler(), but we still  	 * allow the 'normal' nice value to be set - but as expected @@ -5955,7 +4637,7 @@ int can_nice(const struct task_struct *p, const int nice)  	/* convert nice value [19,-20] to rlimit style value [1,40] */  	int nice_rlim = 20 - nice; -	return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || +	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||  		capable(CAP_SYS_NICE));  } @@ -6058,22 +4740,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)  	BUG_ON(p->se.on_rq);  	p->policy = policy; -	switch (p->policy) { -	case SCHED_NORMAL: -	case SCHED_BATCH: -	case SCHED_IDLE: -		p->sched_class = &fair_sched_class; -		break; -	case SCHED_FIFO: -	case SCHED_RR: -		p->sched_class = &rt_sched_class; -		break; -	} -  	p->rt_priority = prio;  	p->normal_prio = normal_prio(p);  	/* we are holding p->pi_lock already */  	p->prio = rt_mutex_getprio(p); +	if (rt_prio(p->prio)) +		p->sched_class = &rt_sched_class; +	else +		p->sched_class = &fair_sched_class;  	set_load_weight(p);  } @@ -6094,23 +4768,31 @@ static bool check_same_owner(struct task_struct *p)  }  static 
int __sched_setscheduler(struct task_struct *p, int policy, -				struct sched_param *param, bool user) +				const struct sched_param *param, bool user)  {  	int retval, oldprio, oldpolicy = -1, on_rq, running;  	unsigned long flags; -	const struct sched_class *prev_class = p->sched_class; +	const struct sched_class *prev_class;  	struct rq *rq; +	int reset_on_fork;  	/* may grab non-irq protected spin_locks */  	BUG_ON(in_interrupt());  recheck:  	/* double check policy once rq lock held */ -	if (policy < 0) +	if (policy < 0) { +		reset_on_fork = p->sched_reset_on_fork;  		policy = oldpolicy = p->policy; -	else if (policy != SCHED_FIFO && policy != SCHED_RR && -			policy != SCHED_NORMAL && policy != SCHED_BATCH && -			policy != SCHED_IDLE) -		return -EINVAL; +	} else { +		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); +		policy &= ~SCHED_RESET_ON_FORK; + +		if (policy != SCHED_FIFO && policy != SCHED_RR && +				policy != SCHED_NORMAL && policy != SCHED_BATCH && +				policy != SCHED_IDLE) +			return -EINVAL; +	} +  	/*  	 * Valid priorities for SCHED_FIFO and SCHED_RR are  	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, @@ -6128,12 +4810,8 @@ recheck:  	 */  	if (user && !capable(CAP_SYS_NICE)) {  		if (rt_policy(policy)) { -			unsigned long rlim_rtprio; - -			if (!lock_task_sighand(p, &flags)) -				return -ESRCH; -			rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; -			unlock_task_sighand(p, &flags); +			unsigned long rlim_rtprio = +					task_rlimit(p, RLIMIT_RTPRIO);  			/* can't set/change the rt policy */  			if (policy != p->policy && !rlim_rtprio) @@ -6154,20 +4832,14 @@ recheck:  		/* can't change other user's priorities */  		if (!check_same_owner(p))  			return -EPERM; -	} -	if (user) { -#ifdef CONFIG_RT_GROUP_SCHED -		/* -		 * Do not allow realtime tasks into groups that have no runtime -		 * assigned. 
-		 */ -		if (rt_bandwidth_enabled() && rt_policy(policy) && -				task_group(p)->rt_bandwidth.rt_runtime == 0) +		/* Normal users shall not reset the sched_reset_on_fork flag */ +		if (p->sched_reset_on_fork && !reset_on_fork)  			return -EPERM; -#endif +	} -		retval = security_task_setscheduler(p, policy, param); +	if (user) { +		retval = security_task_setscheduler(p);  		if (retval)  			return retval;  	} @@ -6176,20 +4848,44 @@ recheck:  	 * make sure no PI-waiters arrive (or leave) while we are  	 * changing the priority of the task:  	 */ -	spin_lock_irqsave(&p->pi_lock, flags); +	raw_spin_lock_irqsave(&p->pi_lock, flags);  	/*  	 * To be able to change p->policy safely, the apropriate  	 * runqueue lock must be held.  	 */  	rq = __task_rq_lock(p); + +	/* +	 * Changing the policy of the stop threads its a very bad idea +	 */ +	if (p == rq->stop) { +		__task_rq_unlock(rq); +		raw_spin_unlock_irqrestore(&p->pi_lock, flags); +		return -EINVAL; +	} + +#ifdef CONFIG_RT_GROUP_SCHED +	if (user) { +		/* +		 * Do not allow realtime tasks into groups that have no runtime +		 * assigned. 
+		 */ +		if (rt_bandwidth_enabled() && rt_policy(policy) && +				task_group(p)->rt_bandwidth.rt_runtime == 0) { +			__task_rq_unlock(rq); +			raw_spin_unlock_irqrestore(&p->pi_lock, flags); +			return -EPERM; +		} +	} +#endif +  	/* recheck policy now with rq lock held */  	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {  		policy = oldpolicy = -1;  		__task_rq_unlock(rq); -		spin_unlock_irqrestore(&p->pi_lock, flags); +		raw_spin_unlock_irqrestore(&p->pi_lock, flags);  		goto recheck;  	} -	update_rq_clock(rq);  	on_rq = p->se.on_rq;  	running = task_current(rq, p);  	if (on_rq) @@ -6197,7 +4893,10 @@ recheck:  	if (running)  		p->sched_class->put_prev_task(rq, p); +	p->sched_reset_on_fork = reset_on_fork; +  	oldprio = p->prio; +	prev_class = p->sched_class;  	__setscheduler(rq, p, policy, param->sched_priority);  	if (running) @@ -6208,7 +4907,7 @@ recheck:  		check_class_changed(rq, p, prev_class, oldprio, running);  	}  	__task_rq_unlock(rq); -	spin_unlock_irqrestore(&p->pi_lock, flags); +	raw_spin_unlock_irqrestore(&p->pi_lock, flags);  	rt_mutex_adjust_pi(p); @@ -6224,7 +4923,7 @@ recheck:   * NOTE that the task may be already dead.   */  int sched_setscheduler(struct task_struct *p, int policy, -		       struct sched_param *param) +		       const struct sched_param *param)  {  	return __sched_setscheduler(p, policy, param, true);  } @@ -6242,7 +4941,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);   * but our caller might not have that capability.   
*/  int sched_setscheduler_nocheck(struct task_struct *p, int policy, -			       struct sched_param *param) +			       const struct sched_param *param)  {  	return __sched_setscheduler(p, policy, param, false);  } @@ -6308,19 +5007,20 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)  		return -EINVAL;  	retval = -ESRCH; -	read_lock(&tasklist_lock); +	rcu_read_lock();  	p = find_process_by_pid(pid);  	if (p) {  		retval = security_task_getscheduler(p);  		if (!retval) -			retval = p->policy; +			retval = p->policy +				| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);  	} -	read_unlock(&tasklist_lock); +	rcu_read_unlock();  	return retval;  }  /** - * sys_sched_getscheduler - get the RT priority of a thread + * sys_sched_getparam - get the RT priority of a thread   * @pid: the pid in question.   * @param: structure containing the RT priority.   */ @@ -6333,7 +5033,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)  	if (!param || pid < 0)  		return -EINVAL; -	read_lock(&tasklist_lock); +	rcu_read_lock();  	p = find_process_by_pid(pid);  	retval = -ESRCH;  	if (!p) @@ -6344,7 +5044,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)  		goto out_unlock;  	lp.sched_priority = p->rt_priority; -	read_unlock(&tasklist_lock); +	rcu_read_unlock();  	/*  	 * This one might sleep, we cannot do it with a spinlock held ... 
@@ -6354,7 +5054,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)  	return retval;  out_unlock: -	read_unlock(&tasklist_lock); +	rcu_read_unlock();  	return retval;  } @@ -6365,22 +5065,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)  	int retval;  	get_online_cpus(); -	read_lock(&tasklist_lock); +	rcu_read_lock();  	p = find_process_by_pid(pid);  	if (!p) { -		read_unlock(&tasklist_lock); +		rcu_read_unlock();  		put_online_cpus();  		return -ESRCH;  	} -	/* -	 * It is not safe to call set_cpus_allowed with the -	 * tasklist_lock held. We will bump the task_struct's -	 * usage count and then drop tasklist_lock. -	 */ +	/* Prevent p going away */  	get_task_struct(p); -	read_unlock(&tasklist_lock); +	rcu_read_unlock();  	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {  		retval = -ENOMEM; @@ -6394,13 +5090,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)  	if (!check_same_owner(p) && !capable(CAP_SYS_NICE))  		goto out_unlock; -	retval = security_task_setscheduler(p, 0, NULL); +	retval = security_task_setscheduler(p);  	if (retval)  		goto out_unlock;  	cpuset_cpus_allowed(p, cpus_allowed);  	cpumask_and(new_mask, in_mask, cpus_allowed); - again: +again:  	retval = set_cpus_allowed_ptr(p, new_mask);  	if (!retval) { @@ -6461,10 +5157,12 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,  long sched_getaffinity(pid_t pid, struct cpumask *mask)  {  	struct task_struct *p; +	unsigned long flags; +	struct rq *rq;  	int retval;  	get_online_cpus(); -	read_lock(&tasklist_lock); +	rcu_read_lock();  	retval = -ESRCH;  	p = find_process_by_pid(pid); @@ -6475,10 +5173,12 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)  	if (retval)  		goto out_unlock; +	rq = task_rq_lock(p, &flags);  	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); +	task_rq_unlock(rq, &flags);  out_unlock: -	read_unlock(&tasklist_lock); +	rcu_read_unlock();  	put_online_cpus();  	return 
retval; @@ -6496,7 +5196,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,  	int ret;  	cpumask_var_t mask; -	if (len < cpumask_size()) +	if ((len * BITS_PER_BYTE) < nr_cpu_ids) +		return -EINVAL; +	if (len & (sizeof(unsigned long)-1))  		return -EINVAL;  	if (!alloc_cpumask_var(&mask, GFP_KERNEL)) @@ -6504,10 +5206,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,  	ret = sched_getaffinity(pid, mask);  	if (ret == 0) { -		if (copy_to_user(user_mask_ptr, mask, cpumask_size())) +		size_t retlen = min_t(size_t, len, cpumask_size()); + +		if (copy_to_user(user_mask_ptr, mask, retlen))  			ret = -EFAULT;  		else -			ret = cpumask_size(); +			ret = retlen;  	}  	free_cpumask_var(mask); @@ -6533,7 +5237,7 @@ SYSCALL_DEFINE0(sched_yield)  	 */  	__release(rq->lock);  	spin_release(&rq->lock.dep_map, 1, _THIS_IP_); -	_raw_spin_unlock(&rq->lock); +	do_raw_spin_unlock(&rq->lock);  	preempt_enable_no_resched();  	schedule(); @@ -6541,27 +5245,21 @@ SYSCALL_DEFINE0(sched_yield)  	return 0;  } +static inline int should_resched(void) +{ +	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); +} +  static void __cond_resched(void)  { -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP -	__might_sleep(__FILE__, __LINE__); -#endif -	/* -	 * The BKS might be reacquired before we have dropped -	 * PREEMPT_ACTIVE, which could trigger a second -	 * cond_resched() call. 
-	 */ -	do { -		add_preempt_count(PREEMPT_ACTIVE); -		schedule(); -		sub_preempt_count(PREEMPT_ACTIVE); -	} while (need_resched()); +	add_preempt_count(PREEMPT_ACTIVE); +	schedule(); +	sub_preempt_count(PREEMPT_ACTIVE);  }  int __sched _cond_resched(void)  { -	if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && -					system_state == SYSTEM_RUNNING) { +	if (should_resched()) {  		__cond_resched();  		return 1;  	} @@ -6570,21 +5268,23 @@ int __sched _cond_resched(void)  EXPORT_SYMBOL(_cond_resched);  /* - * cond_resched_lock() - if a reschedule is pending, drop the given lock, + * __cond_resched_lock() - if a reschedule is pending, drop the given lock,   * call schedule, and on return reacquire the lock.   *   * This works OK both with and without CONFIG_PREEMPT. We do strange low-level   * operations here to prevent schedule() from being called twice (once via   * spin_unlock(), once by hand).   */ -int cond_resched_lock(spinlock_t *lock) +int __cond_resched_lock(spinlock_t *lock)  { -	int resched = need_resched() && system_state == SYSTEM_RUNNING; +	int resched = should_resched();  	int ret = 0; +	lockdep_assert_held(lock); +  	if (spin_needbreak(lock) || resched) {  		spin_unlock(lock); -		if (resched && need_resched()) +		if (resched)  			__cond_resched();  		else  			cpu_relax(); @@ -6593,13 +5293,13 @@ int cond_resched_lock(spinlock_t *lock)  	}  	return ret;  } -EXPORT_SYMBOL(cond_resched_lock); +EXPORT_SYMBOL(__cond_resched_lock); -int __sched cond_resched_softirq(void) +int __sched __cond_resched_softirq(void)  {  	BUG_ON(!in_softirq()); -	if (need_resched() && system_state == SYSTEM_RUNNING) { +	if (should_resched()) {  		local_bh_enable();  		__cond_resched();  		local_bh_disable(); @@ -6607,7 +5307,7 @@ int __sched cond_resched_softirq(void)  	}  	return 0;  } -EXPORT_SYMBOL(cond_resched_softirq); +EXPORT_SYMBOL(__cond_resched_softirq);  /**   * yield - yield the current processor to other threads. 
@@ -6625,17 +5325,16 @@ EXPORT_SYMBOL(yield);  /*   * This task is about to go to sleep on IO. Increment rq->nr_iowait so   * that process accounting knows that this is a task in IO wait state. - * - * But don't do that if it is a deliberate, throttling IO wait (this task - * has set its backing_dev_info: the queue against which it should throttle)   */  void __sched io_schedule(void)  { -	struct rq *rq = &__raw_get_cpu_var(runqueues); +	struct rq *rq = raw_rq();  	delayacct_blkio_start();  	atomic_inc(&rq->nr_iowait); +	current->in_iowait = 1;  	schedule(); +	current->in_iowait = 0;  	atomic_dec(&rq->nr_iowait);  	delayacct_blkio_end();  } @@ -6643,12 +5342,14 @@ EXPORT_SYMBOL(io_schedule);  long __sched io_schedule_timeout(long timeout)  { -	struct rq *rq = &__raw_get_cpu_var(runqueues); +	struct rq *rq = raw_rq();  	long ret;  	delayacct_blkio_start();  	atomic_inc(&rq->nr_iowait); +	current->in_iowait = 1;  	ret = schedule_timeout(timeout); +	current->in_iowait = 0;  	atomic_dec(&rq->nr_iowait);  	delayacct_blkio_end();  	return ret; @@ -6716,6 +5417,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,  {  	struct task_struct *p;  	unsigned int time_slice; +	unsigned long flags; +	struct rq *rq;  	int retval;  	struct timespec t; @@ -6723,7 +5426,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,  		return -EINVAL;  	retval = -ESRCH; -	read_lock(&tasklist_lock); +	rcu_read_lock();  	p = find_process_by_pid(pid);  	if (!p)  		goto out_unlock; @@ -6732,30 +5435,17 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,  	if (retval)  		goto out_unlock; -	/* -	 * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER -	 * tasks that are on an otherwise idle runqueue: -	 */ -	time_slice = 0; -	if (p->policy == SCHED_RR) { -		time_slice = DEF_TIMESLICE; -	} else if (p->policy != SCHED_FIFO) { -		struct sched_entity *se = &p->se; -		unsigned long flags; -		struct rq *rq; +	rq = task_rq_lock(p, &flags); +	time_slice = p->sched_class->get_rr_interval(rq, p); +	
task_rq_unlock(rq, &flags); -		rq = task_rq_lock(p, &flags); -		if (rq->cfs.load.weight) -			time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); -		task_rq_unlock(rq, &flags); -	} -	read_unlock(&tasklist_lock); +	rcu_read_unlock();  	jiffies_to_timespec(time_slice, &t);  	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;  	return retval;  out_unlock: -	read_unlock(&tasklist_lock); +	rcu_read_unlock();  	return retval;  } @@ -6767,7 +5457,7 @@ void sched_show_task(struct task_struct *p)  	unsigned state;  	state = p->state ? __ffs(p->state) + 1 : 0; -	printk(KERN_INFO "%-13.13s %c", p->comm, +	printk(KERN_INFO "%-15.15s %c", p->comm,  		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');  #if BITS_PER_LONG == 32  	if (state == TASK_RUNNING) @@ -6821,7 +5511,7 @@ void show_state_filter(unsigned long state_filter)  	/*  	 * Only show locks if all tasks are dumped:  	 */ -	if (state_filter == -1) +	if (!state_filter)  		debug_show_all_locks();  } @@ -6843,20 +5533,32 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)  	struct rq *rq = cpu_rq(cpu);  	unsigned long flags; -	spin_lock_irqsave(&rq->lock, flags); +	raw_spin_lock_irqsave(&rq->lock, flags);  	__sched_fork(idle); +	idle->state = TASK_RUNNING;  	idle->se.exec_start = sched_clock(); -	idle->prio = idle->normal_prio = MAX_PRIO;  	cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); +	/* +	 * We're having a chicken and egg problem, even though we are +	 * holding rq->lock, the cpu isn't yet set to this cpu so the +	 * lockdep check in task_group() will fail. +	 * +	 * Similar case to sched_fork(). / Alternatively we could +	 * use task_rq_lock() here and obtain the other rq->lock. 
+	 * +	 * Silence PROVE_RCU +	 */ +	rcu_read_lock();  	__set_task_cpu(idle, cpu); +	rcu_read_unlock();  	rq->curr = rq->idle = idle;  #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)  	idle->oncpu = 1;  #endif -	spin_unlock_irqrestore(&rq->lock, flags); +	raw_spin_unlock_irqrestore(&rq->lock, flags);  	/* Set the preempt count _outside_ the spinlocks! */  #if defined(CONFIG_PREEMPT) @@ -6889,39 +5591,57 @@ cpumask_var_t nohz_cpu_mask;   *   * This idea comes from the SD scheduler of Con Kolivas:   */ -static inline void sched_init_granularity(void) +static int get_update_sysctl_factor(void)  { -	unsigned int factor = 1 + ilog2(num_online_cpus()); -	const unsigned long limit = 200000000; +	unsigned int cpus = min_t(int, num_online_cpus(), 8); +	unsigned int factor; -	sysctl_sched_min_granularity *= factor; -	if (sysctl_sched_min_granularity > limit) -		sysctl_sched_min_granularity = limit; +	switch (sysctl_sched_tunable_scaling) { +	case SCHED_TUNABLESCALING_NONE: +		factor = 1; +		break; +	case SCHED_TUNABLESCALING_LINEAR: +		factor = cpus; +		break; +	case SCHED_TUNABLESCALING_LOG: +	default: +		factor = 1 + ilog2(cpus); +		break; +	} -	sysctl_sched_latency *= factor; -	if (sysctl_sched_latency > limit) -		sysctl_sched_latency = limit; +	return factor; +} -	sysctl_sched_wakeup_granularity *= factor; +static void update_sysctl(void) +{ +	unsigned int factor = get_update_sysctl_factor(); + +#define SET_SYSCTL(name) \ +	(sysctl_##name = (factor) * normalized_sysctl_##name) +	SET_SYSCTL(sched_min_granularity); +	SET_SYSCTL(sched_latency); +	SET_SYSCTL(sched_wakeup_granularity); +#undef SET_SYSCTL +} -	sysctl_sched_shares_ratelimit *= factor; +static inline void sched_init_granularity(void) +{ +	update_sysctl();  }  #ifdef CONFIG_SMP  /*   * This is how migration works:   * - * 1) we queue a struct migration_req structure in the source CPU's - *    runqueue and wake up that CPU's migration thread. - * 2) we down() the locked semaphore => thread blocks. 
- * 3) migration thread wakes up (implicitly it forces the migrated - *    thread off the CPU) - * 4) it gets the migration request and checks whether the migrated - *    task is still in the wrong runqueue. - * 5) if it's in the wrong runqueue then the migration thread removes + * 1) we invoke migration_cpu_stop() on the target CPU using + *    stop_one_cpu(). + * 2) stopper starts to run (implicitly forcing the migrated thread + *    off the CPU) + * 3) it checks whether the migrated task is still in the wrong runqueue. + * 4) if it's in the wrong runqueue then the migration thread removes   *    it and puts it into the right queue. - * 6) migration thread up()s the semaphore. - * 7) we wake up and the migration is done. + * 5) stopper completes and stop_one_cpu() returns and the migration + *    is done.   */  /* @@ -6935,13 +5655,25 @@ static inline void sched_init_granularity(void)   */  int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)  { -	struct migration_req req;  	unsigned long flags;  	struct rq *rq; +	unsigned int dest_cpu;  	int ret = 0; +	/* +	 * Serialize against TASK_WAKING so that ttwu() and wunt() can +	 * drop the rq->lock and still rely on ->cpus_allowed. +	 */ +again: +	while (task_is_waking(p)) +		cpu_relax();  	rq = task_rq_lock(p, &flags); -	if (!cpumask_intersects(new_mask, cpu_online_mask)) { +	if (task_is_waking(p)) { +		task_rq_unlock(rq, &flags); +		goto again; +	} + +	if (!cpumask_intersects(new_mask, cpu_active_mask)) {  		ret = -EINVAL;  		goto out;  	} @@ -6963,11 +5695,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)  	if (cpumask_test_cpu(task_cpu(p), new_mask))  		goto out; -	if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { +	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); +	if (migrate_task(p, rq)) { +		struct migration_arg arg = { p, dest_cpu };  		/* Need help from migration thread: drop lock and wait. 
*/  		task_rq_unlock(rq, &flags); -		wake_up_process(rq->migration_thread); -		wait_for_completion(&req.done); +		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);  		tlb_migrate_finish(p->mm);  		return 0;  	} @@ -6992,7 +5725,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);  static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)  {  	struct rq *rq_dest, *rq_src; -	int ret = 0, on_rq; +	int ret = 0;  	if (unlikely(!cpu_active(dest_cpu)))  		return ret; @@ -7008,12 +5741,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)  	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))  		goto fail; -	on_rq = p->se.on_rq; -	if (on_rq) +	/* +	 * If we're not on a rq, the next wake-up will ensure we're +	 * placed properly. +	 */ +	if (p->se.on_rq) {  		deactivate_task(rq_src, p, 0); - -	set_task_cpu(p, dest_cpu); -	if (on_rq) { +		set_task_cpu(p, dest_cpu);  		activate_task(rq_dest, p, 0);  		check_preempt_curr(rq_dest, p, 0);  	} @@ -7025,109 +5759,39 @@ fail:  }  /* - * migration_thread - this is a highprio system thread that performs - * thread migration by bumping thread off CPU then 'pushing' onto - * another runqueue. + * migration_cpu_stop - this will be executed by a highprio stopper thread + * and performs thread migration by bumping thread off CPU then + * 'pushing' onto another runqueue.   
*/ -static int migration_thread(void *data) +static int migration_cpu_stop(void *data)  { -	int cpu = (long)data; -	struct rq *rq; - -	rq = cpu_rq(cpu); -	BUG_ON(rq->migration_thread != current); - -	set_current_state(TASK_INTERRUPTIBLE); -	while (!kthread_should_stop()) { -		struct migration_req *req; -		struct list_head *head; - -		spin_lock_irq(&rq->lock); - -		if (cpu_is_offline(cpu)) { -			spin_unlock_irq(&rq->lock); -			break; -		} - -		if (rq->active_balance) { -			active_load_balance(rq, cpu); -			rq->active_balance = 0; -		} - -		head = &rq->migration_queue; - -		if (list_empty(head)) { -			spin_unlock_irq(&rq->lock); -			schedule(); -			set_current_state(TASK_INTERRUPTIBLE); -			continue; -		} -		req = list_entry(head->next, struct migration_req, list); -		list_del_init(head->next); - -		spin_unlock(&rq->lock); -		__migrate_task(req->task, cpu, req->dest_cpu); -		local_irq_enable(); - -		complete(&req->done); -	} -	__set_current_state(TASK_RUNNING); +	struct migration_arg *arg = data; +	/* +	 * The original target cpu might have gone down and we might +	 * be on another cpu but it doesn't matter. +	 */ +	local_irq_disable(); +	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); +	local_irq_enable();  	return 0;  }  #ifdef CONFIG_HOTPLUG_CPU -static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) -{ -	int ret; - -	local_irq_disable(); -	ret = __migrate_task(p, src_cpu, dest_cpu); -	local_irq_enable(); -	return ret; -} -  /* - * Figure out where task on dead CPU should go, use force if necessary. + * Ensures that the idle task is using init_mm right before its cpu goes + * offline.   */ -static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) +void idle_task_exit(void)  { -	int dest_cpu; -	const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); - -again: -	/* Look for allowed, online CPU in same node. 
*/ -	for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) -		if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) -			goto move; - -	/* Any allowed, online CPU? */ -	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); -	if (dest_cpu < nr_cpu_ids) -		goto move; - -	/* No more Mr. Nice Guy. */ -	if (dest_cpu >= nr_cpu_ids) { -		cpuset_cpus_allowed_locked(p, &p->cpus_allowed); -		dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); +	struct mm_struct *mm = current->active_mm; -		/* -		 * Don't tell them about moving exiting tasks or -		 * kernel threads (both mm NULL), since they never -		 * leave kernel. -		 */ -		if (p->mm && printk_ratelimit()) { -			printk(KERN_INFO "process %d (%s) no " -			       "longer affine to cpu%d\n", -			       task_pid_nr(p), p->comm, dead_cpu); -		} -	} +	BUG_ON(cpu_online(smp_processor_id())); -move: -	/* It can have affinity changed while we were choosing. */ -	if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) -		goto again; +	if (mm != &init_mm) +		switch_mm(mm, &init_mm, current); +	mmdrop(mm);  }  /* @@ -7139,130 +5803,70 @@ move:   */  static void migrate_nr_uninterruptible(struct rq *rq_src)  { -	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); -	unsigned long flags; +	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); -	local_irq_save(flags); -	double_rq_lock(rq_src, rq_dest);  	rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;  	rq_src->nr_uninterruptible = 0; -	double_rq_unlock(rq_src, rq_dest); -	local_irq_restore(flags); -} - -/* Run through task list and migrate tasks from the dead cpu. */ -static void migrate_live_tasks(int src_cpu) -{ -	struct task_struct *p, *t; - -	read_lock(&tasklist_lock); - -	do_each_thread(t, p) { -		if (p == current) -			continue; - -		if (task_cpu(p) == src_cpu) -			move_task_off_dead_cpu(src_cpu, p); -	} while_each_thread(t, p); - -	read_unlock(&tasklist_lock);  }  /* - * Schedules idle task to be the next runnable task on current CPU. 
- * It does so by boosting its priority to highest possible. - * Used by CPU offline code. + * remove the tasks which were accounted by rq from calc_load_tasks.   */ -void sched_idle_next(void) +static void calc_global_load_remove(struct rq *rq)  { -	int this_cpu = smp_processor_id(); -	struct rq *rq = cpu_rq(this_cpu); -	struct task_struct *p = rq->idle; -	unsigned long flags; - -	/* cpu has to be offline */ -	BUG_ON(cpu_online(this_cpu)); - -	/* -	 * Strictly not necessary since rest of the CPUs are stopped by now -	 * and interrupts disabled on the current cpu. -	 */ -	spin_lock_irqsave(&rq->lock, flags); - -	__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - -	update_rq_clock(rq); -	activate_task(rq, p, 0); - -	spin_unlock_irqrestore(&rq->lock, flags); +	atomic_long_sub(rq->calc_load_active, &calc_load_tasks); +	rq->calc_load_active = 0;  }  /* - * Ensures that the idle task is using init_mm right before its cpu goes - * offline. + * Migrate all tasks from the rq, sleeping tasks will be migrated by + * try_to_wake_up()->select_task_rq(). + * + * Called with rq->lock held even though we'er in stop_machine() and + * there's no concurrency possible, we hold the required locks anyway + * because of lock validation efforts.   */ -void idle_task_exit(void) -{ -	struct mm_struct *mm = current->active_mm; - -	BUG_ON(cpu_online(smp_processor_id())); - -	if (mm != &init_mm) -		switch_mm(mm, &init_mm, current); -	mmdrop(mm); -} - -/* called under rq->lock with disabled interrupts */ -static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) +static void migrate_tasks(unsigned int dead_cpu)  {  	struct rq *rq = cpu_rq(dead_cpu); - -	/* Must be exiting, otherwise would be on tasklist. */ -	BUG_ON(!p->exit_state); - -	/* Cannot have done final schedule yet: would have vanished. 
*/ -	BUG_ON(p->state == TASK_DEAD); - -	get_task_struct(p); +	struct task_struct *next, *stop = rq->stop; +	int dest_cpu;  	/* -	 * Drop lock around migration; if someone else moves it, -	 * that's OK. No task can be added to this CPU, so iteration is -	 * fine. +	 * Fudge the rq selection such that the below task selection loop +	 * doesn't get stuck on the currently eligible stop task. +	 * +	 * We're currently inside stop_machine() and the rq is either stuck +	 * in the stop_machine_cpu_stop() loop, or we're executing this code, +	 * either way we should never end up calling schedule() until we're +	 * done here.  	 */ -	spin_unlock_irq(&rq->lock); -	move_task_off_dead_cpu(dead_cpu, p); -	spin_lock_irq(&rq->lock); - -	put_task_struct(p); -} - -/* release_task() removes task from tasklist, so we won't find dead tasks. */ -static void migrate_dead_tasks(unsigned int dead_cpu) -{ -	struct rq *rq = cpu_rq(dead_cpu); -	struct task_struct *next; +	rq->stop = NULL;  	for ( ; ; ) { -		if (!rq->nr_running) +		/* +		 * There's this thread running, bail when that's the only +		 * remaining thread. +		 */ +		if (rq->nr_running == 1)  			break; -		update_rq_clock(rq); +  		next = pick_next_task(rq); -		if (!next) -			break; +		BUG_ON(!next);  		next->sched_class->put_prev_task(rq, next); -		migrate_dead(dead_cpu, next); +		/* Find suitable destination for @next, with force if needed. */ +		dest_cpu = select_fallback_rq(dead_cpu, next); +		raw_spin_unlock(&rq->lock); + +		__migrate_task(next, dead_cpu, dest_cpu); + +		raw_spin_lock(&rq->lock);  	} -} -/* - * remove the tasks which were accounted by rq from calc_load_tasks. 
- */ -static void calc_global_load_remove(struct rq *rq) -{ -	atomic_long_sub(rq->calc_load_active, &calc_load_tasks); +	rq->stop = stop;  } +  #endif /* CONFIG_HOTPLUG_CPU */  #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -7272,17 +5876,16 @@ static struct ctl_table sd_ctl_dir[] = {  		.procname	= "sched_domain",  		.mode		= 0555,  	}, -	{0, }, +	{}  };  static struct ctl_table sd_ctl_root[] = {  	{ -		.ctl_name	= CTL_KERN,  		.procname	= "kernel",  		.mode		= 0555,  		.child		= sd_ctl_dir,  	}, -	{0, }, +	{}  };  static struct ctl_table *sd_alloc_ctl_entry(int n) @@ -7392,7 +5995,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)  static struct ctl_table_header *sd_sysctl_header;  static void register_sched_domain_sysctl(void)  { -	int i, cpu_num = num_online_cpus(); +	int i, cpu_num = num_possible_cpus();  	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);  	char buf[32]; @@ -7402,7 +6005,7 @@ static void register_sched_domain_sysctl(void)  	if (entry == NULL)  		return; -	for_each_online_cpu(i) { +	for_each_possible_cpu(i) {  		snprintf(buf, 32, "cpu%d", i);  		entry->procname = kstrdup(buf, GFP_KERNEL);  		entry->mode = 0555; @@ -7469,108 +6072,41 @@ static void set_rq_offline(struct rq *rq)  static int __cpuinit  migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)  { -	struct task_struct *p;  	int cpu = (long)hcpu;  	unsigned long flags; -	struct rq *rq; +	struct rq *rq = cpu_rq(cpu); -	switch (action) { +	switch (action & ~CPU_TASKS_FROZEN) {  	case CPU_UP_PREPARE: -	case CPU_UP_PREPARE_FROZEN: -		p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); -		if (IS_ERR(p)) -			return NOTIFY_BAD; -		kthread_bind(p, cpu); -		/* Must be high prio: stop_machine expects to yield to it. 
*/ -		rq = task_rq_lock(p, &flags); -		__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); -		task_rq_unlock(rq, &flags); -		get_task_struct(p); -		cpu_rq(cpu)->migration_thread = p; +		rq->calc_load_update = calc_load_update;  		break;  	case CPU_ONLINE: -	case CPU_ONLINE_FROZEN: -		/* Strictly unnecessary, as first user will wake it. */ -		wake_up_process(cpu_rq(cpu)->migration_thread); -  		/* Update our root-domain */ -		rq = cpu_rq(cpu); -		spin_lock_irqsave(&rq->lock, flags); -		rq->calc_load_update = calc_load_update; -		rq->calc_load_active = 0; +		raw_spin_lock_irqsave(&rq->lock, flags);  		if (rq->rd) {  			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));  			set_rq_online(rq);  		} -		spin_unlock_irqrestore(&rq->lock, flags); +		raw_spin_unlock_irqrestore(&rq->lock, flags);  		break;  #ifdef CONFIG_HOTPLUG_CPU -	case CPU_UP_CANCELED: -	case CPU_UP_CANCELED_FROZEN: -		if (!cpu_rq(cpu)->migration_thread) -			break; -		/* Unbind it from offline cpu so it can run. Fall thru. */ -		kthread_bind(cpu_rq(cpu)->migration_thread, -			     cpumask_any(cpu_online_mask)); -		kthread_stop(cpu_rq(cpu)->migration_thread); -		put_task_struct(cpu_rq(cpu)->migration_thread); -		cpu_rq(cpu)->migration_thread = NULL; -		break; - -	case CPU_DEAD: -	case CPU_DEAD_FROZEN: -		cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ -		migrate_live_tasks(cpu); -		rq = cpu_rq(cpu); -		kthread_stop(rq->migration_thread); -		put_task_struct(rq->migration_thread); -		rq->migration_thread = NULL; -		/* Idle task back to normal (off runqueue, low prio) */ -		spin_lock_irq(&rq->lock); -		update_rq_clock(rq); -		deactivate_task(rq, rq->idle, 0); -		rq->idle->static_prio = MAX_PRIO; -		__setscheduler(rq, rq->idle, SCHED_NORMAL, 0); -		rq->idle->sched_class = &idle_sched_class; -		migrate_dead_tasks(cpu); -		spin_unlock_irq(&rq->lock); -		cpuset_unlock(); -		migrate_nr_uninterruptible(rq); -		BUG_ON(rq->nr_running != 0); -		calc_global_load_remove(rq); -		/* -		 * No need to migrate the 
tasks: it was best-effort if -		 * they didn't take sched_hotcpu_mutex. Just wake up -		 * the requestors. -		 */ -		spin_lock_irq(&rq->lock); -		while (!list_empty(&rq->migration_queue)) { -			struct migration_req *req; - -			req = list_entry(rq->migration_queue.next, -					 struct migration_req, list); -			list_del_init(&req->list); -			spin_unlock_irq(&rq->lock); -			complete(&req->done); -			spin_lock_irq(&rq->lock); -		} -		spin_unlock_irq(&rq->lock); -		break; -  	case CPU_DYING: -	case CPU_DYING_FROZEN:  		/* Update our root-domain */ -		rq = cpu_rq(cpu); -		spin_lock_irqsave(&rq->lock, flags); +		raw_spin_lock_irqsave(&rq->lock, flags);  		if (rq->rd) {  			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));  			set_rq_offline(rq);  		} -		spin_unlock_irqrestore(&rq->lock, flags); +		migrate_tasks(cpu); +		BUG_ON(rq->nr_running != 1); /* the migration thread */ +		raw_spin_unlock_irqrestore(&rq->lock, flags); + +		migrate_nr_uninterruptible(rq); +		calc_global_load_remove(rq);  		break;  #endif  	} @@ -7580,25 +6116,54 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)  /*   * Register at high priority so that task migration (migrate_all_tasks)   * happens before everything else.  This has to be lower priority than - * the notifier in the perf_counter subsystem, though. + * the notifier in the perf_event subsystem, though.   
*/  static struct notifier_block __cpuinitdata migration_notifier = {  	.notifier_call = migration_call, -	.priority = 10 +	.priority = CPU_PRI_MIGRATION,  }; +static int __cpuinit sched_cpu_active(struct notifier_block *nfb, +				      unsigned long action, void *hcpu) +{ +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_ONLINE: +	case CPU_DOWN_FAILED: +		set_cpu_active((long)hcpu, true); +		return NOTIFY_OK; +	default: +		return NOTIFY_DONE; +	} +} + +static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, +					unsigned long action, void *hcpu) +{ +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_DOWN_PREPARE: +		set_cpu_active((long)hcpu, false); +		return NOTIFY_OK; +	default: +		return NOTIFY_DONE; +	} +} +  static int __init migration_init(void)  {  	void *cpu = (void *)(long)smp_processor_id();  	int err; -	/* Start one for the boot CPU: */ +	/* Initialize migration for the boot CPU */  	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);  	BUG_ON(err == NOTIFY_BAD);  	migration_call(&migration_notifier, CPU_ONLINE, cpu);  	register_cpu_notifier(&migration_notifier); -	return err; +	/* Register cpu active notifiers */ +	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); +	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); + +	return 0;  }  early_initcall(migration_init);  #endif @@ -7607,6 +6172,16 @@ early_initcall(migration_init);  #ifdef CONFIG_SCHED_DEBUG +static __read_mostly int sched_domain_debug_enabled; + +static int __init sched_domain_debug_setup(char *str) +{ +	sched_domain_debug_enabled = 1; + +	return 0; +} +early_param("sched_debug", sched_domain_debug_setup); +  static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,  				  struct cpumask *groupmask)  { @@ -7645,7 +6220,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,  			break;  		} -		if (!group->__cpu_power) { +		if (!group->cpu_power) {  			printk(KERN_CONT "\n");  			printk(KERN_ERR "ERROR: 
domain->cpu_power not "  					"set\n"); @@ -7669,9 +6244,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,  		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));  		printk(KERN_CONT " %s", str); -		if (group->__cpu_power != SCHED_LOAD_SCALE) { -			printk(KERN_CONT " (__cpu_power = %d)", -				group->__cpu_power); +		if (group->cpu_power != SCHED_LOAD_SCALE) { +			printk(KERN_CONT " (cpu_power = %d)", +				group->cpu_power);  		}  		group = group->next; @@ -7693,6 +6268,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)  	cpumask_var_t groupmask;  	int level = 0; +	if (!sched_domain_debug_enabled) +		return; +  	if (!sd) {  		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);  		return; @@ -7736,9 +6314,7 @@ static int sd_degenerate(struct sched_domain *sd)  	}  	/* Following flags don't use groups */ -	if (sd->flags & (SD_WAKE_IDLE | -			 SD_WAKE_AFFINE | -			 SD_WAKE_BALANCE)) +	if (sd->flags & (SD_WAKE_AFFINE))  		return 0;  	return 1; @@ -7755,10 +6331,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)  	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))  		return 0; -	/* Does parent contain flags not in child? 
*/ -	/* WAKE_BALANCE is a subset of WAKE_AFFINE */ -	if (cflags & SD_WAKE_AFFINE) -		pflags &= ~SD_WAKE_BALANCE;  	/* Flags needing groups don't count if only 1 group in parent */  	if (parent->groups == parent->groups->next) {  		pflags &= ~(SD_LOAD_BALANCE | @@ -7778,6 +6350,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)  static void free_rootdomain(struct root_domain *rd)  { +	synchronize_sched(); +  	cpupri_cleanup(&rd->cpupri);  	free_cpumask_var(rd->rto_mask); @@ -7791,7 +6365,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)  	struct root_domain *old_rd = NULL;  	unsigned long flags; -	spin_lock_irqsave(&rq->lock, flags); +	raw_spin_lock_irqsave(&rq->lock, flags);  	if (rq->rd) {  		old_rd = rq->rd; @@ -7814,32 +6388,27 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)  	rq->rd = rd;  	cpumask_set_cpu(rq->cpu, rd->span); -	if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) +	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))  		set_rq_online(rq); -	spin_unlock_irqrestore(&rq->lock, flags); +	raw_spin_unlock_irqrestore(&rq->lock, flags);  	if (old_rd)  		free_rootdomain(old_rd);  } -static int init_rootdomain(struct root_domain *rd, bool bootmem) +static int init_rootdomain(struct root_domain *rd)  { -	gfp_t gfp = GFP_KERNEL; -  	memset(rd, 0, sizeof(*rd)); -	if (bootmem) -		gfp = GFP_NOWAIT; - -	if (!alloc_cpumask_var(&rd->span, gfp)) +	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))  		goto out; -	if (!alloc_cpumask_var(&rd->online, gfp)) +	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))  		goto free_span; -	if (!alloc_cpumask_var(&rd->rto_mask, gfp)) +	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))  		goto free_online; -	if (cpupri_init(&rd->cpupri, bootmem) != 0) +	if (cpupri_init(&rd->cpupri) != 0)  		goto free_rto_mask;  	return 0; @@ -7855,7 +6424,7 @@ out:  static void init_defrootdomain(void)  { -	init_rootdomain(&def_root_domain, true); +	init_rootdomain(&def_root_domain);  	
atomic_set(&def_root_domain.refcount, 1);  } @@ -7868,7 +6437,7 @@ static struct root_domain *alloc_rootdomain(void)  	if (!rd)  		return NULL; -	if (init_rootdomain(rd, false) != 0) { +	if (init_rootdomain(rd) != 0) {  		kfree(rd);  		return NULL;  	} @@ -7886,6 +6455,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)  	struct rq *rq = cpu_rq(cpu);  	struct sched_domain *tmp; +	for (tmp = sd; tmp; tmp = tmp->parent) +		tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); +  	/* Remove the sched domains which do not contribute to scheduling. */  	for (tmp = sd; tmp; ) {  		struct sched_domain *parent = tmp->parent; @@ -7918,6 +6490,7 @@ static cpumask_var_t cpu_isolated_map;  /* Setup the mask of cpus configured for isolated domains */  static int __init isolated_cpu_setup(char *str)  { +	alloc_bootmem_cpumask_var(&cpu_isolated_map);  	cpulist_parse(str, cpu_isolated_map);  	return 1;  } @@ -7956,7 +6529,7 @@ init_sched_build_groups(const struct cpumask *span,  			continue;  		cpumask_clear(sched_group_cpus(sg)); -		sg->__cpu_power = 0; +		sg->cpu_power = 0;  		for_each_cpu(j, span) {  			if (group_fn(j, cpu_map, NULL, tmpmask) != group) @@ -8064,19 +6637,54 @@ struct static_sched_domain {  	DECLARE_BITMAP(span, CONFIG_NR_CPUS);  }; +struct s_data { +#ifdef CONFIG_NUMA +	int			sd_allnodes; +	cpumask_var_t		domainspan; +	cpumask_var_t		covered; +	cpumask_var_t		notcovered; +#endif +	cpumask_var_t		nodemask; +	cpumask_var_t		this_sibling_map; +	cpumask_var_t		this_core_map; +	cpumask_var_t		this_book_map; +	cpumask_var_t		send_covered; +	cpumask_var_t		tmpmask; +	struct sched_group	**sched_group_nodes; +	struct root_domain	*rd; +}; + +enum s_alloc { +	sa_sched_groups = 0, +	sa_rootdomain, +	sa_tmpmask, +	sa_send_covered, +	sa_this_book_map, +	sa_this_core_map, +	sa_this_sibling_map, +	sa_nodemask, +	sa_sched_group_nodes, +#ifdef CONFIG_NUMA +	sa_notcovered, +	sa_covered, +	sa_domainspan, +#endif +	sa_none, +}; +  /*   * SMT 
sched-domains:   */  #ifdef CONFIG_SCHED_SMT  static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); +static DEFINE_PER_CPU(struct static_sched_group, sched_groups);  static int  cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,  		 struct sched_group **sg, struct cpumask *unused)  {  	if (sg) -		*sg = &per_cpu(sched_group_cpus, cpu).sg; +		*sg = &per_cpu(sched_groups, cpu).sg;  	return cpu;  }  #endif /* CONFIG_SCHED_SMT */ @@ -8087,31 +6695,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,  #ifdef CONFIG_SCHED_MC  static DEFINE_PER_CPU(struct static_sched_domain, core_domains);  static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); -#endif /* CONFIG_SCHED_MC */ -#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)  static int  cpu_to_core_group(int cpu, const struct cpumask *cpu_map,  		  struct sched_group **sg, struct cpumask *mask)  {  	int group; - +#ifdef CONFIG_SCHED_SMT  	cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);  	group = cpumask_first(mask); +#else +	group = cpu; +#endif  	if (sg)  		*sg = &per_cpu(sched_group_core, group).sg;  	return group;  } -#elif defined(CONFIG_SCHED_MC) +#endif /* CONFIG_SCHED_MC */ + +/* + * book sched-domains: + */ +#ifdef CONFIG_SCHED_BOOK +static DEFINE_PER_CPU(struct static_sched_domain, book_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); +  static int -cpu_to_core_group(int cpu, const struct cpumask *cpu_map, -		  struct sched_group **sg, struct cpumask *unused) +cpu_to_book_group(int cpu, const struct cpumask *cpu_map, +		  struct sched_group **sg, struct cpumask *mask)  { +	int group = cpu; +#ifdef CONFIG_SCHED_MC +	cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); +	group = cpumask_first(mask); +#elif defined(CONFIG_SCHED_SMT) +	cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); +	group = cpumask_first(mask); +#endif  	if (sg) -		*sg = 
&per_cpu(sched_group_core, cpu).sg; -	return cpu; +		*sg = &per_cpu(sched_group_book, group).sg; +	return group;  } -#endif +#endif /* CONFIG_SCHED_BOOK */  static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);  static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); @@ -8121,7 +6746,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,  		  struct sched_group **sg, struct cpumask *mask)  {  	int group; -#ifdef CONFIG_SCHED_MC +#ifdef CONFIG_SCHED_BOOK +	cpumask_and(mask, cpu_book_mask(cpu), cpu_map); +	group = cpumask_first(mask); +#elif defined(CONFIG_SCHED_MC)  	cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);  	group = cpumask_first(mask);  #elif defined(CONFIG_SCHED_SMT) @@ -8181,11 +6809,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)  				continue;  			} -			sg_inc_cpu_power(sg, sd->groups->__cpu_power); +			sg->cpu_power += sd->groups->cpu_power;  		}  		sg = sg->next;  	} while (sg != group_head);  } + +static int build_numa_sched_groups(struct s_data *d, +				   const struct cpumask *cpu_map, int num) +{ +	struct sched_domain *sd; +	struct sched_group *sg, *prev; +	int n, j; + +	cpumask_clear(d->covered); +	cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); +	if (cpumask_empty(d->nodemask)) { +		d->sched_group_nodes[num] = NULL; +		goto out; +	} + +	sched_domain_node_span(num, d->domainspan); +	cpumask_and(d->domainspan, d->domainspan, cpu_map); + +	sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), +			  GFP_KERNEL, num); +	if (!sg) { +		printk(KERN_WARNING "Can not alloc domain group for node %d\n", +		       num); +		return -ENOMEM; +	} +	d->sched_group_nodes[num] = sg; + +	for_each_cpu(j, d->nodemask) { +		sd = &per_cpu(node_domains, j).sd; +		sd->groups = sg; +	} + +	sg->cpu_power = 0; +	cpumask_copy(sched_group_cpus(sg), d->nodemask); +	sg->next = sg; +	cpumask_or(d->covered, d->covered, d->nodemask); + +	prev = sg; +	for (j = 0; j < nr_node_ids; j++) { +		n = (num 
+ j) % nr_node_ids; +		cpumask_complement(d->notcovered, d->covered); +		cpumask_and(d->tmpmask, d->notcovered, cpu_map); +		cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); +		if (cpumask_empty(d->tmpmask)) +			break; +		cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); +		if (cpumask_empty(d->tmpmask)) +			continue; +		sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), +				  GFP_KERNEL, num); +		if (!sg) { +			printk(KERN_WARNING +			       "Can not alloc domain group for node %d\n", j); +			return -ENOMEM; +		} +		sg->cpu_power = 0; +		cpumask_copy(sched_group_cpus(sg), d->tmpmask); +		sg->next = prev->next; +		cpumask_or(d->covered, d->covered, d->tmpmask); +		prev->next = sg; +		prev = sg; +	} +out: +	return 0; +}  #endif /* CONFIG_NUMA */  #ifdef CONFIG_NUMA @@ -8239,45 +6932,49 @@ static void free_sched_groups(const struct cpumask *cpu_map,   * there are asymmetries in the topology. If there are asymmetries, group   * having more cpu_power will pickup more load compared to the group having   * less cpu_power. - * - * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents - * the maximum number of tasks a group can handle in the presence of other idle - * or lightly loaded groups in the same sched domain.   */  static void init_sched_groups_power(int cpu, struct sched_domain *sd)  {  	struct sched_domain *child;  	struct sched_group *group; +	long power; +	int weight;  	WARN_ON(!sd || !sd->groups);  	if (cpu != group_first_cpu(sd->groups))  		return; +	sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); +  	child = sd->child; -	sd->groups->__cpu_power = 0; +	sd->groups->cpu_power = 0; -	/* -	 * For perf policy, if the groups in child domain share resources -	 * (for example cores sharing some portions of the cache hierarchy -	 * or SMT), then set this domain groups cpu_power such that each group -	 * can handle only one task, when there are other idle groups in the -	 * same sched domain. 
-	 */ -	if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && -		       (child->flags & -			(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { -		sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); +	if (!child) { +		power = SCHED_LOAD_SCALE; +		weight = cpumask_weight(sched_domain_span(sd)); +		/* +		 * SMT siblings share the power of a single core. +		 * Usually multiple threads get a better yield out of +		 * that one core than a single thread would have, +		 * reflect that in sd->smt_gain. +		 */ +		if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { +			power *= sd->smt_gain; +			power /= weight; +			power >>= SCHED_LOAD_SHIFT; +		} +		sd->groups->cpu_power += power;  		return;  	}  	/* -	 * add cpu_power of each child group to this groups cpu_power +	 * Add cpu_power of each child group to this groups cpu_power.  	 */  	group = child->groups;  	do { -		sg_inc_cpu_power(sd->groups, group->__cpu_power); +		sd->groups->cpu_power += group->cpu_power;  		group = group->next;  	} while (group != child->groups);  } @@ -8315,6 +7012,9 @@ SD_INIT_FUNC(CPU)  #ifdef CONFIG_SCHED_MC   SD_INIT_FUNC(MC)  #endif +#ifdef CONFIG_SCHED_BOOK + SD_INIT_FUNC(BOOK) +#endif  static int default_relax_domain_level = -1; @@ -8344,332 +7044,349 @@ static void set_domain_attribute(struct sched_domain *sd,  		request = attr->relax_domain_level;  	if (request < sd->level) {  		/* turn off idle balance on this domain */ -		sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); +		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);  	} else {  		/* turn on idle balance on this domain */ -		sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); +		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); +	} +} + +static void __free_domain_allocs(struct s_data *d, enum s_alloc what, +				 const struct cpumask *cpu_map) +{ +	switch (what) { +	case sa_sched_groups: +		free_sched_groups(cpu_map, d->tmpmask); /* fall through */ +		d->sched_group_nodes = NULL; +	case sa_rootdomain: +		free_rootdomain(d->rd); /* 
fall through */ +	case sa_tmpmask: +		free_cpumask_var(d->tmpmask); /* fall through */ +	case sa_send_covered: +		free_cpumask_var(d->send_covered); /* fall through */ +	case sa_this_book_map: +		free_cpumask_var(d->this_book_map); /* fall through */ +	case sa_this_core_map: +		free_cpumask_var(d->this_core_map); /* fall through */ +	case sa_this_sibling_map: +		free_cpumask_var(d->this_sibling_map); /* fall through */ +	case sa_nodemask: +		free_cpumask_var(d->nodemask); /* fall through */ +	case sa_sched_group_nodes: +#ifdef CONFIG_NUMA +		kfree(d->sched_group_nodes); /* fall through */ +	case sa_notcovered: +		free_cpumask_var(d->notcovered); /* fall through */ +	case sa_covered: +		free_cpumask_var(d->covered); /* fall through */ +	case sa_domainspan: +		free_cpumask_var(d->domainspan); /* fall through */ +#endif +	case sa_none: +		break;  	}  } -/* - * Build sched domains for a given set of cpus and attach the sched domains - * to the individual cpus - */ -static int __build_sched_domains(const struct cpumask *cpu_map, -				 struct sched_domain_attr *attr) +static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, +						   const struct cpumask *cpu_map)  { -	int i, err = -ENOMEM; -	struct root_domain *rd; -	cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, -		tmpmask;  #ifdef CONFIG_NUMA -	cpumask_var_t domainspan, covered, notcovered; -	struct sched_group **sched_group_nodes = NULL; -	int sd_allnodes = 0; - -	if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) -		goto out; -	if (!alloc_cpumask_var(&covered, GFP_KERNEL)) -		goto free_domainspan; -	if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) -		goto free_covered; -#endif - -	if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) -		goto free_notcovered; -	if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) -		goto free_nodemask; -	if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) -		goto free_this_sibling_map; -	if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) -		goto 
free_this_core_map; -	if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) -		goto free_send_covered; - -#ifdef CONFIG_NUMA -	/* -	 * Allocate the per-node list of sched groups -	 */ -	sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), -				    GFP_KERNEL); -	if (!sched_group_nodes) { +	if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) +		return sa_none; +	if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) +		return sa_domainspan; +	if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) +		return sa_covered; +	/* Allocate the per-node list of sched groups */ +	d->sched_group_nodes = kcalloc(nr_node_ids, +				      sizeof(struct sched_group *), GFP_KERNEL); +	if (!d->sched_group_nodes) {  		printk(KERN_WARNING "Can not alloc sched group node list\n"); -		goto free_tmpmask; -	} -#endif - -	rd = alloc_rootdomain(); -	if (!rd) { +		return sa_notcovered; +	} +	sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; +#endif +	if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) +		return sa_sched_group_nodes; +	if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) +		return sa_nodemask; +	if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) +		return sa_this_sibling_map; +	if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) +		return sa_this_core_map; +	if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) +		return sa_this_book_map; +	if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) +		return sa_send_covered; +	d->rd = alloc_rootdomain(); +	if (!d->rd) {  		printk(KERN_WARNING "Cannot alloc root domain\n"); -		goto free_sched_groups; +		return sa_tmpmask;  	} +	return sa_rootdomain; +} +static struct sched_domain *__build_numa_sched_domains(struct s_data *d, +	const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) +{ +	struct sched_domain *sd = NULL;  #ifdef CONFIG_NUMA -	sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; -#endif +	struct sched_domain *parent; -	/* -	 * Set up domains for cpus specified by 
the cpu_map. -	 */ -	for_each_cpu(i, cpu_map) { -		struct sched_domain *sd = NULL, *p; +	d->sd_allnodes = 0; +	if (cpumask_weight(cpu_map) > +	    SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { +		sd = &per_cpu(allnodes_domains, i).sd; +		SD_INIT(sd, ALLNODES); +		set_domain_attribute(sd, attr); +		cpumask_copy(sched_domain_span(sd), cpu_map); +		cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); +		d->sd_allnodes = 1; +	} +	parent = sd; -		cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); +	sd = &per_cpu(node_domains, i).sd; +	SD_INIT(sd, NODE); +	set_domain_attribute(sd, attr); +	sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); +	sd->parent = parent; +	if (parent) +		parent->child = sd; +	cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); +#endif +	return sd; +} -#ifdef CONFIG_NUMA -		if (cpumask_weight(cpu_map) > -				SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { -			sd = &per_cpu(allnodes_domains, i).sd; -			SD_INIT(sd, ALLNODES); -			set_domain_attribute(sd, attr); -			cpumask_copy(sched_domain_span(sd), cpu_map); -			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); -			p = sd; -			sd_allnodes = 1; -		} else -			p = NULL; +static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, +	const struct cpumask *cpu_map, struct sched_domain_attr *attr, +	struct sched_domain *parent, int i) +{ +	struct sched_domain *sd; +	sd = &per_cpu(phys_domains, i).sd; +	SD_INIT(sd, CPU); +	set_domain_attribute(sd, attr); +	cpumask_copy(sched_domain_span(sd), d->nodemask); +	sd->parent = parent; +	if (parent) +		parent->child = sd; +	cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); +	return sd; +} -		sd = &per_cpu(node_domains, i).sd; -		SD_INIT(sd, NODE); -		set_domain_attribute(sd, attr); -		sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); -		sd->parent = p; -		if (p) -			p->child = sd; -		cpumask_and(sched_domain_span(sd), -			    sched_domain_span(sd), cpu_map); +static 
struct sched_domain *__build_book_sched_domain(struct s_data *d, +	const struct cpumask *cpu_map, struct sched_domain_attr *attr, +	struct sched_domain *parent, int i) +{ +	struct sched_domain *sd = parent; +#ifdef CONFIG_SCHED_BOOK +	sd = &per_cpu(book_domains, i).sd; +	SD_INIT(sd, BOOK); +	set_domain_attribute(sd, attr); +	cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); +	sd->parent = parent; +	parent->child = sd; +	cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);  #endif +	return sd; +} -		p = sd; -		sd = &per_cpu(phys_domains, i).sd; -		SD_INIT(sd, CPU); -		set_domain_attribute(sd, attr); -		cpumask_copy(sched_domain_span(sd), nodemask); -		sd->parent = p; -		if (p) -			p->child = sd; -		cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); - +static struct sched_domain *__build_mc_sched_domain(struct s_data *d, +	const struct cpumask *cpu_map, struct sched_domain_attr *attr, +	struct sched_domain *parent, int i) +{ +	struct sched_domain *sd = parent;  #ifdef CONFIG_SCHED_MC -		p = sd; -		sd = &per_cpu(core_domains, i).sd; -		SD_INIT(sd, MC); -		set_domain_attribute(sd, attr); -		cpumask_and(sched_domain_span(sd), cpu_map, -						   cpu_coregroup_mask(i)); -		sd->parent = p; -		p->child = sd; -		cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); +	sd = &per_cpu(core_domains, i).sd; +	SD_INIT(sd, MC); +	set_domain_attribute(sd, attr); +	cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); +	sd->parent = parent; +	parent->child = sd; +	cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);  #endif +	return sd; +} +static struct sched_domain *__build_smt_sched_domain(struct s_data *d, +	const struct cpumask *cpu_map, struct sched_domain_attr *attr, +	struct sched_domain *parent, int i) +{ +	struct sched_domain *sd = parent;  #ifdef CONFIG_SCHED_SMT -		p = sd; -		sd = &per_cpu(cpu_domains, i).sd; -		SD_INIT(sd, SIBLING); -		set_domain_attribute(sd, attr); -		cpumask_and(sched_domain_span(sd), -			    
topology_thread_cpumask(i), cpu_map); -		sd->parent = p; -		p->child = sd; -		cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); +	sd = &per_cpu(cpu_domains, i).sd; +	SD_INIT(sd, SIBLING); +	set_domain_attribute(sd, attr); +	cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); +	sd->parent = parent; +	parent->child = sd; +	cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);  #endif -	} +	return sd; +} +static void build_sched_groups(struct s_data *d, enum sched_domain_level l, +			       const struct cpumask *cpu_map, int cpu) +{ +	switch (l) {  #ifdef CONFIG_SCHED_SMT -	/* Set up CPU (sibling) groups */ -	for_each_cpu(i, cpu_map) { -		cpumask_and(this_sibling_map, -			    topology_thread_cpumask(i), cpu_map); -		if (i != cpumask_first(this_sibling_map)) -			continue; - -		init_sched_build_groups(this_sibling_map, cpu_map, -					&cpu_to_cpu_group, -					send_covered, tmpmask); -	} +	case SD_LV_SIBLING: /* set up CPU (sibling) groups */ +		cpumask_and(d->this_sibling_map, cpu_map, +			    topology_thread_cpumask(cpu)); +		if (cpu == cpumask_first(d->this_sibling_map)) +			init_sched_build_groups(d->this_sibling_map, cpu_map, +						&cpu_to_cpu_group, +						d->send_covered, d->tmpmask); +		break;  #endif -  #ifdef CONFIG_SCHED_MC -	/* Set up multi-core groups */ -	for_each_cpu(i, cpu_map) { -		cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); -		if (i != cpumask_first(this_core_map)) -			continue; - -		init_sched_build_groups(this_core_map, cpu_map, -					&cpu_to_core_group, -					send_covered, tmpmask); -	} +	case SD_LV_MC: /* set up multi-core groups */ +		cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); +		if (cpu == cpumask_first(d->this_core_map)) +			init_sched_build_groups(d->this_core_map, cpu_map, +						&cpu_to_core_group, +						d->send_covered, d->tmpmask); +		break;  #endif - -	/* Set up physical groups */ -	for (i = 0; i < nr_node_ids; i++) { -		cpumask_and(nodemask, cpumask_of_node(i), cpu_map); -		if 
(cpumask_empty(nodemask)) -			continue; - -		init_sched_build_groups(nodemask, cpu_map, -					&cpu_to_phys_group, -					send_covered, tmpmask); -	} - +#ifdef CONFIG_SCHED_BOOK +	case SD_LV_BOOK: /* set up book groups */ +		cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); +		if (cpu == cpumask_first(d->this_book_map)) +			init_sched_build_groups(d->this_book_map, cpu_map, +						&cpu_to_book_group, +						d->send_covered, d->tmpmask); +		break; +#endif +	case SD_LV_CPU: /* set up physical groups */ +		cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); +		if (!cpumask_empty(d->nodemask)) +			init_sched_build_groups(d->nodemask, cpu_map, +						&cpu_to_phys_group, +						d->send_covered, d->tmpmask); +		break;  #ifdef CONFIG_NUMA -	/* Set up node groups */ -	if (sd_allnodes) { -		init_sched_build_groups(cpu_map, cpu_map, -					&cpu_to_allnodes_group, -					send_covered, tmpmask); +	case SD_LV_ALLNODES: +		init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, +					d->send_covered, d->tmpmask); +		break; +#endif +	default: +		break;  	} +} -	for (i = 0; i < nr_node_ids; i++) { -		/* Set up node groups */ -		struct sched_group *sg, *prev; -		int j; - -		cpumask_clear(covered); -		cpumask_and(nodemask, cpumask_of_node(i), cpu_map); -		if (cpumask_empty(nodemask)) { -			sched_group_nodes[i] = NULL; -			continue; -		} +/* + * Build sched domains for a given set of cpus and attach the sched domains + * to the individual cpus + */ +static int __build_sched_domains(const struct cpumask *cpu_map, +				 struct sched_domain_attr *attr) +{ +	enum s_alloc alloc_state = sa_none; +	struct s_data d; +	struct sched_domain *sd; +	int i; +#ifdef CONFIG_NUMA +	d.sd_allnodes = 0; +#endif -		sched_domain_node_span(i, domainspan); -		cpumask_and(domainspan, domainspan, cpu_map); +	alloc_state = __visit_domain_allocation_hell(&d, cpu_map); +	if (alloc_state != sa_rootdomain) +		goto error; +	alloc_state = sa_sched_groups; -		sg = kmalloc_node(sizeof(struct 
sched_group) + cpumask_size(), -				  GFP_KERNEL, i); -		if (!sg) { -			printk(KERN_WARNING "Can not alloc domain group for " -				"node %d\n", i); -			goto error; -		} -		sched_group_nodes[i] = sg; -		for_each_cpu(j, nodemask) { -			struct sched_domain *sd; +	/* +	 * Set up domains for cpus specified by the cpu_map. +	 */ +	for_each_cpu(i, cpu_map) { +		cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), +			    cpu_map); -			sd = &per_cpu(node_domains, j).sd; -			sd->groups = sg; -		} -		sg->__cpu_power = 0; -		cpumask_copy(sched_group_cpus(sg), nodemask); -		sg->next = sg; -		cpumask_or(covered, covered, nodemask); -		prev = sg; +		sd = __build_numa_sched_domains(&d, cpu_map, attr, i); +		sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); +		sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); +		sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); +		sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); +	} -		for (j = 0; j < nr_node_ids; j++) { -			int n = (i + j) % nr_node_ids; +	for_each_cpu(i, cpu_map) { +		build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); +		build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); +		build_sched_groups(&d, SD_LV_MC, cpu_map, i); +	} -			cpumask_complement(notcovered, covered); -			cpumask_and(tmpmask, notcovered, cpu_map); -			cpumask_and(tmpmask, tmpmask, domainspan); -			if (cpumask_empty(tmpmask)) -				break; +	/* Set up physical groups */ +	for (i = 0; i < nr_node_ids; i++) +		build_sched_groups(&d, SD_LV_CPU, cpu_map, i); -			cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); -			if (cpumask_empty(tmpmask)) -				continue; +#ifdef CONFIG_NUMA +	/* Set up node groups */ +	if (d.sd_allnodes) +		build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); -			sg = kmalloc_node(sizeof(struct sched_group) + -					  cpumask_size(), -					  GFP_KERNEL, i); -			if (!sg) { -				printk(KERN_WARNING -				"Can not alloc domain group for node %d\n", j); -				goto error; -			} -			sg->__cpu_power = 0; -			
cpumask_copy(sched_group_cpus(sg), tmpmask); -			sg->next = prev->next; -			cpumask_or(covered, covered, tmpmask); -			prev->next = sg; -			prev = sg; -		} -	} +	for (i = 0; i < nr_node_ids; i++) +		if (build_numa_sched_groups(&d, cpu_map, i)) +			goto error;  #endif  	/* Calculate CPU power for physical packages and nodes */  #ifdef CONFIG_SCHED_SMT  	for_each_cpu(i, cpu_map) { -		struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; - +		sd = &per_cpu(cpu_domains, i).sd;  		init_sched_groups_power(i, sd);  	}  #endif  #ifdef CONFIG_SCHED_MC  	for_each_cpu(i, cpu_map) { -		struct sched_domain *sd = &per_cpu(core_domains, i).sd; - +		sd = &per_cpu(core_domains, i).sd;  		init_sched_groups_power(i, sd);  	}  #endif - +#ifdef CONFIG_SCHED_BOOK  	for_each_cpu(i, cpu_map) { -		struct sched_domain *sd = &per_cpu(phys_domains, i).sd; +		sd = &per_cpu(book_domains, i).sd; +		init_sched_groups_power(i, sd); +	} +#endif +	for_each_cpu(i, cpu_map) { +		sd = &per_cpu(phys_domains, i).sd;  		init_sched_groups_power(i, sd);  	}  #ifdef CONFIG_NUMA  	for (i = 0; i < nr_node_ids; i++) -		init_numa_sched_groups_power(sched_group_nodes[i]); +		init_numa_sched_groups_power(d.sched_group_nodes[i]); -	if (sd_allnodes) { +	if (d.sd_allnodes) {  		struct sched_group *sg;  		cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, -								tmpmask); +								d.tmpmask);  		init_numa_sched_groups_power(sg);  	}  #endif  	/* Attach the domains */  	for_each_cpu(i, cpu_map) { -		struct sched_domain *sd;  #ifdef CONFIG_SCHED_SMT  		sd = &per_cpu(cpu_domains, i).sd;  #elif defined(CONFIG_SCHED_MC)  		sd = &per_cpu(core_domains, i).sd; +#elif defined(CONFIG_SCHED_BOOK) +		sd = &per_cpu(book_domains, i).sd;  #else  		sd = &per_cpu(phys_domains, i).sd;  #endif -		cpu_attach_domain(sd, rd, i); +		cpu_attach_domain(sd, d.rd, i);  	} -	err = 0; - -free_tmpmask: -	free_cpumask_var(tmpmask); -free_send_covered: -	free_cpumask_var(send_covered); -free_this_core_map: -	
free_cpumask_var(this_core_map); -free_this_sibling_map: -	free_cpumask_var(this_sibling_map); -free_nodemask: -	free_cpumask_var(nodemask); -free_notcovered: -#ifdef CONFIG_NUMA -	free_cpumask_var(notcovered); -free_covered: -	free_cpumask_var(covered); -free_domainspan: -	free_cpumask_var(domainspan); -out: -#endif -	return err; - -free_sched_groups: -#ifdef CONFIG_NUMA -	kfree(sched_group_nodes); -#endif -	goto free_tmpmask; +	d.sched_group_nodes = NULL; /* don't free this we still need it */ +	__free_domain_allocs(&d, sa_tmpmask, cpu_map); +	return 0; -#ifdef CONFIG_NUMA  error: -	free_sched_groups(cpu_map, tmpmask); -	free_rootdomain(rd); -	goto free_tmpmask; -#endif +	__free_domain_allocs(&d, alloc_state, cpu_map); +	return -ENOMEM;  }  static int build_sched_domains(const struct cpumask *cpu_map) @@ -8677,7 +7394,7 @@ static int build_sched_domains(const struct cpumask *cpu_map)  	return __build_sched_domains(cpu_map, NULL);  } -static struct cpumask *doms_cur;	/* current sched domains */ +static cpumask_var_t *doms_cur;	/* current sched domains */  static int ndoms_cur;		/* number of sched domains in 'doms_cur' */  static struct sched_domain_attr *dattr_cur;  				/* attribues of custom domains in 'doms_cur' */ @@ -8699,6 +7416,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void)  	return 0;  } +cpumask_var_t *alloc_sched_domains(unsigned int ndoms) +{ +	int i; +	cpumask_var_t *doms; + +	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); +	if (!doms) +		return NULL; +	for (i = 0; i < ndoms; i++) { +		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { +			free_sched_domains(doms, i); +			return NULL; +		} +	} +	return doms; +} + +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) +{ +	unsigned int i; +	for (i = 0; i < ndoms; i++) +		free_cpumask_var(doms[i]); +	kfree(doms); +} +  /*   * Set up scheduler domains and groups. Callers must hold the hotplug lock.   
* For now this just excludes isolated cpus, but could be used to @@ -8710,12 +7452,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)  	arch_update_cpu_topology();  	ndoms_cur = 1; -	doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); +	doms_cur = alloc_sched_domains(ndoms_cur);  	if (!doms_cur) -		doms_cur = fallback_doms; -	cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); +		doms_cur = &fallback_doms; +	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);  	dattr_cur = NULL; -	err = build_sched_domains(doms_cur); +	err = build_sched_domains(doms_cur[0]);  	register_sched_domain_sysctl();  	return err; @@ -8765,19 +7507,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,   * doms_new[] to the current sched domain partitioning, doms_cur[].   * It destroys each deleted domain and builds each new domain.   * - * 'doms_new' is an array of cpumask's of length 'ndoms_new'. + * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.   * The masks don't intersect (don't overlap.) We should setup one   * sched domain for each mask. CPUs not in any of the cpumasks will   * not be load balanced. If the same cpumask appears both in the   * current 'doms_cur' domains and in the new 'doms_new', we can leave   * it as it is.   * - * The passed in 'doms_new' should be kmalloc'd. This routine takes - * ownership of it and will kfree it when done with it. If the caller - * failed the kmalloc call, then it can pass in doms_new == NULL && - * ndoms_new == 1, and partition_sched_domains() will fallback to - * the single partition 'fallback_doms', it also forces the domains - * to be rebuilt. + * The passed in 'doms_new' should be allocated using + * alloc_sched_domains.  This routine takes ownership of it and will + * free_sched_domains it when done with it. 
If the caller failed the + * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, + * and partition_sched_domains() will fallback to the single partition + * 'fallback_doms', it also forces the domains to be rebuilt.   *   * If doms_new == NULL it will be replaced with cpu_online_mask.   * ndoms_new == 0 is a special case for destroying existing domains, @@ -8785,8 +7527,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,   *   * Call with hotplug lock held   */ -/* FIXME: Change to struct cpumask *doms_new[] */ -void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],  			     struct sched_domain_attr *dattr_new)  {  	int i, j, n; @@ -8805,40 +7546,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,  	/* Destroy deleted domains */  	for (i = 0; i < ndoms_cur; i++) {  		for (j = 0; j < n && !new_topology; j++) { -			if (cpumask_equal(&doms_cur[i], &doms_new[j]) +			if (cpumask_equal(doms_cur[i], doms_new[j])  			    && dattrs_equal(dattr_cur, i, dattr_new, j))  				goto match1;  		}  		/* no match - a current sched domain not in new doms_new[] */ -		detach_destroy_domains(doms_cur + i); +		detach_destroy_domains(doms_cur[i]);  match1:  		;  	}  	if (doms_new == NULL) {  		ndoms_cur = 0; -		doms_new = fallback_doms; -		cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); +		doms_new = &fallback_doms; +		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);  		WARN_ON_ONCE(dattr_new);  	}  	/* Build new domains */  	for (i = 0; i < ndoms_new; i++) {  		for (j = 0; j < ndoms_cur && !new_topology; j++) { -			if (cpumask_equal(&doms_new[i], &doms_cur[j]) +			if (cpumask_equal(doms_new[i], doms_cur[j])  			    && dattrs_equal(dattr_new, i, dattr_cur, j))  				goto match2;  		}  		/* no match - add a new doms_new */ -		__build_sched_domains(doms_new + i, +		__build_sched_domains(doms_new[i],  					dattr_new ? 
dattr_new + i : NULL);  match2:  		;  	}  	/* Remember the new sched domains */ -	if (doms_cur != fallback_doms) -		kfree(doms_cur); +	if (doms_cur != &fallback_doms) +		free_sched_domains(doms_cur, ndoms_cur);  	kfree(dattr_cur);	/* kfree(NULL) is safe */  	doms_cur = doms_new;  	dattr_cur = dattr_new; @@ -8890,11 +7631,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)  #ifdef CONFIG_SCHED_MC  static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, +					   struct sysdev_class_attribute *attr,  					   char *page)  {  	return sprintf(page, "%u\n", sched_mc_power_savings);  }  static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, +					    struct sysdev_class_attribute *attr,  					    const char *buf, size_t count)  {  	return sched_power_savings_store(buf, count, 0); @@ -8906,11 +7649,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,  #ifdef CONFIG_SCHED_SMT  static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, +					    struct sysdev_class_attribute *attr,  					    char *page)  {  	return sprintf(page, "%u\n", sched_smt_power_savings);  }  static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, +					     struct sysdev_class_attribute *attr,  					     const char *buf, size_t count)  {  	return sched_power_savings_store(buf, count, 1); @@ -8938,27 +7683,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)  }  #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -#ifndef CONFIG_CPUSETS  /* - * Add online and remove offline CPUs from the scheduler domains. - * When cpusets are enabled they take over this function. + * Update cpusets according to cpu_active mask.  If cpusets are + * disabled, cpuset_update_active_cpus() becomes a simple wrapper + * around partition_sched_domains().   
*/ -static int update_sched_domains(struct notifier_block *nfb, -				unsigned long action, void *hcpu) +static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, +			     void *hcpu)  { -	switch (action) { +	switch (action & ~CPU_TASKS_FROZEN) {  	case CPU_ONLINE: -	case CPU_ONLINE_FROZEN: -	case CPU_DEAD: -	case CPU_DEAD_FROZEN: -		partition_sched_domains(1, NULL, NULL); +	case CPU_DOWN_FAILED: +		cpuset_update_active_cpus();  		return NOTIFY_OK; +	default: +		return NOTIFY_DONE; +	} +} +static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, +			       void *hcpu) +{ +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_DOWN_PREPARE: +		cpuset_update_active_cpus(); +		return NOTIFY_OK;  	default:  		return NOTIFY_DONE;  	}  } -#endif  static int update_runtime(struct notifier_block *nfb,  				unsigned long action, void *hcpu) @@ -8988,6 +7741,7 @@ void __init sched_init_smp(void)  	cpumask_var_t non_isolated_cpus;  	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); +	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);  #if defined(CONFIG_NUMA)  	sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), @@ -8996,17 +7750,15 @@ void __init sched_init_smp(void)  #endif  	get_online_cpus();  	mutex_lock(&sched_domains_mutex); -	arch_init_sched_domains(cpu_online_mask); +	arch_init_sched_domains(cpu_active_mask);  	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);  	if (cpumask_empty(non_isolated_cpus))  		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);  	mutex_unlock(&sched_domains_mutex);  	put_online_cpus(); -#ifndef CONFIG_CPUSETS -	/* XXX: Theoretical race here - CPU may be hotplugged now */ -	hotcpu_notifier(update_sched_domains, 0); -#endif +	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); +	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);  	/* RT runtime code needs to handle some hotplug events */  	hotcpu_notifier(update_runtime, 0); @@ -9019,7 +7771,6 @@ void 
__init sched_init_smp(void)  	sched_init_granularity();  	free_cpumask_var(non_isolated_cpus); -	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);  	init_sched_rt_class();  }  #else @@ -9070,13 +7821,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)  #ifdef CONFIG_SMP  	rt_rq->rt_nr_migratory = 0;  	rt_rq->overloaded = 0; -	plist_head_init(&rq->rt.pushable_tasks, &rq->lock); +	plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);  #endif  	rt_rq->rt_time = 0;  	rt_rq->rt_throttled = 0;  	rt_rq->rt_runtime = 0; -	spin_lock_init(&rt_rq->rt_runtime_lock); +	raw_spin_lock_init(&rt_rq->rt_runtime_lock);  #ifdef CONFIG_RT_GROUP_SCHED  	rt_rq->rt_nr_boosted = 0; @@ -9086,18 +7837,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)  #ifdef CONFIG_FAIR_GROUP_SCHED  static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, -				struct sched_entity *se, int cpu, int add, +				struct sched_entity *se, int cpu,  				struct sched_entity *parent)  {  	struct rq *rq = cpu_rq(cpu);  	tg->cfs_rq[cpu] = cfs_rq;  	init_cfs_rq(cfs_rq, rq);  	cfs_rq->tg = tg; -	if (add) -		list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);  	tg->se[cpu] = se; -	/* se could be NULL for init_task_group */ +	/* se could be NULL for root_task_group */  	if (!se)  		return; @@ -9107,15 +7856,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,  		se->cfs_rq = parent->my_q;  	se->my_q = cfs_rq; -	se->load.weight = tg->shares; -	se->load.inv_weight = 0; +	update_load_set(&se->load, 0);  	se->parent = parent;  }  #endif  #ifdef CONFIG_RT_GROUP_SCHED  static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, -		struct sched_rt_entity *rt_se, int cpu, int add, +		struct sched_rt_entity *rt_se, int cpu,  		struct sched_rt_entity *parent)  {  	struct rq *rq = cpu_rq(cpu); @@ -9123,10 +7871,7 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,  	tg->rt_rq[cpu] = rt_rq;  	init_rt_rq(rt_rq, rq);  	
rt_rq->tg = tg; -	rt_rq->rt_se = rt_se;  	rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; -	if (add) -		list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);  	tg->rt_se[cpu] = rt_se;  	if (!rt_se) @@ -9154,48 +7899,27 @@ void __init sched_init(void)  #ifdef CONFIG_RT_GROUP_SCHED  	alloc_size += 2 * nr_cpu_ids * sizeof(void **);  #endif -#ifdef CONFIG_USER_SCHED -	alloc_size *= 2; -#endif  #ifdef CONFIG_CPUMASK_OFFSTACK  	alloc_size += num_possible_cpus() * cpumask_size();  #endif -	/* -	 * As sched_init() is called before page_alloc is setup, -	 * we use alloc_bootmem(). -	 */  	if (alloc_size) {  		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);  #ifdef CONFIG_FAIR_GROUP_SCHED -		init_task_group.se = (struct sched_entity **)ptr; -		ptr += nr_cpu_ids * sizeof(void **); - -		init_task_group.cfs_rq = (struct cfs_rq **)ptr; -		ptr += nr_cpu_ids * sizeof(void **); - -#ifdef CONFIG_USER_SCHED  		root_task_group.se = (struct sched_entity **)ptr;  		ptr += nr_cpu_ids * sizeof(void **);  		root_task_group.cfs_rq = (struct cfs_rq **)ptr;  		ptr += nr_cpu_ids * sizeof(void **); -#endif /* CONFIG_USER_SCHED */ +  #endif /* CONFIG_FAIR_GROUP_SCHED */  #ifdef CONFIG_RT_GROUP_SCHED -		init_task_group.rt_se = (struct sched_rt_entity **)ptr; -		ptr += nr_cpu_ids * sizeof(void **); - -		init_task_group.rt_rq = (struct rt_rq **)ptr; -		ptr += nr_cpu_ids * sizeof(void **); - -#ifdef CONFIG_USER_SCHED  		root_task_group.rt_se = (struct sched_rt_entity **)ptr;  		ptr += nr_cpu_ids * sizeof(void **);  		root_task_group.rt_rq = (struct rt_rq **)ptr;  		ptr += nr_cpu_ids * sizeof(void **); -#endif /* CONFIG_USER_SCHED */ +  #endif /* CONFIG_RT_GROUP_SCHED */  #ifdef CONFIG_CPUMASK_OFFSTACK  		for_each_possible_cpu(i) { @@ -9213,108 +7937,79 @@ void __init sched_init(void)  			global_rt_period(), global_rt_runtime());  #ifdef CONFIG_RT_GROUP_SCHED -	init_rt_bandwidth(&init_task_group.rt_bandwidth, -			global_rt_period(), global_rt_runtime()); -#ifdef CONFIG_USER_SCHED  	
init_rt_bandwidth(&root_task_group.rt_bandwidth, -			global_rt_period(), RUNTIME_INF); -#endif /* CONFIG_USER_SCHED */ +			global_rt_period(), global_rt_runtime());  #endif /* CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_GROUP_SCHED -	list_add(&init_task_group.list, &task_groups); -	INIT_LIST_HEAD(&init_task_group.children); - -#ifdef CONFIG_USER_SCHED +#ifdef CONFIG_CGROUP_SCHED +	list_add(&root_task_group.list, &task_groups);  	INIT_LIST_HEAD(&root_task_group.children); -	init_task_group.parent = &root_task_group; -	list_add(&init_task_group.siblings, &root_task_group.children); -#endif /* CONFIG_USER_SCHED */ -#endif /* CONFIG_GROUP_SCHED */ +	autogroup_init(&init_task); +#endif /* CONFIG_CGROUP_SCHED */  	for_each_possible_cpu(i) {  		struct rq *rq;  		rq = cpu_rq(i); -		spin_lock_init(&rq->lock); +		raw_spin_lock_init(&rq->lock);  		rq->nr_running = 0;  		rq->calc_load_active = 0;  		rq->calc_load_update = jiffies + LOAD_FREQ;  		init_cfs_rq(&rq->cfs, rq);  		init_rt_rq(&rq->rt, rq);  #ifdef CONFIG_FAIR_GROUP_SCHED -		init_task_group.shares = init_task_group_load; +		root_task_group.shares = root_task_group_load;  		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); -#ifdef CONFIG_CGROUP_SCHED  		/* -		 * How much cpu bandwidth does init_task_group get? +		 * How much cpu bandwidth does root_task_group get?  		 *  		 * In case of task-groups formed thr' the cgroup filesystem, it  		 * gets 100% of the cpu resources in the system. This overall  		 * system cpu resource is divided among the tasks of -		 * init_task_group and its child task-groups in a fair manner, +		 * root_task_group and its child task-groups in a fair manner,  		 * based on each entity's (task or task-group's) weight  		 * (se->load.weight).  		 
* -		 * In other words, if init_task_group has 10 tasks of weight +		 * In other words, if root_task_group has 10 tasks of weight  		 * 1024) and two child groups A0 and A1 (of weight 1024 each),  		 * then A0's share of the cpu resource is:  		 *  		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%  		 * -		 * We achieve this by letting init_task_group's tasks sit -		 * directly in rq->cfs (i.e init_task_group->se[] = NULL). +		 * We achieve this by letting root_task_group's tasks sit +		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).  		 */ -		init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); -#elif defined CONFIG_USER_SCHED -		root_task_group.shares = NICE_0_LOAD; -		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); -		/* -		 * In case of task-groups formed thr' the user id of tasks, -		 * init_task_group represents tasks belonging to root user. -		 * Hence it forms a sibling of all subsequent groups formed. -		 * In this case, init_task_group gets only a fraction of overall -		 * system cpu resource, based on the weight assigned to root -		 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished -		 * by letting tasks of init_task_group sit in a separate cfs_rq -		 * (init_cfs_rq) and having one entity represent this group of -		 * tasks in rq->cfs (i.e init_task_group->se[] != NULL). 
-		 */ -		init_tg_cfs_entry(&init_task_group, -				&per_cpu(init_cfs_rq, i), -				&per_cpu(init_sched_entity, i), i, 1, -				root_task_group.se[i]); - -#endif +		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);  #endif /* CONFIG_FAIR_GROUP_SCHED */  		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;  #ifdef CONFIG_RT_GROUP_SCHED  		INIT_LIST_HEAD(&rq->leaf_rt_rq_list); -#ifdef CONFIG_CGROUP_SCHED -		init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); -#elif defined CONFIG_USER_SCHED -		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); -		init_tg_rt_entry(&init_task_group, -				&per_cpu(init_rt_rq, i), -				&per_cpu(init_sched_rt_entity, i), i, 1, -				root_task_group.rt_se[i]); -#endif +		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);  #endif  		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)  			rq->cpu_load[j] = 0; + +		rq->last_load_update_tick = jiffies; +  #ifdef CONFIG_SMP  		rq->sd = NULL;  		rq->rd = NULL; +		rq->cpu_power = SCHED_LOAD_SCALE; +		rq->post_schedule = 0;  		rq->active_balance = 0;  		rq->next_balance = jiffies;  		rq->push_cpu = 0;  		rq->cpu = i;  		rq->online = 0; -		rq->migration_thread = NULL; -		INIT_LIST_HEAD(&rq->migration_queue); +		rq->idle_stamp = 0; +		rq->avg_idle = 2*sysctl_sched_migration_cost;  		rq_attach_root(rq, &def_root_domain); +#ifdef CONFIG_NO_HZ +		rq->nohz_balance_kick = 0; +		init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); +#endif  #endif  		init_rq_hrtick(rq);  		atomic_set(&rq->nr_iowait, 0); @@ -9331,7 +8026,7 @@ void __init sched_init(void)  #endif  #ifdef CONFIG_RT_MUTEXES -	plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); +	plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);  #endif  	/* @@ -9356,28 +8051,38 @@ void __init sched_init(void)  	current->sched_class = &fair_sched_class;  	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ -	alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); +	zalloc_cpumask_var(&nohz_cpu_mask, 
GFP_NOWAIT);  #ifdef CONFIG_SMP  #ifdef CONFIG_NO_HZ -	alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); -	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); -#endif -	alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); +	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); +	alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); +	atomic_set(&nohz.load_balancer, nr_cpu_ids); +	atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); +	atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); +#endif +	/* May be allocated at isolcpus cmdline parse time */ +	if (cpu_isolated_map == NULL) +		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);  #endif /* SMP */ -	perf_counter_init(); -  	scheduler_running = 1;  }  #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP -void __might_sleep(char *file, int line) +static inline int preempt_count_equals(int preempt_offset) +{ +	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); + +	return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); +} + +void __might_sleep(const char *file, int line, int preempt_offset)  {  #ifdef in_atomic  	static unsigned long prev_jiffy;	/* ratelimiting */ -	if ((!in_atomic() && !irqs_disabled()) || -		    system_state != SYSTEM_RUNNING || oops_in_progress) +	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || +	    system_state != SYSTEM_RUNNING || oops_in_progress)  		return;  	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)  		return; @@ -9405,7 +8110,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p)  {  	int on_rq; -	update_rq_clock(rq);  	on_rq = p->se.on_rq;  	if (on_rq)  		deactivate_task(rq, p, 0); @@ -9432,9 +8136,9 @@ void normalize_rt_tasks(void)  		p->se.exec_start		= 0;  #ifdef CONFIG_SCHEDSTATS -		p->se.wait_start		= 0; -		p->se.sleep_start		= 0; -		p->se.block_start		= 0; +		p->se.statistics.wait_start	= 0; +		p->se.statistics.sleep_start	= 0; +		p->se.statistics.block_start	= 0;  #endif  		if (!rt_task(p)) { @@ -9447,13 +8151,13 @@ void 
normalize_rt_tasks(void)  			continue;  		} -		spin_lock(&p->pi_lock); +		raw_spin_lock(&p->pi_lock);  		rq = __task_rq_lock(p);  		normalize_task(rq, p);  		__task_rq_unlock(rq); -		spin_unlock(&p->pi_lock); +		raw_spin_unlock(&p->pi_lock);  	} while_each_thread(g, p);  	read_unlock_irqrestore(&tasklist_lock, flags); @@ -9461,9 +8165,9 @@ void normalize_rt_tasks(void)  #endif /* CONFIG_MAGIC_SYSRQ */ -#ifdef CONFIG_IA64 +#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)  /* - * These functions are only useful for the IA64 MCA handling. + * These functions are only useful for the IA64 MCA handling, or kdb.   *   * They can only be called when the whole system has been   * stopped - every CPU needs to be quiescent, and no scheduling @@ -9483,6 +8187,9 @@ struct task_struct *curr_task(int cpu)  	return cpu_curr(cpu);  } +#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ + +#ifdef CONFIG_IA64  /**   * set_curr_task - set the current task for a given cpu.   * @cpu: the processor in question. @@ -9549,26 +8256,34 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)  		se = kzalloc_node(sizeof(struct sched_entity),  				  GFP_KERNEL, cpu_to_node(i));  		if (!se) -			goto err; +			goto err_free_rq; -		init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); +		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);  	}  	return 1; - err: +err_free_rq: +	kfree(cfs_rq); +err:  	return 0;  } -static inline void register_fair_sched_group(struct task_group *tg, int cpu) -{ -	list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, -			&cpu_rq(cpu)->leaf_cfs_rq_list); -} -  static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)  { -	list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); +	struct rq *rq = cpu_rq(cpu); +	unsigned long flags; + +	/* +	* Only empty task groups can be destroyed; so we can speculatively +	* check on_list without danger of it being re-added. 
+	*/ +	if (!tg->cfs_rq[cpu]->on_list) +		return; + +	raw_spin_lock_irqsave(&rq->lock, flags); +	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); +	raw_spin_unlock_irqrestore(&rq->lock, flags);  }  #else /* !CONFG_FAIR_GROUP_SCHED */  static inline void free_fair_sched_group(struct task_group *tg) @@ -9581,10 +8296,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)  	return 1;  } -static inline void register_fair_sched_group(struct task_group *tg, int cpu) -{ -} -  static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)  {  } @@ -9637,27 +8348,18 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)  		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),  				     GFP_KERNEL, cpu_to_node(i));  		if (!rt_se) -			goto err; +			goto err_free_rq; -		init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); +		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);  	}  	return 1; - err: +err_free_rq: +	kfree(rt_rq); +err:  	return 0;  } - -static inline void register_rt_sched_group(struct task_group *tg, int cpu) -{ -	list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, -			&cpu_rq(cpu)->leaf_rt_rq_list); -} - -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) -{ -	list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); -}  #else /* !CONFIG_RT_GROUP_SCHED */  static inline void free_rt_sched_group(struct task_group *tg)  { @@ -9668,21 +8370,14 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)  {  	return 1;  } - -static inline void register_rt_sched_group(struct task_group *tg, int cpu) -{ -} - -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) -{ -}  #endif /* CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_GROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED  static void free_sched_group(struct task_group *tg)  {  	free_fair_sched_group(tg);  	free_rt_sched_group(tg); +	autogroup_free(tg);  	kfree(tg);  } @@ -9691,7 +8386,6 @@ struct task_group 
*sched_create_group(struct task_group *parent)  {  	struct task_group *tg;  	unsigned long flags; -	int i;  	tg = kzalloc(sizeof(*tg), GFP_KERNEL);  	if (!tg) @@ -9704,10 +8398,6 @@ struct task_group *sched_create_group(struct task_group *parent)  		goto err;  	spin_lock_irqsave(&task_group_lock, flags); -	for_each_possible_cpu(i) { -		register_fair_sched_group(tg, i); -		register_rt_sched_group(tg, i); -	}  	list_add_rcu(&tg->list, &task_groups);  	WARN_ON(!parent); /* root should already exist */ @@ -9737,11 +8427,11 @@ void sched_destroy_group(struct task_group *tg)  	unsigned long flags;  	int i; -	spin_lock_irqsave(&task_group_lock, flags); -	for_each_possible_cpu(i) { +	/* end participation in shares distribution */ +	for_each_possible_cpu(i)  		unregister_fair_sched_group(tg, i); -		unregister_rt_sched_group(tg, i); -	} + +	spin_lock_irqsave(&task_group_lock, flags);  	list_del_rcu(&tg->list);  	list_del_rcu(&tg->siblings);  	spin_unlock_irqrestore(&task_group_lock, flags); @@ -9763,8 +8453,6 @@ void sched_move_task(struct task_struct *tsk)  	rq = task_rq_lock(tsk, &flags); -	update_rq_clock(rq); -  	running = task_current(rq, tsk);  	on_rq = tsk->se.on_rq; @@ -9773,12 +8461,12 @@ void sched_move_task(struct task_struct *tsk)  	if (unlikely(running))  		tsk->sched_class->put_prev_task(rq, tsk); -	set_task_rq(tsk, task_cpu(tsk)); -  #ifdef CONFIG_FAIR_GROUP_SCHED -	if (tsk->sched_class->moved_group) -		tsk->sched_class->moved_group(tsk); +	if (tsk->sched_class->task_move_group) +		tsk->sched_class->task_move_group(tsk, on_rq); +	else  #endif +		set_task_rq(tsk, task_cpu(tsk));  	if (unlikely(running))  		tsk->sched_class->set_curr_task(rq); @@ -9787,36 +8475,9 @@ void sched_move_task(struct task_struct *tsk)  	task_rq_unlock(rq, &flags);  } -#endif /* CONFIG_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */  #ifdef CONFIG_FAIR_GROUP_SCHED -static void __set_se_shares(struct sched_entity *se, unsigned long shares) -{ -	struct cfs_rq *cfs_rq = se->cfs_rq; -	int 
on_rq; - -	on_rq = se->on_rq; -	if (on_rq) -		dequeue_entity(cfs_rq, se, 0); - -	se->load.weight = shares; -	se->load.inv_weight = 0; - -	if (on_rq) -		enqueue_entity(cfs_rq, se, 0); -} - -static void set_se_shares(struct sched_entity *se, unsigned long shares) -{ -	struct cfs_rq *cfs_rq = se->cfs_rq; -	struct rq *rq = cfs_rq->rq; -	unsigned long flags; - -	spin_lock_irqsave(&rq->lock, flags); -	__set_se_shares(se, shares); -	spin_unlock_irqrestore(&rq->lock, flags); -} -  static DEFINE_MUTEX(shares_mutex);  int sched_group_set_shares(struct task_group *tg, unsigned long shares) @@ -9839,37 +8500,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)  	if (tg->shares == shares)  		goto done; -	spin_lock_irqsave(&task_group_lock, flags); -	for_each_possible_cpu(i) -		unregister_fair_sched_group(tg, i); -	list_del_rcu(&tg->siblings); -	spin_unlock_irqrestore(&task_group_lock, flags); - -	/* wait for any ongoing reference to this group to finish */ -	synchronize_sched(); - -	/* -	 * Now we are free to modify the group's share on each cpu -	 * w/o tripping rebalance_share or load_balance_fair. -	 */  	tg->shares = shares;  	for_each_possible_cpu(i) { -		/* -		 * force a rebalance -		 */ -		cfs_rq_set_shares(tg->cfs_rq[i], 0); -		set_se_shares(tg->se[i], shares); +		struct rq *rq = cpu_rq(i); +		struct sched_entity *se; + +		se = tg->se[i]; +		/* Propagate contribution to hierarchy */ +		raw_spin_lock_irqsave(&rq->lock, flags); +		for_each_sched_entity(se) +			update_cfs_shares(group_cfs_rq(se), 0); +		raw_spin_unlock_irqrestore(&rq->lock, flags);  	} -	/* -	 * Enable load balance activity on this group, by inserting it back on -	 * each cpu's rq->leaf_cfs_rq_list. 
-	 */ -	spin_lock_irqsave(&task_group_lock, flags); -	for_each_possible_cpu(i) -		register_fair_sched_group(tg, i); -	list_add_rcu(&tg->siblings, &tg->parent->children); -	spin_unlock_irqrestore(&task_group_lock, flags);  done:  	mutex_unlock(&shares_mutex);  	return 0; @@ -9929,13 +8572,6 @@ static int tg_schedulable(struct task_group *tg, void *data)  		runtime = d->rt_runtime;  	} -#ifdef CONFIG_USER_SCHED -	if (tg == &root_task_group) { -		period = global_rt_period(); -		runtime = global_rt_runtime(); -	} -#endif -  	/*  	 * Cannot have more runtime than the period.  	 */ @@ -9999,19 +8635,19 @@ static int tg_set_bandwidth(struct task_group *tg,  	if (err)  		goto unlock; -	spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); +	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);  	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);  	tg->rt_bandwidth.rt_runtime = rt_runtime;  	for_each_possible_cpu(i) {  		struct rt_rq *rt_rq = tg->rt_rq[i]; -		spin_lock(&rt_rq->rt_runtime_lock); +		raw_spin_lock(&rt_rq->rt_runtime_lock);  		rt_rq->rt_runtime = rt_runtime; -		spin_unlock(&rt_rq->rt_runtime_lock); +		raw_spin_unlock(&rt_rq->rt_runtime_lock);  	} -	spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); - unlock: +	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); +unlock:  	read_unlock(&tasklist_lock);  	mutex_unlock(&rt_constraints_mutex); @@ -10115,22 +8751,22 @@ static int sched_rt_global_constraints(void)  	if (sysctl_sched_rt_runtime == 0)  		return -EBUSY; -	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); +	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);  	for_each_possible_cpu(i) {  		struct rt_rq *rt_rq = &cpu_rq(i)->rt; -		spin_lock(&rt_rq->rt_runtime_lock); +		raw_spin_lock(&rt_rq->rt_runtime_lock);  		rt_rq->rt_runtime = global_rt_runtime(); -		spin_unlock(&rt_rq->rt_runtime_lock); +		raw_spin_unlock(&rt_rq->rt_runtime_lock);  	} -	spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); +	
raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);  	return 0;  }  #endif /* CONFIG_RT_GROUP_SCHED */  int sched_rt_handler(struct ctl_table *table, int write, -		struct file *filp, void __user *buffer, size_t *lenp, +		void __user *buffer, size_t *lenp,  		loff_t *ppos)  {  	int ret; @@ -10141,7 +8777,7 @@ int sched_rt_handler(struct ctl_table *table, int write,  	old_period = sysctl_sched_rt_period;  	old_runtime = sysctl_sched_rt_runtime; -	ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); +	ret = proc_dointvec(table, write, buffer, lenp, ppos);  	if (!ret && write) {  		ret = sched_rt_global_constraints(); @@ -10175,7 +8811,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)  	if (!cgrp->parent) {  		/* This is early initialization for the top cgroup */ -		return &init_task_group.css; +		return &root_task_group.css;  	}  	parent = cgroup_tg(cgrp->parent); @@ -10195,8 +8831,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)  }  static int -cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, -		      struct task_struct *tsk) +cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)  {  #ifdef CONFIG_RT_GROUP_SCHED  	if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) @@ -10206,15 +8841,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,  	if (tsk->sched_class != &fair_sched_class)  		return -EINVAL;  #endif +	return 0; +} +static int +cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +		      struct task_struct *tsk, bool threadgroup) +{ +	int retval = cpu_cgroup_can_attach_task(cgrp, tsk); +	if (retval) +		return retval; +	if (threadgroup) { +		struct task_struct *c; +		rcu_read_lock(); +		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { +			retval = cpu_cgroup_can_attach_task(cgrp, c); +			if (retval) { +				rcu_read_unlock(); +				return retval; +			} +		} +		rcu_read_unlock(); +	}  	return 0;  }  static void  
cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, -			struct cgroup *old_cont, struct task_struct *tsk) +		  struct cgroup *old_cont, struct task_struct *tsk, +		  bool threadgroup)  {  	sched_move_task(tsk); +	if (threadgroup) { +		struct task_struct *c; +		rcu_read_lock(); +		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { +			sched_move_task(c); +		} +		rcu_read_unlock(); +	}  }  #ifdef CONFIG_FAIR_GROUP_SCHED @@ -10309,7 +8974,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {  struct cpuacct {  	struct cgroup_subsys_state css;  	/* cpuusage holds pointer to a u64-type object on every cpu */ -	u64 *cpuusage; +	u64 __percpu *cpuusage;  	struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];  	struct cpuacct *parent;  }; @@ -10385,9 +9050,9 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)  	/*  	 * Take rq->lock to make 64-bit read safe on 32-bit platforms.  	 */ -	spin_lock_irq(&cpu_rq(cpu)->lock); +	raw_spin_lock_irq(&cpu_rq(cpu)->lock);  	data = *cpuusage; -	spin_unlock_irq(&cpu_rq(cpu)->lock); +	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);  #else  	data = *cpuusage;  #endif @@ -10403,9 +9068,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)  	/*  	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.  	 */ -	spin_lock_irq(&cpu_rq(cpu)->lock); +	raw_spin_lock_irq(&cpu_rq(cpu)->lock);  	*cpuusage = val; -	spin_unlock_irq(&cpu_rq(cpu)->lock); +	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);  #else  	*cpuusage = val;  #endif @@ -10526,12 +9191,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)  }  /* + * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large + * in cputime_t units. As a result, cpuacct_update_stats calls + * percpu_counter_add with values large enough to always overflow the + * per cpu batch limit causing bad SMP scalability. 
+ * + * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we + * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled + * and enabled. We cap it at INT_MAX which is the largest allowed batch value. + */ +#ifdef CONFIG_SMP +#define CPUACCT_BATCH	\ +	min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) +#else +#define CPUACCT_BATCH	0 +#endif + +/*   * Charge the system/user time to the task's accounting group.   */  static void cpuacct_update_stats(struct task_struct *tsk,  		enum cpuacct_stat_index idx, cputime_t val)  {  	struct cpuacct *ca; +	int batch = CPUACCT_BATCH;  	if (unlikely(!cpuacct_subsys.active))  		return; @@ -10540,7 +9223,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,  	ca = task_ca(tsk);  	do { -		percpu_counter_add(&ca->cpustat[idx], val); +		__percpu_counter_add(&ca->cpustat[idx], val, batch);  		ca = ca->parent;  	} while (ca);  	rcu_read_unlock(); @@ -10554,3 +9237,4 @@ struct cgroup_subsys cpuacct_subsys = {  	.subsys_id = cpuacct_subsys_id,  };  #endif	/* CONFIG_CGROUP_CPUACCT */ + | 
