Diffstat (limited to 'kernel/sched/core.c')
 -rw-r--r--  kernel/sched/core.c | 2376
 1 file changed, 1456 insertions(+), 920 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5ac63c9a995..bc1638b3344 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -73,6 +73,7 @@  #include <linux/init_task.h>  #include <linux/binfmts.h>  #include <linux/context_tracking.h> +#include <linux/compiler.h>  #include <asm/switch_to.h>  #include <asm/tlb.h> @@ -89,6 +90,22 @@  #define CREATE_TRACE_POINTS  #include <trace/events/sched.h> +#ifdef smp_mb__before_atomic +void __smp_mb__before_atomic(void) +{ +	smp_mb__before_atomic(); +} +EXPORT_SYMBOL(__smp_mb__before_atomic); +#endif + +#ifdef smp_mb__after_atomic +void __smp_mb__after_atomic(void) +{ +	smp_mb__after_atomic(); +} +EXPORT_SYMBOL(__smp_mb__after_atomic); +#endif +  void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)  {  	unsigned long delta; @@ -296,8 +313,6 @@ __read_mostly int scheduler_running;   */  int sysctl_sched_rt_runtime = 950000; - -  /*   * __task_rq_lock - lock the rq @p resides on.   */ @@ -434,7 +449,7 @@ void hrtick_start(struct rq *rq, u64 delay)  	if (rq == this_rq()) {  		__hrtick_restart(rq);  	} else if (!rq->hrtick_csd_pending) { -		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); +		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);  		rq->hrtick_csd_pending = 1;  	}  } @@ -507,32 +522,98 @@ static inline void init_hrtick(void)  #endif	/* CONFIG_SCHED_HRTICK */  /* + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#define fetch_or(ptr, val)						\ +({	typeof(*(ptr)) __old, __val = *(ptr);				\ + 	for (;;) {							\ + 		__old = cmpxchg((ptr), __val, __val | (val));		\ + 		if (__old == __val)					\ + 			break;						\ + 		__val = __old;						\ + 	}								\ + 	__old;								\ +}) + +#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) +/* + * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, + * this avoids any races wrt polling state changes and thereby avoids + * spurious IPIs. + */ +static bool set_nr_and_not_polling(struct task_struct *p) +{ +	struct thread_info *ti = task_thread_info(p); +	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); +} + +/* + * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. + * + * If this returns true, then the idle task promises to call + * sched_ttwu_pending() and reschedule soon. + */ +static bool set_nr_if_polling(struct task_struct *p) +{ +	struct thread_info *ti = task_thread_info(p); +	typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); + +	for (;;) { +		if (!(val & _TIF_POLLING_NRFLAG)) +			return false; +		if (val & _TIF_NEED_RESCHED) +			return true; +		old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); +		if (old == val) +			break; +		val = old; +	} +	return true; +} + +#else +static bool set_nr_and_not_polling(struct task_struct *p) +{ +	set_tsk_need_resched(p); +	return true; +} + +#ifdef CONFIG_SMP +static bool set_nr_if_polling(struct task_struct *p) +{ +	return false; +} +#endif +#endif + +/*   * resched_task - mark a task 'to be rescheduled now'.   *   * On UP this means the setting of the need_resched flag, on SMP it   * might also involve a cross-CPU call to trigger the scheduler on   * the target CPU.   
*/ -#ifdef CONFIG_SMP  void resched_task(struct task_struct *p)  {  	int cpu; -	assert_raw_spin_locked(&task_rq(p)->lock); +	lockdep_assert_held(&task_rq(p)->lock);  	if (test_tsk_need_resched(p))  		return; -	set_tsk_need_resched(p); -  	cpu = task_cpu(p); -	if (cpu == smp_processor_id()) + +	if (cpu == smp_processor_id()) { +		set_tsk_need_resched(p); +		set_preempt_need_resched();  		return; +	} -	/* NEED_RESCHED must be visible before we test polling */ -	smp_mb(); -	if (!tsk_is_polling(p)) +	if (set_nr_and_not_polling(p))  		smp_send_reschedule(cpu); +	else +		trace_sched_wake_idle_without_ipi(cpu);  }  void resched_cpu(int cpu) @@ -546,6 +627,7 @@ void resched_cpu(int cpu)  	raw_spin_unlock_irqrestore(&rq->lock, flags);  } +#ifdef CONFIG_SMP  #ifdef CONFIG_NO_HZ_COMMON  /*   * In the semi idle case, use the nearest busy cpu for migrating timers @@ -555,12 +637,15 @@ void resched_cpu(int cpu)   * selecting an idle cpu will add more delays to the timers than intended   * (as that cpu's timer base may not be uptodate wrt jiffies etc).   */ -int get_nohz_timer_target(void) +int get_nohz_timer_target(int pinned)  {  	int cpu = smp_processor_id();  	int i;  	struct sched_domain *sd; +	if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) +		return cpu; +  	rcu_read_lock();  	for_each_domain(cpu, sd) {  		for_each_cpu(i, sched_domain_span(sd)) { @@ -591,27 +676,10 @@ static void wake_up_idle_cpu(int cpu)  	if (cpu == smp_processor_id())  		return; -	/* -	 * This is safe, as this function is called with the timer -	 * wheel base lock of (cpu) held. When the CPU is on the way -	 * to idle and has not yet set rq->curr to idle then it will -	 * be serialized on the timer wheel base lock and take the new -	 * timer into account automatically. -	 */ -	if (rq->curr != rq->idle) -		return; - -	/* -	 * We can set TIF_RESCHED on the idle task of the other CPU -	 * lockless. 
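The new fetch_or()/set_nr_and_not_polling() helpers above replace the old sequence of set_tsk_need_resched(), an smp_mb() and a tsk_is_polling() test with a single atomic read-modify-write, so the decision to send an IPI is based on the value the flag word had at the instant NEED_RESCHED was set. A minimal user-space sketch of that idea, using C11 atomics and invented flag names rather than the kernel's thread_info bits:

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for _TIF_NEED_RESCHED / _TIF_POLLING_NRFLAG (illustrative values). */
#define FLAG_NEED_RESCHED	(1u << 0)
#define FLAG_POLLING		(1u << 1)

/*
 * Set NEED_RESCHED and report whether the target was *not* polling,
 * i.e. whether an IPI is still required.  The atomic fetch-or returns
 * the old value, so there is no window between "set" and "test".
 */
static bool set_nr_and_not_polling_demo(atomic_uint *flags)
{
	unsigned int old = atomic_fetch_or(flags, FLAG_NEED_RESCHED);

	return !(old & FLAG_POLLING);
}

int main(void)
{
	atomic_uint flags = FLAG_POLLING;	/* remote CPU is polling in idle */

	if (set_nr_and_not_polling_demo(&flags))
		puts("send reschedule IPI");
	else
		puts("skip IPI: polling CPU will notice NEED_RESCHED on its own");

	return 0;
}
```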
The worst case is that the other CPU runs the -	 * idle task through an additional NOOP schedule() -	 */ -	set_tsk_need_resched(rq->idle); - -	/* NEED_RESCHED must be visible before we test polling */ -	smp_mb(); -	if (!tsk_is_polling(rq->idle)) +	if (set_nr_and_not_polling(rq->idle))  		smp_send_reschedule(cpu); +	else +		trace_sched_wake_idle_without_ipi(cpu);  }  static bool wake_up_full_nohz_cpu(int cpu) @@ -693,12 +761,6 @@ void sched_avg_update(struct rq *rq)  	}  } -#else /* !CONFIG_SMP */ -void resched_task(struct task_struct *p) -{ -	assert_raw_spin_locked(&task_rq(p)->lock); -	set_tsk_need_resched(p); -}  #endif /* CONFIG_SMP */  #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ @@ -767,14 +829,14 @@ static void set_load_weight(struct task_struct *p)  static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)  {  	update_rq_clock(rq); -	sched_info_queued(p); +	sched_info_queued(rq, p);  	p->sched_class->enqueue_task(rq, p, flags);  }  static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)  {  	update_rq_clock(rq); -	sched_info_dequeued(p); +	sched_info_dequeued(rq, p);  	p->sched_class->dequeue_task(rq, p, flags);  } @@ -829,19 +891,13 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)  #endif  #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING  	if (static_key_false((¶virt_steal_rq_enabled))) { -		u64 st; -  		steal = paravirt_steal_clock(cpu_of(rq));  		steal -= rq->prev_steal_time_rq;  		if (unlikely(steal > delta))  			steal = delta; -		st = steal_ticks(steal); -		steal = st * TICK_NSEC; -  		rq->prev_steal_time_rq += steal; -  		delta -= steal;  	}  #endif @@ -849,7 +905,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)  	rq->clock_task += delta;  #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) -	if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) +	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))  		sched_rt_avg_update(rq, irq_delta + steal);  #endif  } @@ -903,7 +959,9 @@ static inline int normal_prio(struct task_struct *p)  {  	int prio; -	if (task_has_rt_policy(p)) +	if (task_has_dl_policy(p)) +		prio = MAX_DL_PRIO-1; +	else if (task_has_rt_policy(p))  		prio = MAX_RT_PRIO-1 - p->rt_priority;  	else  		prio = __normal_prio(p); @@ -949,7 +1007,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,  		if (prev_class->switched_from)  			prev_class->switched_from(rq, p);  		p->sched_class->switched_to(rq, p); -	} else if (oldprio != p->prio) +	} else if (oldprio != p->prio || dl_task(p))  		p->sched_class->prio_changed(rq, p, oldprio);  } @@ -987,7 +1045,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)  	 * ttwu() will sort out the placement.  	 */  	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && -			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); +			!(task_preempt_count(p) & PREEMPT_ACTIVE));  #ifdef CONFIG_LOCKDEP  	/* @@ -1017,6 +1075,108 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)  	__set_task_cpu(p, new_cpu);  } +static void __migrate_swap_task(struct task_struct *p, int cpu) +{ +	if (p->on_rq) { +		struct rq *src_rq, *dst_rq; + +		src_rq = task_rq(p); +		dst_rq = cpu_rq(cpu); + +		deactivate_task(src_rq, p, 0); +		set_task_cpu(p, cpu); +		activate_task(dst_rq, p, 0); +		check_preempt_curr(dst_rq, p, 0); +	} else { +		/* +		 * Task isn't running anymore; make it appear like we migrated +		 * it before it went to sleep. 
This means on wakeup we make the +		 * previous cpu our targer instead of where it really is. +		 */ +		p->wake_cpu = cpu; +	} +} + +struct migration_swap_arg { +	struct task_struct *src_task, *dst_task; +	int src_cpu, dst_cpu; +}; + +static int migrate_swap_stop(void *data) +{ +	struct migration_swap_arg *arg = data; +	struct rq *src_rq, *dst_rq; +	int ret = -EAGAIN; + +	src_rq = cpu_rq(arg->src_cpu); +	dst_rq = cpu_rq(arg->dst_cpu); + +	double_raw_lock(&arg->src_task->pi_lock, +			&arg->dst_task->pi_lock); +	double_rq_lock(src_rq, dst_rq); +	if (task_cpu(arg->dst_task) != arg->dst_cpu) +		goto unlock; + +	if (task_cpu(arg->src_task) != arg->src_cpu) +		goto unlock; + +	if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) +		goto unlock; + +	if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) +		goto unlock; + +	__migrate_swap_task(arg->src_task, arg->dst_cpu); +	__migrate_swap_task(arg->dst_task, arg->src_cpu); + +	ret = 0; + +unlock: +	double_rq_unlock(src_rq, dst_rq); +	raw_spin_unlock(&arg->dst_task->pi_lock); +	raw_spin_unlock(&arg->src_task->pi_lock); + +	return ret; +} + +/* + * Cross migrate two tasks + */ +int migrate_swap(struct task_struct *cur, struct task_struct *p) +{ +	struct migration_swap_arg arg; +	int ret = -EINVAL; + +	arg = (struct migration_swap_arg){ +		.src_task = cur, +		.src_cpu = task_cpu(cur), +		.dst_task = p, +		.dst_cpu = task_cpu(p), +	}; + +	if (arg.src_cpu == arg.dst_cpu) +		goto out; + +	/* +	 * These three tests are all lockless; this is OK since all of them +	 * will be re-checked with proper locks held further down the line. +	 */ +	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) +		goto out; + +	if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) +		goto out; + +	if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) +		goto out; + +	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); +	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); + +out: +	return ret; +} +  struct migration_arg {  	struct task_struct *task;  	int dest_cpu; @@ -1224,7 +1384,7 @@ out:  		 * leave kernel.  		 */  		if (p->mm && printk_ratelimit()) { -			printk_sched("process %d (%s) no longer affine to cpu%d\n", +			printk_deferred("process %d (%s) no longer affine to cpu%d\n",  					task_pid_nr(p), p->comm, cpu);  		}  	} @@ -1236,9 +1396,9 @@ out:   * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.   
*/  static inline -int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)  { -	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); +	cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);  	/*  	 * In order not to call set_task_cpu() on a blocking task we need @@ -1330,12 +1490,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)  	if (rq->idle_stamp) {  		u64 delta = rq_clock(rq) - rq->idle_stamp; -		u64 max = 2*sysctl_sched_migration_cost; +		u64 max = 2*rq->max_idle_balance_cost; + +		update_avg(&rq->avg_idle, delta); -		if (delta > max) +		if (rq->avg_idle > max)  			rq->avg_idle = max; -		else -			update_avg(&rq->avg_idle, delta); +  		rq->idle_stamp = 0;  	}  #endif @@ -1377,13 +1538,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)  }  #ifdef CONFIG_SMP -static void sched_ttwu_pending(void) +void sched_ttwu_pending(void)  {  	struct rq *rq = this_rq();  	struct llist_node *llist = llist_del_all(&rq->wake_list);  	struct task_struct *p; +	unsigned long flags; -	raw_spin_lock(&rq->lock); +	if (!llist) +		return; + +	raw_spin_lock_irqsave(&rq->lock, flags);  	while (llist) {  		p = llist_entry(llist, struct task_struct, wake_entry); @@ -1391,11 +1556,18 @@ static void sched_ttwu_pending(void)  		ttwu_do_activate(rq, p, 0);  	} -	raw_spin_unlock(&rq->lock); +	raw_spin_unlock_irqrestore(&rq->lock, flags);  }  void scheduler_ipi(void)  { +	/* +	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting +	 * TIF_NEED_RESCHED remotely (for the first time) will also send +	 * this IPI. +	 */ +	preempt_fold_need_resched(); +  	if (llist_empty(&this_rq()->wake_list)  			&& !tick_nohz_full_cpu(smp_processor_id())  			&& !got_nohz_idle_kick()) @@ -1430,8 +1602,14 @@ void scheduler_ipi(void)  static void ttwu_queue_remote(struct task_struct *p, int cpu)  { -	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) -		smp_send_reschedule(cpu); +	struct rq *rq = cpu_rq(cpu); + +	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { +		if (!set_nr_if_polling(rq->idle)) +			smp_send_reschedule(cpu); +		else +			trace_sched_wake_idle_without_ipi(cpu); +	}  }  bool cpus_share_cache(int this_cpu, int that_cpu) @@ -1513,7 +1691,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)  	if (p->sched_class->task_waking)  		p->sched_class->task_waking(p); -	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); +	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);  	if (task_cpu(p) != cpu) {  		wake_flags |= WF_MIGRATED;  		set_task_cpu(p, cpu); @@ -1595,7 +1773,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)   *   * __sched_fork() is basic setup used by init_idle() too:   */ -static void __sched_fork(struct task_struct *p) +static void __sched_fork(unsigned long clone_flags, struct task_struct *p)  {  	p->on_rq			= 0; @@ -1611,6 +1789,13 @@ static void __sched_fork(struct task_struct *p)  	memset(&p->se.statistics, 0, sizeof(p->se.statistics));  #endif +	RB_CLEAR_NODE(&p->dl.rb_node); +	hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	p->dl.dl_runtime = p->dl.runtime = 0; +	p->dl.dl_deadline = p->dl.deadline = 0; +	p->dl.dl_period = 0; +	p->dl.flags = 0; +  	INIT_LIST_HEAD(&p->rt.run_list);  #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -1619,16 +1804,26 @@ static void __sched_fork(struct task_struct *p)  #ifdef CONFIG_NUMA_BALANCING  	if (p->mm && 
atomic_read(&p->mm->mm_users) == 1) { -		p->mm->numa_next_scan = jiffies; -		p->mm->numa_next_reset = jiffies; +		p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);  		p->mm->numa_scan_seq = 0;  	} +	if (clone_flags & CLONE_VM) +		p->numa_preferred_nid = current->numa_preferred_nid; +	else +		p->numa_preferred_nid = -1; +  	p->node_stamp = 0ULL;  	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; -	p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;  	p->numa_scan_period = sysctl_numa_balancing_scan_delay;  	p->numa_work.next = &p->numa_work; +	p->numa_faults_memory = NULL; +	p->numa_faults_buffer_memory = NULL; +	p->last_task_numa_placement = 0; +	p->last_sum_exec_runtime = 0; + +	INIT_LIST_HEAD(&p->numa_entry); +	p->numa_group = NULL;  #endif /* CONFIG_NUMA_BALANCING */  } @@ -1649,17 +1844,39 @@ void set_numabalancing_state(bool enabled)  	numabalancing_enabled = enabled;  }  #endif /* CONFIG_SCHED_DEBUG */ -#endif /* CONFIG_NUMA_BALANCING */ + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_numa_balancing(struct ctl_table *table, int write, +			 void __user *buffer, size_t *lenp, loff_t *ppos) +{ +	struct ctl_table t; +	int err; +	int state = numabalancing_enabled; + +	if (write && !capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	t = *table; +	t.data = &state; +	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); +	if (err < 0) +		return err; +	if (write) +		set_numabalancing_state(state); +	return err; +} +#endif +#endif  /*   * fork()/clone()-time setup:   */ -void sched_fork(struct task_struct *p) +int sched_fork(unsigned long clone_flags, struct task_struct *p)  {  	unsigned long flags;  	int cpu = get_cpu(); -	__sched_fork(p); +	__sched_fork(clone_flags, p);  	/*  	 * We mark the process as running here. This guarantees that  	 * nobody will actually run it, and a signal or other external @@ -1676,7 +1893,7 @@ void sched_fork(struct task_struct *p)  	 * Revert to default priority/policy on fork if requested.  	 */  	if (unlikely(p->sched_reset_on_fork)) { -		if (task_has_rt_policy(p)) { +		if (task_has_dl_policy(p) || task_has_rt_policy(p)) {  			p->policy = SCHED_NORMAL;  			p->static_prio = NICE_TO_PRIO(0);  			p->rt_priority = 0; @@ -1693,8 +1910,14 @@ void sched_fork(struct task_struct *p)  		p->sched_reset_on_fork = 0;  	} -	if (!rt_prio(p->prio)) +	if (dl_prio(p->prio)) { +		put_cpu(); +		return -EAGAIN; +	} else if (rt_prio(p->prio)) { +		p->sched_class = &rt_sched_class; +	} else {  		p->sched_class = &fair_sched_class; +	}  	if (p->sched_class->task_fork)  		p->sched_class->task_fork(p); @@ -1717,17 +1940,127 @@ void sched_fork(struct task_struct *p)  #if defined(CONFIG_SMP)  	p->on_cpu = 0;  #endif -#ifdef CONFIG_PREEMPT_COUNT -	/* Want to start with kernel preemption disabled. */ -	task_thread_info(p)->preempt_count = 1; -#endif +	init_task_preempt_count(p);  #ifdef CONFIG_SMP  	plist_node_init(&p->pushable_tasks, MAX_PRIO); +	RB_CLEAR_NODE(&p->pushable_dl_tasks);  #endif  	put_cpu(); +	return 0; +} + +unsigned long to_ratio(u64 period, u64 runtime) +{ +	if (runtime == RUNTIME_INF) +		return 1ULL << 20; + +	/* +	 * Doing this here saves a lot of checks in all +	 * the calling paths, and returning zero seems +	 * safe for them anyway. 
+	 */ +	if (period == 0) +		return 0; + +	return div64_u64(runtime << 20, period); +} + +#ifdef CONFIG_SMP +inline struct dl_bw *dl_bw_of(int i) +{ +	return &cpu_rq(i)->rd->dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ +	struct root_domain *rd = cpu_rq(i)->rd; +	int cpus = 0; + +	for_each_cpu_and(i, rd->span, cpu_active_mask) +		cpus++; + +	return cpus; +} +#else +inline struct dl_bw *dl_bw_of(int i) +{ +	return &cpu_rq(i)->dl.dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ +	return 1; +} +#endif + +static inline +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +{ +	dl_b->total_bw -= tsk_bw; +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +{ +	dl_b->total_bw += tsk_bw; +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ +	return dl_b->bw != -1 && +	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; +} + +/* + * We must be sure that accepting a new task (or allowing changing the + * parameters of an existing one) is consistent with the bandwidth + * constraints. If yes, this function also accordingly updates the currently + * allocated bandwidth to reflect the new situation. + * + * This function is called while holding p's rq->lock. + */ +static int dl_overflow(struct task_struct *p, int policy, +		       const struct sched_attr *attr) +{ + +	struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); +	u64 period = attr->sched_period ?: attr->sched_deadline; +	u64 runtime = attr->sched_runtime; +	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; +	int cpus, err = -1; + +	if (new_bw == p->dl.dl_bw) +		return 0; + +	/* +	 * Either if a task, enters, leave, or stays -deadline but changes +	 * its parameters, we may need to update accordingly the total +	 * allocated bandwidth of the container. +	 */ +	raw_spin_lock(&dl_b->lock); +	cpus = dl_bw_cpus(task_cpu(p)); +	if (dl_policy(policy) && !task_has_dl_policy(p) && +	    !__dl_overflow(dl_b, cpus, 0, new_bw)) { +		__dl_add(dl_b, new_bw); +		err = 0; +	} else if (dl_policy(policy) && task_has_dl_policy(p) && +		   !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { +		__dl_clear(dl_b, p->dl.dl_bw); +		__dl_add(dl_b, new_bw); +		err = 0; +	} else if (!dl_policy(policy) && task_has_dl_policy(p)) { +		__dl_clear(dl_b, p->dl.dl_bw); +		err = 0; +	} +	raw_spin_unlock(&dl_b->lock); + +	return err;  } +extern void init_dl_bw(struct dl_bw *dl_b); +  /*   * wake_up_new_task - wake up a newly created task for the first time.   
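to_ratio() and the __dl_*() helpers above carry SCHED_DEADLINE bandwidths as Q20 fixed-point fractions (runtime over period, scaled by 2^20), and dl_overflow() admits a task only if the root domain still has room. A stand-alone sketch of the same arithmetic with illustrative numbers; the 95% cap mirrors the sysctl_sched_rt_runtime default visible earlier in this diff, and the kernel's special case for an unlimited (-1) bandwidth is omitted here:

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Q20 fixed-point utilization, in the shape of to_ratio(). */
static uint64_t to_ratio_demo(uint64_t period, uint64_t runtime)
{
	if (period == 0)
		return 0;

	return (runtime << 20) / period;
}

/* Admission test in the shape of __dl_overflow(): nonzero means "reject". */
static int dl_overflow_demo(uint64_t max_bw, int cpus, uint64_t total_bw,
			    uint64_t old_bw, uint64_t new_bw)
{
	return max_bw * cpus < total_bw - old_bw + new_bw;
}

int main(void)
{
	/* A task asking for 10ms of runtime every 100ms: utilization ~0.1. */
	uint64_t new_bw = to_ratio_demo(100000000ULL, 10000000ULL);
	/* Per-CPU cap of ~95%, matching the 950000/1000000 us rt defaults. */
	uint64_t max_bw = to_ratio_demo(1000000ULL, 950000ULL);

	printf("new_bw=%" PRIu64 " max_bw=%" PRIu64 " rejected=%d\n",
	       new_bw, max_bw,
	       dl_overflow_demo(max_bw, 4 /* cpus */, 0, 0, new_bw));

	return 0;
}
```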
* @@ -1747,7 +2080,7 @@ void wake_up_new_task(struct task_struct *p)  	 *  - cpus_allowed can change in the fork path  	 *  - any previously selected cpu might disappear through hotplug  	 */ -	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); +	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));  #endif  	/* Initialize new task's runnable average */ @@ -1838,7 +2171,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,  		    struct task_struct *next)  {  	trace_sched_switch(prev, next); -	sched_info_switch(prev, next); +	sched_info_switch(rq, prev, next);  	perf_event_task_sched_out(prev, next);  	fire_sched_out_preempt_notifiers(prev, next);  	prepare_lock_switch(rq, next); @@ -1890,6 +2223,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  	if (mm)  		mmdrop(mm);  	if (unlikely(prev_state == TASK_DEAD)) { +		if (prev->sched_class->task_dead) +			prev->sched_class->task_dead(prev); +  		/*  		 * Remove function-return probe instances associated with this  		 * task and put them back on the free list. @@ -1903,13 +2239,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  #ifdef CONFIG_SMP -/* assumes rq->lock is held */ -static inline void pre_schedule(struct rq *rq, struct task_struct *prev) -{ -	if (prev->sched_class->pre_schedule) -		prev->sched_class->pre_schedule(rq, prev); -} -  /* rq->lock is NOT held, but preemption is disabled */  static inline void post_schedule(struct rq *rq)  { @@ -1927,10 +2256,6 @@ static inline void post_schedule(struct rq *rq)  #else -static inline void pre_schedule(struct rq *rq, struct task_struct *p) -{ -} -  static inline void post_schedule(struct rq *rq)  {  } @@ -1941,7 +2266,7 @@ static inline void post_schedule(struct rq *rq)   * schedule_tail - first thing a freshly forked thread must call.   * @prev: the thread we just switched away from.   */ -asmlinkage void schedule_tail(struct task_struct *prev) +asmlinkage __visible void schedule_tail(struct task_struct *prev)  	__releases(rq->lock)  {  	struct rq *rq = this_rq(); @@ -2073,7 +2398,7 @@ void sched_exec(void)  	int dest_cpu;  	raw_spin_lock_irqsave(&p->pi_lock, flags); -	dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); +	dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);  	if (dest_cpu == smp_processor_id())  		goto unlock; @@ -2140,6 +2465,20 @@ unsigned long long task_sched_runtime(struct task_struct *p)  	struct rq *rq;  	u64 ns = 0; +#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) +	/* +	 * 64-bit doesn't need locks to atomically read a 64bit value. +	 * So we have a optimization chance when the task's delta_exec is 0. +	 * Reading ->on_cpu is racy, but this is ok. +	 * +	 * If we race with it leaving cpu, we'll take a lock. So we're correct. +	 * If we race with it entering cpu, unaccounted time is 0. This is +	 * indistinguishable from the read occurring a few cycles earlier. 
+	 */ +	if (!p->on_cpu) +		return p->se.sum_exec_runtime; +#endif +  	rq = task_rq_lock(p, &flags);  	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);  	task_rq_unlock(rq, p, &flags); @@ -2169,7 +2508,7 @@ void scheduler_tick(void)  #ifdef CONFIG_SMP  	rq->idle_balance = idle_cpu(cpu); -	trigger_load_balance(rq, cpu); +	trigger_load_balance(rq);  #endif  	rq_last_tick_reset(rq);  } @@ -2198,7 +2537,7 @@ u64 scheduler_tick_max_deferment(void)  	if (time_before_eq(next, now))  		return 0; -	return jiffies_to_usecs(next - now) * NSEC_PER_USEC; +	return jiffies_to_nsecs(next - now);  }  #endif @@ -2215,7 +2554,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)  #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \  				defined(CONFIG_PREEMPT_TRACER)) -void __kprobes add_preempt_count(int val) +void preempt_count_add(int val)  {  #ifdef CONFIG_DEBUG_PREEMPT  	/* @@ -2224,7 +2563,7 @@ void __kprobes add_preempt_count(int val)  	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))  		return;  #endif -	preempt_count() += val; +	__preempt_count_add(val);  #ifdef CONFIG_DEBUG_PREEMPT  	/*  	 * Spinlock count overflowing soon? @@ -2232,12 +2571,18 @@ void __kprobes add_preempt_count(int val)  	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=  				PREEMPT_MASK - 10);  #endif -	if (preempt_count() == val) -		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); +	if (preempt_count() == val) { +		unsigned long ip = get_parent_ip(CALLER_ADDR1); +#ifdef CONFIG_DEBUG_PREEMPT +		current->preempt_disable_ip = ip; +#endif +		trace_preempt_off(CALLER_ADDR0, ip); +	}  } -EXPORT_SYMBOL(add_preempt_count); +EXPORT_SYMBOL(preempt_count_add); +NOKPROBE_SYMBOL(preempt_count_add); -void __kprobes sub_preempt_count(int val) +void preempt_count_sub(int val)  {  #ifdef CONFIG_DEBUG_PREEMPT  	/* @@ -2255,9 +2600,10 @@ void __kprobes sub_preempt_count(int val)  	if (preempt_count() == val)  		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); -	preempt_count() -= val; +	__preempt_count_sub(val);  } -EXPORT_SYMBOL(sub_preempt_count); +EXPORT_SYMBOL(preempt_count_sub); +NOKPROBE_SYMBOL(preempt_count_sub);  #endif @@ -2276,6 +2622,13 @@ static noinline void __schedule_bug(struct task_struct *prev)  	print_modules();  	if (irqs_disabled())  		print_irqtrace_events(prev); +#ifdef CONFIG_DEBUG_PREEMPT +	if (in_atomic_preempt_off()) { +		pr_err("Preemption disabled at:"); +		print_ip_sym(current->preempt_disable_ip); +		pr_cont("\n"); +	} +#endif  	dump_stack();  	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);  } @@ -2287,10 +2640,10 @@ static inline void schedule_debug(struct task_struct *prev)  {  	/*  	 * Test if we are atomic. Since do_exit() needs to call into -	 * schedule() atomically, we ignore that path for now. -	 * Otherwise, whine if we are scheduling when we should not be. +	 * schedule() atomically, we ignore that path. Otherwise whine +	 * if we are scheduling when we should not.  	 
*/ -	if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) +	if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))  		__schedule_bug(prev);  	rcu_sleep_check(); @@ -2299,36 +2652,40 @@ static inline void schedule_debug(struct task_struct *prev)  	schedstat_inc(this_rq(), sched_count);  } -static void put_prev_task(struct rq *rq, struct task_struct *prev) -{ -	if (prev->on_rq || rq->skip_clock_update < 0) -		update_rq_clock(rq); -	prev->sched_class->put_prev_task(rq, prev); -} -  /*   * Pick up the highest-prio task:   */  static inline struct task_struct * -pick_next_task(struct rq *rq) +pick_next_task(struct rq *rq, struct task_struct *prev)  { -	const struct sched_class *class; +	const struct sched_class *class = &fair_sched_class;  	struct task_struct *p;  	/*  	 * Optimization: we know that if all tasks are in  	 * the fair class we can call that function directly:  	 */ -	if (likely(rq->nr_running == rq->cfs.h_nr_running)) { -		p = fair_sched_class.pick_next_task(rq); -		if (likely(p)) -			return p; +	if (likely(prev->sched_class == class && +		   rq->nr_running == rq->cfs.h_nr_running)) { +		p = fair_sched_class.pick_next_task(rq, prev); +		if (unlikely(p == RETRY_TASK)) +			goto again; + +		/* assumes fair_sched_class->next == idle_sched_class */ +		if (unlikely(!p)) +			p = idle_sched_class.pick_next_task(rq, prev); + +		return p;  	} +again:  	for_each_class(class) { -		p = class->pick_next_task(rq); -		if (p) +		p = class->pick_next_task(rq, prev); +		if (p) { +			if (unlikely(p == RETRY_TASK)) +				goto again;  			return p; +		}  	}  	BUG(); /* the idle class will always have a runnable task */ @@ -2422,14 +2779,12 @@ need_resched:  		switch_count = &prev->nvcsw;  	} -	pre_schedule(rq, prev); - -	if (unlikely(!rq->nr_running)) -		idle_balance(cpu, rq); +	if (prev->on_rq || rq->skip_clock_update < 0) +		update_rq_clock(rq); -	put_prev_task(rq, prev); -	next = pick_next_task(rq); +	next = pick_next_task(rq, prev);  	clear_tsk_need_resched(prev); +	clear_preempt_need_resched();  	rq->skip_clock_update = 0;  	if (likely(prev != next)) { @@ -2468,7 +2823,7 @@ static inline void sched_submit_work(struct task_struct *tsk)  		blk_schedule_flush_plug(tsk);  } -asmlinkage void __sched schedule(void) +asmlinkage __visible void __sched schedule(void)  {  	struct task_struct *tsk = current; @@ -2478,7 +2833,7 @@ asmlinkage void __sched schedule(void)  EXPORT_SYMBOL(schedule);  #ifdef CONFIG_CONTEXT_TRACKING -asmlinkage void __sched schedule_user(void) +asmlinkage __visible void __sched schedule_user(void)  {  	/*  	 * If we come here after a random call to set_need_resched(), @@ -2510,7 +2865,7 @@ void __sched schedule_preempt_disabled(void)   * off of preempt_enable. Kernel preemptions off return from interrupt   * occur there and call schedule directly.   
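pick_next_task() now takes prev and walks the scheduling classes itself, with a fair-class fast path and a RETRY_TASK return value that restarts the walk when picking changed the runqueue (for example after idle balancing pulled tasks). A toy user-space model of just the class-walk-and-retry part of that flow, with invented callbacks standing in for the real classes:

```c
#include <stddef.h>
#include <stdio.h>

struct task { const char *comm; };

/* Sentinel meaning "the runqueue changed under us, start the walk over". */
#define RETRY_TASK	((struct task *)-1UL)

struct sched_class_demo {
	struct task *(*pick_next_task)(void);
};

static struct task *pick_dl(void)   { return NULL; }
static struct task *pick_rt(void)   { return NULL; }
static struct task *pick_fair(void) { static struct task t = { "cfs_task" }; return &t; }
static struct task *pick_idle(void) { static struct task t = { "swapper" }; return &t; }

/* Highest-priority class first; the idle class always has a task. */
static const struct sched_class_demo classes[] = {
	{ pick_dl }, { pick_rt }, { pick_fair }, { pick_idle },
};

static struct task *pick_next_task_demo(void)
{
	size_t i;

again:
	for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
		struct task *p = classes[i].pick_next_task();

		if (p == RETRY_TASK)
			goto again;	/* picking changed the runqueue */
		if (p)
			return p;
	}
	return NULL;	/* unreachable in the real scheduler */
}

int main(void)
{
	printf("next: %s\n", pick_next_task_demo()->comm);
	return 0;
}
```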
*/ -asmlinkage void __sched notrace preempt_schedule(void) +asmlinkage __visible void __sched notrace preempt_schedule(void)  {  	/*  	 * If there is a non-zero preempt_count or interrupts are disabled, @@ -2520,9 +2875,9 @@ asmlinkage void __sched notrace preempt_schedule(void)  		return;  	do { -		add_preempt_count_notrace(PREEMPT_ACTIVE); +		__preempt_count_add(PREEMPT_ACTIVE);  		__schedule(); -		sub_preempt_count_notrace(PREEMPT_ACTIVE); +		__preempt_count_sub(PREEMPT_ACTIVE);  		/*  		 * Check again in case we missed a preemption opportunity @@ -2531,7 +2886,9 @@ asmlinkage void __sched notrace preempt_schedule(void)  		barrier();  	} while (need_resched());  } +NOKPROBE_SYMBOL(preempt_schedule);  EXPORT_SYMBOL(preempt_schedule); +#endif /* CONFIG_PREEMPT */  /*   * this is the entry point to schedule() from kernel preemption @@ -2539,22 +2896,21 @@ EXPORT_SYMBOL(preempt_schedule);   * Note, that this is called and return with irqs disabled. This will   * protect us against recursive calling from irq.   */ -asmlinkage void __sched preempt_schedule_irq(void) +asmlinkage __visible void __sched preempt_schedule_irq(void)  { -	struct thread_info *ti = current_thread_info();  	enum ctx_state prev_state;  	/* Catch callers which need to be fixed */ -	BUG_ON(ti->preempt_count || !irqs_disabled()); +	BUG_ON(preempt_count() || !irqs_disabled());  	prev_state = exception_enter();  	do { -		add_preempt_count(PREEMPT_ACTIVE); +		__preempt_count_add(PREEMPT_ACTIVE);  		local_irq_enable();  		__schedule();  		local_irq_disable(); -		sub_preempt_count(PREEMPT_ACTIVE); +		__preempt_count_sub(PREEMPT_ACTIVE);  		/*  		 * Check again in case we missed a preemption opportunity @@ -2566,8 +2922,6 @@ asmlinkage void __sched preempt_schedule_irq(void)  	exception_exit(prev_state);  } -#endif /* CONFIG_PREEMPT */ -  int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,  			  void *key)  { @@ -2575,439 +2929,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,  }  EXPORT_SYMBOL(default_wake_function); -/* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just - * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. - * - * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns - * zero in this (rare) case, and we handle it by continuing to scan the queue. - */ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, -			int nr_exclusive, int wake_flags, void *key) -{ -	wait_queue_t *curr, *next; - -	list_for_each_entry_safe(curr, next, &q->task_list, task_list) { -		unsigned flags = curr->flags; - -		if (curr->func(curr, mode, wake_flags, key) && -				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) -			break; -	} -} - -/** - * __wake_up - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: is directly passed to the wakeup function - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. 
- */ -void __wake_up(wait_queue_head_t *q, unsigned int mode, -			int nr_exclusive, void *key) -{ -	unsigned long flags; - -	spin_lock_irqsave(&q->lock, flags); -	__wake_up_common(q, mode, nr_exclusive, 0, key); -	spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(__wake_up); - -/* - * Same as __wake_up but called with the spinlock in wait_queue_head_t held. - */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) -{ -	__wake_up_common(q, mode, nr, 0, NULL); -} -EXPORT_SYMBOL_GPL(__wake_up_locked); - -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) -{ -	__wake_up_common(q, mode, 1, 0, key); -} -EXPORT_SYMBOL_GPL(__wake_up_locked_key); - -/** - * __wake_up_sync_key - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: opaque value to be passed to wakeup targets - * - * The sync wakeup differs that the waker knows that it will schedule - * away soon, so while the target thread will be woken up, it will not - * be migrated to another CPU - ie. the two threads are 'synchronized' - * with each other. This can prevent needless bouncing between CPUs. - * - * On UP it can prevent extra preemption. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, -			int nr_exclusive, void *key) -{ -	unsigned long flags; -	int wake_flags = WF_SYNC; - -	if (unlikely(!q)) -		return; - -	if (unlikely(nr_exclusive != 1)) -		wake_flags = 0; - -	spin_lock_irqsave(&q->lock, flags); -	__wake_up_common(q, mode, nr_exclusive, wake_flags, key); -	spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(__wake_up_sync_key); - -/* - * __wake_up_sync - see __wake_up_sync_key() - */ -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) -{ -	__wake_up_sync_key(q, mode, nr_exclusive, NULL); -} -EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */ - -/** - * complete: - signals a single thread waiting on this completion - * @x:  holds the state of this particular completion - * - * This will wake up a single thread waiting on this completion. Threads will be - * awakened in the same order in which they were queued. - * - * See also complete_all(), wait_for_completion() and related routines. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void complete(struct completion *x) -{ -	unsigned long flags; - -	spin_lock_irqsave(&x->wait.lock, flags); -	x->done++; -	__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); -	spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete); - -/** - * complete_all: - signals all threads waiting on this completion - * @x:  holds the state of this particular completion - * - * This will wake up all threads waiting on this particular completion event. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. 
- */ -void complete_all(struct completion *x) -{ -	unsigned long flags; - -	spin_lock_irqsave(&x->wait.lock, flags); -	x->done += UINT_MAX/2; -	__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); -	spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete_all); - -static inline long __sched -do_wait_for_common(struct completion *x, -		   long (*action)(long), long timeout, int state) -{ -	if (!x->done) { -		DECLARE_WAITQUEUE(wait, current); - -		__add_wait_queue_tail_exclusive(&x->wait, &wait); -		do { -			if (signal_pending_state(state, current)) { -				timeout = -ERESTARTSYS; -				break; -			} -			__set_current_state(state); -			spin_unlock_irq(&x->wait.lock); -			timeout = action(timeout); -			spin_lock_irq(&x->wait.lock); -		} while (!x->done && timeout); -		__remove_wait_queue(&x->wait, &wait); -		if (!x->done) -			return timeout; -	} -	x->done--; -	return timeout ?: 1; -} - -static inline long __sched -__wait_for_common(struct completion *x, -		  long (*action)(long), long timeout, int state) -{ -	might_sleep(); - -	spin_lock_irq(&x->wait.lock); -	timeout = do_wait_for_common(x, action, timeout, state); -	spin_unlock_irq(&x->wait.lock); -	return timeout; -} - -static long __sched -wait_for_common(struct completion *x, long timeout, int state) -{ -	return __wait_for_common(x, schedule_timeout, timeout, state); -} - -static long __sched -wait_for_common_io(struct completion *x, long timeout, int state) -{ -	return __wait_for_common(x, io_schedule_timeout, timeout, state); -} - -/** - * wait_for_completion: - waits for completion of a task - * @x:  holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It is NOT - * interruptible and there is no timeout. - * - * See also similar routines (i.e. wait_for_completion_timeout()) with timeout - * and interrupt capability. Also see complete(). - */ -void __sched wait_for_completion(struct completion *x) -{ -	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion); - -/** - * wait_for_completion_timeout: - waits for completion of a task (w/timeout) - * @x:  holds the state of this particular completion - * @timeout:  timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. - * - * Return: 0 if timed out, and positive (at least 1, or number of jiffies left - * till timeout) if completed. - */ -unsigned long __sched -wait_for_completion_timeout(struct completion *x, unsigned long timeout) -{ -	return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_timeout); - -/** - * wait_for_completion_io: - waits for completion of a task - * @x:  holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It is NOT - * interruptible and there is no timeout. The caller is accounted as waiting - * for IO. - */ -void __sched wait_for_completion_io(struct completion *x) -{ -	wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_io); - -/** - * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) - * @x:  holds the state of this particular completion - * @timeout:  timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. The timeout is in jiffies. 
It is not - * interruptible. The caller is accounted as waiting for IO. - * - * Return: 0 if timed out, and positive (at least 1, or number of jiffies left - * till timeout) if completed. - */ -unsigned long __sched -wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) -{ -	return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_io_timeout); - -/** - * wait_for_completion_interruptible: - waits for completion of a task (w/intr) - * @x:  holds the state of this particular completion - * - * This waits for completion of a specific task to be signaled. It is - * interruptible. - * - * Return: -ERESTARTSYS if interrupted, 0 if completed. - */ -int __sched wait_for_completion_interruptible(struct completion *x) -{ -	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); -	if (t == -ERESTARTSYS) -		return t; -	return 0; -} -EXPORT_SYMBOL(wait_for_completion_interruptible); - -/** - * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) - * @x:  holds the state of this particular completion - * @timeout:  timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. It is interruptible. The timeout is in jiffies. - * - * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, - * or number of jiffies left till timeout) if completed. - */ -long __sched -wait_for_completion_interruptible_timeout(struct completion *x, -					  unsigned long timeout) -{ -	return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); - -/** - * wait_for_completion_killable: - waits for completion of a task (killable) - * @x:  holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It can be - * interrupted by a kill signal. - * - * Return: -ERESTARTSYS if interrupted, 0 if completed. - */ -int __sched wait_for_completion_killable(struct completion *x) -{ -	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); -	if (t == -ERESTARTSYS) -		return t; -	return 0; -} -EXPORT_SYMBOL(wait_for_completion_killable); - -/** - * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) - * @x:  holds the state of this particular completion - * @timeout:  timeout value in jiffies - * - * This waits for either a completion of a specific task to be - * signaled or for a specified timeout to expire. It can be - * interrupted by a kill signal. The timeout is in jiffies. - * - * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, - * or number of jiffies left till timeout) if completed. - */ -long __sched -wait_for_completion_killable_timeout(struct completion *x, -				     unsigned long timeout) -{ -	return wait_for_common(x, timeout, TASK_KILLABLE); -} -EXPORT_SYMBOL(wait_for_completion_killable_timeout); - -/** - *	try_wait_for_completion - try to decrement a completion without blocking - *	@x:	completion structure - * - *	Return: 0 if a decrement cannot be done without blocking - *		 1 if a decrement succeeded. - * - *	If a completion is being used as a counting completion, - *	attempt to decrement the counter without blocking. This - *	enables us to avoid waiting if the resource the completion - *	is protecting is not available. 
- */ -bool try_wait_for_completion(struct completion *x) -{ -	unsigned long flags; -	int ret = 1; - -	spin_lock_irqsave(&x->wait.lock, flags); -	if (!x->done) -		ret = 0; -	else -		x->done--; -	spin_unlock_irqrestore(&x->wait.lock, flags); -	return ret; -} -EXPORT_SYMBOL(try_wait_for_completion); - -/** - *	completion_done - Test to see if a completion has any waiters - *	@x:	completion structure - * - *	Return: 0 if there are waiters (wait_for_completion() in progress) - *		 1 if there are no waiters. - * - */ -bool completion_done(struct completion *x) -{ -	unsigned long flags; -	int ret = 1; - -	spin_lock_irqsave(&x->wait.lock, flags); -	if (!x->done) -		ret = 0; -	spin_unlock_irqrestore(&x->wait.lock, flags); -	return ret; -} -EXPORT_SYMBOL(completion_done); - -static long __sched -sleep_on_common(wait_queue_head_t *q, int state, long timeout) -{ -	unsigned long flags; -	wait_queue_t wait; - -	init_waitqueue_entry(&wait, current); - -	__set_current_state(state); - -	spin_lock_irqsave(&q->lock, flags); -	__add_wait_queue(q, &wait); -	spin_unlock(&q->lock); -	timeout = schedule_timeout(timeout); -	spin_lock_irq(&q->lock); -	__remove_wait_queue(q, &wait); -	spin_unlock_irqrestore(&q->lock, flags); - -	return timeout; -} - -void __sched interruptible_sleep_on(wait_queue_head_t *q) -{ -	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(interruptible_sleep_on); - -long __sched -interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ -	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(interruptible_sleep_on_timeout); - -void __sched sleep_on(wait_queue_head_t *q) -{ -	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(sleep_on); - -long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ -	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(sleep_on_timeout); -  #ifdef CONFIG_RT_MUTEXES  /* @@ -3018,15 +2939,16 @@ EXPORT_SYMBOL(sleep_on_timeout);   * This function changes the 'effective' priority of a task. It does   * not touch ->normal_prio like __setscheduler().   * - * Used by the rt_mutex code to implement priority inheritance logic. + * Used by the rt_mutex code to implement priority inheritance + * logic. Call site only calls if the priority of the task changed.   */  void rt_mutex_setprio(struct task_struct *p, int prio)  { -	int oldprio, on_rq, running; +	int oldprio, on_rq, running, enqueue_flag = 0;  	struct rq *rq;  	const struct sched_class *prev_class; -	BUG_ON(prio < 0 || prio > MAX_PRIO); +	BUG_ON(prio > MAX_PRIO);  	rq = __task_rq_lock(p); @@ -3049,6 +2971,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)  	}  	trace_sched_pi_setprio(p, prio); +	p->pi_top_task = rt_mutex_get_top_task(p);  	oldprio = p->prio;  	prev_class = p->sched_class;  	on_rq = p->on_rq; @@ -3058,30 +2981,56 @@ void rt_mutex_setprio(struct task_struct *p, int prio)  	if (running)  		p->sched_class->put_prev_task(rq, p); -	if (rt_prio(prio)) +	/* +	 * Boosting condition are: +	 * 1. -rt task is running and holds mutex A +	 *      --> -dl task blocks on mutex A +	 * +	 * 2. 
-dl task is running and holds mutex A +	 *      --> -dl task blocks on mutex A and could preempt the +	 *          running task +	 */ +	if (dl_prio(prio)) { +		if (!dl_prio(p->normal_prio) || (p->pi_top_task && +			dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { +			p->dl.dl_boosted = 1; +			p->dl.dl_throttled = 0; +			enqueue_flag = ENQUEUE_REPLENISH; +		} else +			p->dl.dl_boosted = 0; +		p->sched_class = &dl_sched_class; +	} else if (rt_prio(prio)) { +		if (dl_prio(oldprio)) +			p->dl.dl_boosted = 0; +		if (oldprio < prio) +			enqueue_flag = ENQUEUE_HEAD;  		p->sched_class = &rt_sched_class; -	else +	} else { +		if (dl_prio(oldprio)) +			p->dl.dl_boosted = 0;  		p->sched_class = &fair_sched_class; +	}  	p->prio = prio;  	if (running)  		p->sched_class->set_curr_task(rq);  	if (on_rq) -		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); +		enqueue_task(rq, p, enqueue_flag);  	check_class_changed(rq, p, prev_class, oldprio);  out_unlock:  	__task_rq_unlock(rq);  }  #endif +  void set_user_nice(struct task_struct *p, long nice)  {  	int old_prio, delta, on_rq;  	unsigned long flags;  	struct rq *rq; -	if (TASK_NICE(p) == nice || nice < -20 || nice > 19) +	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)  		return;  	/*  	 * We have to be careful, if called from sys_setpriority(), @@ -3092,9 +3041,9 @@ void set_user_nice(struct task_struct *p, long nice)  	 * The RT priorities are set via sched_setscheduler(), but we still  	 * allow the 'normal' nice value to be set - but as expected  	 * it wont have any effect on scheduling until the task is -	 * SCHED_FIFO/SCHED_RR: +	 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:  	 */ -	if (task_has_rt_policy(p)) { +	if (task_has_dl_policy(p) || task_has_rt_policy(p)) {  		p->static_prio = NICE_TO_PRIO(nice);  		goto out_unlock;  	} @@ -3130,7 +3079,7 @@ EXPORT_SYMBOL(set_user_nice);  int can_nice(const struct task_struct *p, const int nice)  {  	/* convert nice value [19,-20] to rlimit style value [1,40] */ -	int nice_rlim = 20 - nice; +	int nice_rlim = nice_to_rlimit(nice);  	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||  		capable(CAP_SYS_NICE)); @@ -3154,17 +3103,10 @@ SYSCALL_DEFINE1(nice, int, increment)  	 * We don't have to worry. Conceptually one call occurs first  	 * and we have a single winner.  	 */ -	if (increment < -40) -		increment = -40; -	if (increment > 40) -		increment = 40; - -	nice = TASK_NICE(current) + increment; -	if (nice < -20) -		nice = -20; -	if (nice > 19) -		nice = 19; +	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); +	nice = task_nice(current) + increment; +	nice = clamp_val(nice, MIN_NICE, MAX_NICE);  	if (increment < 0 && !can_nice(current, nice))  		return -EPERM; @@ -3192,18 +3134,6 @@ int task_prio(const struct task_struct *p)  }  /** - * task_nice - return the nice value of a given task. - * @p: the task in question. - * - * Return: The nice value [ -20 ... 0 ... 19 ]. - */ -int task_nice(const struct task_struct *p) -{ -	return TASK_NICE(p); -} -EXPORT_SYMBOL(task_nice); - -/**   * idle_cpu - is a given cpu idle currently?   * @cpu: the processor in question.   * @@ -3249,20 +3179,126 @@ static struct task_struct *find_process_by_pid(pid_t pid)  	return pid ? find_task_by_vpid(pid) : current;  } -/* Actually do priority change: must hold rq lock. */ +/* + * This function initializes the sched_dl_entity of a newly becoming + * SCHED_DEADLINE task. 
+ * + * Only the static values are considered here, the actual runtime and the + * absolute deadline will be properly calculated when the task is enqueued + * for the first time with its new policy. + */  static void -__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +__setparam_dl(struct task_struct *p, const struct sched_attr *attr)  { +	struct sched_dl_entity *dl_se = &p->dl; + +	init_dl_task_timer(dl_se); +	dl_se->dl_runtime = attr->sched_runtime; +	dl_se->dl_deadline = attr->sched_deadline; +	dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; +	dl_se->flags = attr->sched_flags; +	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); +	dl_se->dl_throttled = 0; +	dl_se->dl_new = 1; +	dl_se->dl_yielded = 0; +} + +static void __setscheduler_params(struct task_struct *p, +		const struct sched_attr *attr) +{ +	int policy = attr->sched_policy; + +	if (policy == -1) /* setparam */ +		policy = p->policy; +  	p->policy = policy; -	p->rt_priority = prio; + +	if (dl_policy(policy)) +		__setparam_dl(p, attr); +	else if (fair_policy(policy)) +		p->static_prio = NICE_TO_PRIO(attr->sched_nice); + +	/* +	 * __sched_setscheduler() ensures attr->sched_priority == 0 when +	 * !rt_policy. Always setting this ensures that things like +	 * getparam()/getattr() don't report silly values for !rt tasks. +	 */ +	p->rt_priority = attr->sched_priority;  	p->normal_prio = normal_prio(p); -	/* we are holding p->pi_lock already */ -	p->prio = rt_mutex_getprio(p); -	if (rt_prio(p->prio)) +	set_load_weight(p); +} + +/* Actually do priority change: must hold pi & rq lock. */ +static void __setscheduler(struct rq *rq, struct task_struct *p, +			   const struct sched_attr *attr) +{ +	__setscheduler_params(p, attr); + +	/* +	 * If we get here, there was no pi waiters boosting the +	 * task. It is safe to use the normal prio. +	 */ +	p->prio = normal_prio(p); + +	if (dl_prio(p->prio)) +		p->sched_class = &dl_sched_class; +	else if (rt_prio(p->prio))  		p->sched_class = &rt_sched_class;  	else  		p->sched_class = &fair_sched_class; -	set_load_weight(p); +} + +static void +__getparam_dl(struct task_struct *p, struct sched_attr *attr) +{ +	struct sched_dl_entity *dl_se = &p->dl; + +	attr->sched_priority = p->rt_priority; +	attr->sched_runtime = dl_se->dl_runtime; +	attr->sched_deadline = dl_se->dl_deadline; +	attr->sched_period = dl_se->dl_period; +	attr->sched_flags = dl_se->flags; +} + +/* + * This function validates the new parameters of a -deadline task. + * We ask for the deadline not being zero, and greater or equal + * than the runtime, as well as the period of being zero or + * greater than deadline. Furthermore, we have to be sure that + * user parameters are above the internal resolution of 1us (we + * check sched_runtime only since it is always the smaller one) and + * below 2^63 ns (we have to check both sched_deadline and + * sched_period, as the latter can be zero). + */ +static bool +__checkparam_dl(const struct sched_attr *attr) +{ +	/* deadline != 0 */ +	if (attr->sched_deadline == 0) +		return false; + +	/* +	 * Since we truncate DL_SCALE bits, make sure we're at least +	 * that big. +	 */ +	if (attr->sched_runtime < (1ULL << DL_SCALE)) +		return false; + +	/* +	 * Since we use the MSB for wrap-around and sign issues, make +	 * sure it's not set (mind that period can be equal to zero). 
+	 */ +	if (attr->sched_deadline & (1ULL << 63) || +	    attr->sched_period & (1ULL << 63)) +		return false; + +	/* runtime <= deadline <= period (if period != 0) */ +	if ((attr->sched_period != 0 && +	     attr->sched_period < attr->sched_deadline) || +	    attr->sched_deadline < attr->sched_runtime) +		return false; + +	return true;  }  /* @@ -3281,10 +3317,14 @@ static bool check_same_owner(struct task_struct *p)  	return match;  } -static int __sched_setscheduler(struct task_struct *p, int policy, -				const struct sched_param *param, bool user) +static int __sched_setscheduler(struct task_struct *p, +				const struct sched_attr *attr, +				bool user)  { +	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : +		      MAX_RT_PRIO - 1 - attr->sched_priority;  	int retval, oldprio, oldpolicy = -1, on_rq, running; +	int policy = attr->sched_policy;  	unsigned long flags;  	const struct sched_class *prev_class;  	struct rq *rq; @@ -3298,31 +3338,40 @@ recheck:  		reset_on_fork = p->sched_reset_on_fork;  		policy = oldpolicy = p->policy;  	} else { -		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); -		policy &= ~SCHED_RESET_ON_FORK; +		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); -		if (policy != SCHED_FIFO && policy != SCHED_RR && +		if (policy != SCHED_DEADLINE && +				policy != SCHED_FIFO && policy != SCHED_RR &&  				policy != SCHED_NORMAL && policy != SCHED_BATCH &&  				policy != SCHED_IDLE)  			return -EINVAL;  	} +	if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) +		return -EINVAL; +  	/*  	 * Valid priorities for SCHED_FIFO and SCHED_RR are  	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,  	 * SCHED_BATCH and SCHED_IDLE is 0.  	 */ -	if (param->sched_priority < 0 || -	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || -	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) +	if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || +	    (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))  		return -EINVAL; -	if (rt_policy(policy) != (param->sched_priority != 0)) +	if ((dl_policy(policy) && !__checkparam_dl(attr)) || +	    (rt_policy(policy) != (attr->sched_priority != 0)))  		return -EINVAL;  	/*  	 * Allow unprivileged RT tasks to decrease priority:  	 */  	if (user && !capable(CAP_SYS_NICE)) { +		if (fair_policy(policy)) { +			if (attr->sched_nice < task_nice(p) && +			    !can_nice(p, attr->sched_nice)) +				return -EPERM; +		} +  		if (rt_policy(policy)) {  			unsigned long rlim_rtprio =  					task_rlimit(p, RLIMIT_RTPRIO); @@ -3332,17 +3381,26 @@ recheck:  				return -EPERM;  			/* can't increase priority */ -			if (param->sched_priority > p->rt_priority && -			    param->sched_priority > rlim_rtprio) +			if (attr->sched_priority > p->rt_priority && +			    attr->sched_priority > rlim_rtprio)  				return -EPERM;  		} +		 /* +		  * Can't set/change SCHED_DEADLINE policy at all for now +		  * (safest behavior); in the future we would like to allow +		  * unprivileged DL tasks to increase their relative deadline +		  * or reduce their runtime (both ways reducing utilization) +		  */ +		if (dl_policy(policy)) +			return -EPERM; +  		/*  		 * Treat SCHED_IDLE as nice 20. Only allow a switch to  		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.  		 
*/  		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { -			if (!can_nice(p, TASK_NICE(p))) +			if (!can_nice(p, task_nice(p)))  				return -EPERM;  		} @@ -3379,16 +3437,25 @@ recheck:  	}  	/* -	 * If not changing anything there's no need to proceed further: +	 * If not changing anything there's no need to proceed further, +	 * but store a possible modification of reset_on_fork.  	 */ -	if (unlikely(policy == p->policy && (!rt_policy(policy) || -			param->sched_priority == p->rt_priority))) { +	if (unlikely(policy == p->policy)) { +		if (fair_policy(policy) && attr->sched_nice != task_nice(p)) +			goto change; +		if (rt_policy(policy) && attr->sched_priority != p->rt_priority) +			goto change; +		if (dl_policy(policy)) +			goto change; + +		p->sched_reset_on_fork = reset_on_fork;  		task_rq_unlock(rq, p, &flags);  		return 0;  	} +change: -#ifdef CONFIG_RT_GROUP_SCHED  	if (user) { +#ifdef CONFIG_RT_GROUP_SCHED  		/*  		 * Do not allow realtime tasks into groups that have no runtime  		 * assigned. @@ -3399,8 +3466,24 @@ recheck:  			task_rq_unlock(rq, p, &flags);  			return -EPERM;  		} -	}  #endif +#ifdef CONFIG_SMP +		if (dl_bandwidth_enabled() && dl_policy(policy)) { +			cpumask_t *span = rq->rd->span; + +			/* +			 * Don't allow tasks with an affinity mask smaller than +			 * the entire root_domain to become SCHED_DEADLINE. We +			 * will also fail if there's no bandwidth available. +			 */ +			if (!cpumask_subset(span, &p->cpus_allowed) || +			    rq->rd->dl_bw.bw == 0) { +				task_rq_unlock(rq, p, &flags); +				return -EPERM; +			} +		} +#endif +	}  	/* recheck policy now with rq lock held */  	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { @@ -3408,6 +3491,35 @@ recheck:  		task_rq_unlock(rq, p, &flags);  		goto recheck;  	} + +	/* +	 * If setscheduling to SCHED_DEADLINE (or changing the parameters +	 * of a SCHED_DEADLINE task) we need to check if enough bandwidth +	 * is available. +	 */ +	if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { +		task_rq_unlock(rq, p, &flags); +		return -EBUSY; +	} + +	p->sched_reset_on_fork = reset_on_fork; +	oldprio = p->prio; + +	/* +	 * Special case for priority boosted tasks. +	 * +	 * If the new priority is lower or equal (user space view) +	 * than the current (boosted) priority, we just store the new +	 * normal parameters and do not touch the scheduler class and +	 * the runqueue. This will be done when the task deboost +	 * itself. +	 */ +	if (rt_mutex_check_prio(p, newprio)) { +		__setscheduler_params(p, attr); +		task_rq_unlock(rq, p, &flags); +		return 0; +	} +  	on_rq = p->on_rq;  	running = task_current(rq, p);  	if (on_rq) @@ -3415,16 +3527,18 @@ recheck:  	if (running)  		p->sched_class->put_prev_task(rq, p); -	p->sched_reset_on_fork = reset_on_fork; - -	oldprio = p->prio;  	prev_class = p->sched_class; -	__setscheduler(rq, p, policy, param->sched_priority); +	__setscheduler(rq, p, attr);  	if (running)  		p->sched_class->set_curr_task(rq); -	if (on_rq) -		enqueue_task(rq, p, 0); +	if (on_rq) { +		/* +		 * We enqueue to tail when the priority of a task is +		 * increased (user space view). +		 */ +		enqueue_task(rq, p, oldprio <= p->prio ? 
ENQUEUE_HEAD : 0); +	}  	check_class_changed(rq, p, prev_class, oldprio);  	task_rq_unlock(rq, p, &flags); @@ -3434,6 +3548,26 @@ recheck:  	return 0;  } +static int _sched_setscheduler(struct task_struct *p, int policy, +			       const struct sched_param *param, bool check) +{ +	struct sched_attr attr = { +		.sched_policy   = policy, +		.sched_priority = param->sched_priority, +		.sched_nice	= PRIO_TO_NICE(p->static_prio), +	}; + +	/* +	 * Fixup the legacy SCHED_RESET_ON_FORK hack +	 */ +	if (policy & SCHED_RESET_ON_FORK) { +		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; +		policy &= ~SCHED_RESET_ON_FORK; +		attr.sched_policy = policy; +	} + +	return __sched_setscheduler(p, &attr, check); +}  /**   * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.   * @p: the task in question. @@ -3447,10 +3581,16 @@ recheck:  int sched_setscheduler(struct task_struct *p, int policy,  		       const struct sched_param *param)  { -	return __sched_setscheduler(p, policy, param, true); +	return _sched_setscheduler(p, policy, param, true);  }  EXPORT_SYMBOL_GPL(sched_setscheduler); +int sched_setattr(struct task_struct *p, const struct sched_attr *attr) +{ +	return __sched_setscheduler(p, attr, true); +} +EXPORT_SYMBOL_GPL(sched_setattr); +  /**   * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.   * @p: the task in question. @@ -3467,7 +3607,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);  int sched_setscheduler_nocheck(struct task_struct *p, int policy,  			       const struct sched_param *param)  { -	return __sched_setscheduler(p, policy, param, false); +	return _sched_setscheduler(p, policy, param, false);  }  static int @@ -3492,6 +3632,77 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)  	return retval;  } +/* + * Mimics kernel/events/core.c perf_copy_attr(). + */ +static int sched_copy_attr(struct sched_attr __user *uattr, +			   struct sched_attr *attr) +{ +	u32 size; +	int ret; + +	if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) +		return -EFAULT; + +	/* +	 * zero the full structure, so that a short copy will be nice. +	 */ +	memset(attr, 0, sizeof(*attr)); + +	ret = get_user(size, &uattr->size); +	if (ret) +		return ret; + +	if (size > PAGE_SIZE)	/* silly large */ +		goto err_size; + +	if (!size)		/* abi compat */ +		size = SCHED_ATTR_SIZE_VER0; + +	if (size < SCHED_ATTR_SIZE_VER0) +		goto err_size; + +	/* +	 * If we're handed a bigger struct than we know of, +	 * ensure all the unknown bits are 0 - i.e. new +	 * user-space does not rely on any kernel feature +	 * extensions we dont know about yet. +	 */ +	if (size > sizeof(*attr)) { +		unsigned char __user *addr; +		unsigned char __user *end; +		unsigned char val; + +		addr = (void __user *)uattr + sizeof(*attr); +		end  = (void __user *)uattr + size; + +		for (; addr < end; addr++) { +			ret = get_user(val, addr); +			if (ret) +				return ret; +			if (val) +				goto err_size; +		} +		size = sizeof(*attr); +	} + +	ret = copy_from_user(attr, uattr, size); +	if (ret) +		return -EFAULT; + +	/* +	 * XXX: do we want to be lenient like existing syscalls; or do we want +	 * to be strict and return an error on out-of-bounds values? +	 */ +	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); + +	return 0; + +err_size: +	put_user(sizeof(*attr), &uattr->size); +	return -E2BIG; +} +  /**   * sys_sched_setscheduler - set/change the scheduler policy and RT priority   * @pid: the pid in question. 
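The hunks above funnel both the legacy sched_setscheduler() path and the new sched_setattr() interface through __sched_setscheduler(), with __checkparam_dl() enforcing runtime <= deadline <= period for SCHED_DEADLINE and sched_copy_attr() handling the size-based ABI compatibility. A minimal user-space sketch of a caller of the new syscall follows; it assumes the sched_attr layout introduced by this series and an architecture that defines __NR_sched_setattr (glibc ships no wrapper, so the raw syscall is used), and the task needs CAP_SYS_NICE plus parameters that pass the deadline admission test:

/*
 * Hedged sketch, not part of the patch: the struct layout and the
 * SCHED_DEADLINE value are taken from this series' uapi additions;
 * __NR_sched_setattr is arch-specific and assumed to be available.
 */
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE	6
#endif

struct sched_attr {
	uint32_t size;			/* for fwd/bwd ABI compatibility */
	uint32_t sched_policy;
	uint64_t sched_flags;		/* e.g. SCHED_FLAG_RESET_ON_FORK */
	int32_t  sched_nice;		/* SCHED_NORMAL, SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO, SCHED_RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE, nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size	    = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	/* __checkparam_dl(): runtime <= deadline <= period, sign bit clear */
	attr.sched_runtime  =  10ULL * 1000 * 1000;	/*  10 ms */
	attr.sched_deadline =  30ULL * 1000 * 1000;	/*  30 ms */
	attr.sched_period   = 100ULL * 1000 * 1000;	/* 100 ms */

	/* pid 0 means the calling thread; the last argument is the flags word */
	if (syscall(__NR_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");	/* EPERM, EINVAL or EBUSY */

	return 0;
}

An EBUSY failure here corresponds to the dl_overflow() admission check in __sched_setscheduler(): the requested runtime/period utilization did not fit in the root domain's dl_bw budget.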
@@ -3523,6 +3734,39 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)  }  /** + * sys_sched_setattr - same as above, but with extended sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @flags: for future extension. + */ +SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, +			       unsigned int, flags) +{ +	struct sched_attr attr; +	struct task_struct *p; +	int retval; + +	if (!uattr || pid < 0 || flags) +		return -EINVAL; + +	retval = sched_copy_attr(uattr, &attr); +	if (retval) +		return retval; + +	if ((int)attr.sched_policy < 0) +		return -EINVAL; + +	rcu_read_lock(); +	retval = -ESRCH; +	p = find_process_by_pid(pid); +	if (p != NULL) +		retval = sched_setattr(p, &attr); +	rcu_read_unlock(); + +	return retval; +} + +/**   * sys_sched_getscheduler - get the policy (scheduling class) of a thread   * @pid: the pid in question.   * @@ -3560,7 +3804,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)   */  SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)  { -	struct sched_param lp; +	struct sched_param lp = { .sched_priority = 0 };  	struct task_struct *p;  	int retval; @@ -3577,7 +3821,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)  	if (retval)  		goto out_unlock; -	lp.sched_priority = p->rt_priority; +	if (task_has_rt_policy(p)) +		lp.sched_priority = p->rt_priority;  	rcu_read_unlock();  	/* @@ -3592,19 +3837,103 @@ out_unlock:  	return retval;  } +static int sched_read_attr(struct sched_attr __user *uattr, +			   struct sched_attr *attr, +			   unsigned int usize) +{ +	int ret; + +	if (!access_ok(VERIFY_WRITE, uattr, usize)) +		return -EFAULT; + +	/* +	 * If we're handed a smaller struct than we know of, +	 * ensure all the unknown bits are 0 - i.e. old +	 * user-space does not get uncomplete information. +	 */ +	if (usize < sizeof(*attr)) { +		unsigned char *addr; +		unsigned char *end; + +		addr = (void *)attr + usize; +		end  = (void *)attr + sizeof(*attr); + +		for (; addr < end; addr++) { +			if (*addr) +				return -EFBIG; +		} + +		attr->size = usize; +	} + +	ret = copy_to_user(uattr, attr, attr->size); +	if (ret) +		return -EFAULT; + +	return 0; +} + +/** + * sys_sched_getattr - similar to sched_getparam, but with sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @size: sizeof(attr) for fwd/bwd comp. + * @flags: for future extension. 
+ */ +SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, +		unsigned int, size, unsigned int, flags) +{ +	struct sched_attr attr = { +		.size = sizeof(struct sched_attr), +	}; +	struct task_struct *p; +	int retval; + +	if (!uattr || pid < 0 || size > PAGE_SIZE || +	    size < SCHED_ATTR_SIZE_VER0 || flags) +		return -EINVAL; + +	rcu_read_lock(); +	p = find_process_by_pid(pid); +	retval = -ESRCH; +	if (!p) +		goto out_unlock; + +	retval = security_task_getscheduler(p); +	if (retval) +		goto out_unlock; + +	attr.sched_policy = p->policy; +	if (p->sched_reset_on_fork) +		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; +	if (task_has_dl_policy(p)) +		__getparam_dl(p, &attr); +	else if (task_has_rt_policy(p)) +		attr.sched_priority = p->rt_priority; +	else +		attr.sched_nice = task_nice(p); + +	rcu_read_unlock(); + +	retval = sched_read_attr(uattr, &attr, size); +	return retval; + +out_unlock: +	rcu_read_unlock(); +	return retval; +} +  long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)  {  	cpumask_var_t cpus_allowed, new_mask;  	struct task_struct *p;  	int retval; -	get_online_cpus();  	rcu_read_lock();  	p = find_process_by_pid(pid);  	if (!p) {  		rcu_read_unlock(); -		put_online_cpus();  		return -ESRCH;  	} @@ -3638,8 +3967,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)  	if (retval)  		goto out_unlock; +  	cpuset_cpus_allowed(p, cpus_allowed);  	cpumask_and(new_mask, in_mask, cpus_allowed); + +	/* +	 * Since bandwidth control happens on root_domain basis, +	 * if admission test is enabled, we only admit -deadline +	 * tasks allowed to run on all the CPUs in the task's +	 * root_domain. +	 */ +#ifdef CONFIG_SMP +	if (task_has_dl_policy(p)) { +		const struct cpumask *span = task_rq(p)->rd->span; + +		if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { +			retval = -EBUSY; +			goto out_unlock; +		} +	} +#endif  again:  	retval = set_cpus_allowed_ptr(p, new_mask); @@ -3661,7 +4008,6 @@ out_free_cpus_allowed:  	free_cpumask_var(cpus_allowed);  out_put_task:  	put_task_struct(p); -	put_online_cpus();  	return retval;  } @@ -3706,7 +4052,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)  	unsigned long flags;  	int retval; -	get_online_cpus();  	rcu_read_lock();  	retval = -ESRCH; @@ -3719,12 +4064,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)  		goto out_unlock;  	raw_spin_lock_irqsave(&p->pi_lock, flags); -	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); +	cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);  	raw_spin_unlock_irqrestore(&p->pi_lock, flags);  out_unlock:  	rcu_read_unlock(); -	put_online_cpus();  	return retval;  } @@ -3794,16 +4138,11 @@ SYSCALL_DEFINE0(sched_yield)  	return 0;  } -static inline int should_resched(void) -{ -	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); -} -  static void __cond_resched(void)  { -	add_preempt_count(PREEMPT_ACTIVE); +	__preempt_count_add(PREEMPT_ACTIVE);  	__schedule(); -	sub_preempt_count(PREEMPT_ACTIVE); +	__preempt_count_sub(PREEMPT_ACTIVE);  }  int __sched _cond_resched(void) @@ -3902,7 +4241,7 @@ EXPORT_SYMBOL(yield);   *	false (0) if we failed to boost the target.   *	-ESRCH if there's no task to yield to.   
*/ -bool __sched yield_to(struct task_struct *p, bool preempt) +int __sched yield_to(struct task_struct *p, bool preempt)  {  	struct task_struct *curr = current;  	struct rq *rq, *p_rq; @@ -3924,7 +4263,7 @@ again:  	}  	double_rq_lock(rq, p_rq); -	while (task_rq(p) != p_rq) { +	if (task_rq(p) != p_rq) {  		double_rq_unlock(rq, p_rq);  		goto again;  	} @@ -4013,6 +4352,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)  	case SCHED_RR:  		ret = MAX_USER_RT_PRIO-1;  		break; +	case SCHED_DEADLINE:  	case SCHED_NORMAL:  	case SCHED_BATCH:  	case SCHED_IDLE: @@ -4039,6 +4379,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)  	case SCHED_RR:  		ret = 1;  		break; +	case SCHED_DEADLINE:  	case SCHED_NORMAL:  	case SCHED_BATCH:  	case SCHED_IDLE: @@ -4082,7 +4423,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,  		goto out_unlock;  	rq = task_rq_lock(p, &flags); -	time_slice = p->sched_class->get_rr_interval(rq, p); +	time_slice = 0; +	if (p->sched_class->get_rr_interval) +		time_slice = p->sched_class->get_rr_interval(rq, p);  	task_rq_unlock(rq, p, &flags);  	rcu_read_unlock(); @@ -4186,7 +4529,7 @@ void init_idle(struct task_struct *idle, int cpu)  	raw_spin_lock_irqsave(&rq->lock, flags); -	__sched_fork(idle); +	__sched_fork(0, idle);  	idle->state = TASK_RUNNING;  	idle->se.exec_start = sched_clock(); @@ -4206,13 +4549,14 @@ void init_idle(struct task_struct *idle, int cpu)  	rcu_read_unlock();  	rq->curr = rq->idle = idle; +	idle->on_rq = 1;  #if defined(CONFIG_SMP)  	idle->on_cpu = 1;  #endif  	raw_spin_unlock_irqrestore(&rq->lock, flags);  	/* Set the preempt count _outside_ the spinlocks! */ -	task_thread_info(idle)->preempt_count = 0; +	init_idle_preempt_count(idle, cpu);  	/*  	 * The idle tasks have their own, simple scheduling class: @@ -4346,6 +4690,54 @@ fail:  	return ret;  } +#ifdef CONFIG_NUMA_BALANCING +/* Migrate current task p to target_cpu */ +int migrate_task_to(struct task_struct *p, int target_cpu) +{ +	struct migration_arg arg = { p, target_cpu }; +	int curr_cpu = task_cpu(p); + +	if (curr_cpu == target_cpu) +		return 0; + +	if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) +		return -EINVAL; + +	/* TODO: This is not properly updating schedstats */ + +	trace_sched_move_numa(p, curr_cpu, target_cpu); +	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); +} + +/* + * Requeue a task on a given node and accurately track the number of NUMA + * tasks on the runqueues + */ +void sched_setnuma(struct task_struct *p, int nid) +{ +	struct rq *rq; +	unsigned long flags; +	bool on_rq, running; + +	rq = task_rq_lock(p, &flags); +	on_rq = p->on_rq; +	running = task_current(rq, p); + +	if (on_rq) +		dequeue_task(rq, p, 0); +	if (running) +		p->sched_class->put_prev_task(rq, p); + +	p->numa_preferred_nid = nid; + +	if (running) +		p->sched_class->set_curr_task(rq); +	if (on_rq) +		enqueue_task(rq, p, 0); +	task_rq_unlock(rq, p, &flags); +} +#endif +  /*   * migration_cpu_stop - this will be executed by a highprio stopper thread   * and performs thread migration by bumping thread off CPU then @@ -4377,8 +4769,10 @@ void idle_task_exit(void)  	BUG_ON(cpu_online(smp_processor_id())); -	if (mm != &init_mm) +	if (mm != &init_mm) {  		switch_mm(mm, &init_mm, current); +		finish_arch_post_lock_switch(); +	}  	mmdrop(mm);  } @@ -4396,6 +4790,22 @@ static void calc_load_migrate(struct rq *rq)  		atomic_long_add(delta, &calc_load_tasks);  } +static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) +{ +} + +static const struct sched_class 
fake_sched_class = { +	.put_prev_task = put_prev_task_fake, +}; + +static struct task_struct fake_task = { +	/* +	 * Avoid pull_{rt,dl}_task() +	 */ +	.prio = MAX_PRIO + 1, +	.sched_class = &fake_sched_class, +}; +  /*   * Migrate all tasks from the rq, sleeping tasks will be migrated by   * try_to_wake_up()->select_task_rq(). @@ -4436,7 +4846,7 @@ static void migrate_tasks(unsigned int dead_cpu)  		if (rq->nr_running == 1)  			break; -		next = pick_next_task(rq); +		next = pick_next_task(rq, &fake_task);  		BUG_ON(!next);  		next->sched_class->put_prev_task(rq, next); @@ -4526,7 +4936,7 @@ set_table_entry(struct ctl_table *entry,  static struct ctl_table *  sd_alloc_ctl_domain_table(struct sched_domain *sd)  { -	struct ctl_table *table = sd_alloc_ctl_entry(13); +	struct ctl_table *table = sd_alloc_ctl_entry(14);  	if (table == NULL)  		return NULL; @@ -4554,9 +4964,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)  		sizeof(int), 0644, proc_dointvec_minmax, false);  	set_table_entry(&table[10], "flags", &sd->flags,  		sizeof(int), 0644, proc_dointvec_minmax, false); -	set_table_entry(&table[11], "name", sd->name, +	set_table_entry(&table[11], "max_newidle_lb_cost", +		&sd->max_newidle_lb_cost, +		sizeof(long), 0644, proc_doulongvec_minmax, false); +	set_table_entry(&table[12], "name", sd->name,  		CORENAME_MAX_SIZE, 0444, proc_dostring, false); -	/* &table[12] is terminator */ +	/* &table[13] is terminator */  	return table;  } @@ -4722,11 +5135,20 @@ static struct notifier_block migration_notifier = {  	.priority = CPU_PRI_MIGRATION,  }; +static void __cpuinit set_cpu_rq_start_time(void) +{ +	int cpu = smp_processor_id(); +	struct rq *rq = cpu_rq(cpu); +	rq->age_stamp = sched_clock_cpu(cpu); +} +  static int sched_cpu_active(struct notifier_block *nfb,  				      unsigned long action, void *hcpu)  {  	switch (action & ~CPU_TASKS_FROZEN) {  	case CPU_STARTING: +		set_cpu_rq_start_time(); +		return NOTIFY_OK;  	case CPU_DOWN_FAILED:  		set_cpu_active((long)hcpu, true);  		return NOTIFY_OK; @@ -4738,13 +5160,31 @@ static int sched_cpu_active(struct notifier_block *nfb,  static int sched_cpu_inactive(struct notifier_block *nfb,  					unsigned long action, void *hcpu)  { +	unsigned long flags; +	long cpu = (long)hcpu; +  	switch (action & ~CPU_TASKS_FROZEN) {  	case CPU_DOWN_PREPARE: -		set_cpu_active((long)hcpu, false); +		set_cpu_active(cpu, false); + +		/* explicitly allow suspend */ +		if (!(action & CPU_TASKS_FROZEN)) { +			struct dl_bw *dl_b = dl_bw_of(cpu); +			bool overflow; +			int cpus; + +			raw_spin_lock_irqsave(&dl_b->lock, flags); +			cpus = dl_bw_cpus(cpu); +			overflow = __dl_overflow(dl_b, cpus, 0, 0); +			raw_spin_unlock_irqrestore(&dl_b->lock, flags); + +			if (overflow) +				return notifier_from_errno(-EBUSY); +		}  		return NOTIFY_OK; -	default: -		return NOTIFY_DONE;  	} + +	return NOTIFY_DONE;  }  static int __init migration_init(void) @@ -4827,14 +5267,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,  		}  		/* -		 * Even though we initialize ->power to something semi-sane, -		 * we leave power_orig unset. This allows us to detect if +		 * Even though we initialize ->capacity to something semi-sane, +		 * we leave capacity_orig unset. This allows us to detect if  		 * domain iteration is still funny without causing /0 traps.  		 
*/ -		if (!group->sgp->power_orig) { +		if (!group->sgc->capacity_orig) {  			printk(KERN_CONT "\n"); -			printk(KERN_ERR "ERROR: domain->cpu_power not " -					"set\n"); +			printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");  			break;  		} @@ -4856,9 +5295,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,  		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));  		printk(KERN_CONT " %s", str); -		if (group->sgp->power != SCHED_POWER_SCALE) { -			printk(KERN_CONT " (cpu_power = %d)", -				group->sgp->power); +		if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { +			printk(KERN_CONT " (cpu_capacity = %d)", +				group->sgc->capacity);  		}  		group = group->next; @@ -4916,8 +5355,9 @@ static int sd_degenerate(struct sched_domain *sd)  			 SD_BALANCE_NEWIDLE |  			 SD_BALANCE_FORK |  			 SD_BALANCE_EXEC | -			 SD_SHARE_CPUPOWER | -			 SD_SHARE_PKG_RESOURCES)) { +			 SD_SHARE_CPUCAPACITY | +			 SD_SHARE_PKG_RESOURCES | +			 SD_SHARE_POWERDOMAIN)) {  		if (sd->groups != sd->groups->next)  			return 0;  	} @@ -4946,9 +5386,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)  				SD_BALANCE_NEWIDLE |  				SD_BALANCE_FORK |  				SD_BALANCE_EXEC | -				SD_SHARE_CPUPOWER | +				SD_SHARE_CPUCAPACITY |  				SD_SHARE_PKG_RESOURCES | -				SD_PREFER_SIBLING); +				SD_PREFER_SIBLING | +				SD_SHARE_POWERDOMAIN);  		if (nr_node_ids == 1)  			pflags &= ~SD_SERIALIZE;  	} @@ -4963,6 +5404,8 @@ static void free_rootdomain(struct rcu_head *rcu)  	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);  	cpupri_cleanup(&rd->cpupri); +	cpudl_cleanup(&rd->cpudl); +	free_cpumask_var(rd->dlo_mask);  	free_cpumask_var(rd->rto_mask);  	free_cpumask_var(rd->online);  	free_cpumask_var(rd->span); @@ -4985,7 +5428,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)  		cpumask_clear_cpu(rq->cpu, old_rd->span);  		/* -		 * If we dont want to free the old_rt yet then +		 * If we dont want to free the old_rd yet then  		 * set old_rd to NULL to skip the freeing later  		 * in this function:  		 */ @@ -5014,8 +5457,14 @@ static int init_rootdomain(struct root_domain *rd)  		goto out;  	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))  		goto free_span; -	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) +	if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))  		goto free_online; +	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) +		goto free_dlo_mask; + +	init_dl_bw(&rd->dl_bw); +	if (cpudl_init(&rd->cpudl) != 0) +		goto free_dlo_mask;  	if (cpupri_init(&rd->cpupri) != 0)  		goto free_rto_mask; @@ -5023,6 +5472,8 @@ static int init_rootdomain(struct root_domain *rd)  free_rto_mask:  	free_cpumask_var(rd->rto_mask); +free_dlo_mask: +	free_cpumask_var(rd->dlo_mask);  free_online:  	free_cpumask_var(rd->online);  free_span: @@ -5060,7 +5511,7 @@ static struct root_domain *alloc_rootdomain(void)  	return rd;  } -static void free_sched_groups(struct sched_group *sg, int free_sgp) +static void free_sched_groups(struct sched_group *sg, int free_sgc)  {  	struct sched_group *tmp, *first; @@ -5071,8 +5522,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp)  	do {  		tmp = sg->next; -		if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) -			kfree(sg->sgp); +		if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) +			kfree(sg->sgc);  		kfree(sg);  		sg = tmp; @@ -5090,7 +5541,7 @@ static void free_sched_domain(struct rcu_head *rcu)  	if (sd->flags & SD_OVERLAP) {  		free_sched_groups(sd->groups, 1);  	} else 
if (atomic_dec_and_test(&sd->groups->ref)) { -		kfree(sd->groups->sgp); +		kfree(sd->groups->sgc);  		kfree(sd->groups);  	}  	kfree(sd); @@ -5119,10 +5570,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)  DEFINE_PER_CPU(struct sched_domain *, sd_llc);  DEFINE_PER_CPU(int, sd_llc_size);  DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(struct sched_domain *, sd_numa); +DEFINE_PER_CPU(struct sched_domain *, sd_busy); +DEFINE_PER_CPU(struct sched_domain *, sd_asym);  static void update_top_cache_domain(int cpu)  {  	struct sched_domain *sd; +	struct sched_domain *busy_sd = NULL;  	int id = cpu;  	int size = 1; @@ -5130,11 +5585,19 @@ static void update_top_cache_domain(int cpu)  	if (sd) {  		id = cpumask_first(sched_domain_span(sd));  		size = cpumask_weight(sched_domain_span(sd)); +		busy_sd = sd->parent; /* sd_busy */  	} +	rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);  	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);  	per_cpu(sd_llc_size, cpu) = size;  	per_cpu(sd_llc_id, cpu) = id; + +	sd = lowest_flag_domain(cpu, SD_NUMA); +	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); + +	sd = highest_flag_domain(cpu, SD_ASYM_PACKING); +	rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);  }  /* @@ -5200,17 +5663,6 @@ static int __init isolated_cpu_setup(char *str)  __setup("isolcpus=", isolated_cpu_setup); -static const struct cpumask *cpu_cpu_mask(int cpu) -{ -	return cpumask_of_node(cpu_to_node(cpu)); -} - -struct sd_data { -	struct sched_domain **__percpu sd; -	struct sched_group **__percpu sg; -	struct sched_group_power **__percpu sgp; -}; -  struct s_data {  	struct sched_domain ** __percpu sd;  	struct root_domain	*rd; @@ -5223,21 +5675,6 @@ enum s_alloc {  	sa_none,  }; -struct sched_domain_topology_level; - -typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); - -#define SDTL_OVERLAP	0x01 - -struct sched_domain_topology_level { -	sched_domain_init_f init; -	sched_domain_mask_f mask; -	int		    flags; -	int		    numa_level; -	struct sd_data      data; -}; -  /*   * Build an iteration mask that can exclude certain CPUs from the upwards   * domain traversal. @@ -5315,16 +5752,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)  		cpumask_or(covered, covered, sg_span); -		sg->sgp = *per_cpu_ptr(sdd->sgp, i); -		if (atomic_inc_return(&sg->sgp->ref) == 1) +		sg->sgc = *per_cpu_ptr(sdd->sgc, i); +		if (atomic_inc_return(&sg->sgc->ref) == 1)  			build_group_mask(sd, sg);  		/* -		 * Initialize sgp->power such that even if we mess up the +		 * Initialize sgc->capacity such that even if we mess up the  		 * domains and no possible iteration will get us here, we won't  		 * die on a /0 trap.  		 
*/ -		sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); +		sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); +		sg->sgc->capacity_orig = sg->sgc->capacity;  		/*  		 * Make sure the first group of this domain contains the @@ -5362,8 +5800,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)  	if (sg) {  		*sg = *per_cpu_ptr(sdd->sg, cpu); -		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); -		atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ +		(*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); +		atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */  	}  	return cpu; @@ -5372,7 +5810,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)  /*   * build_sched_groups will build a circular linked list of the groups   * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. + * and ->cpu_capacity to 0.   *   * Assumes the sched_domain tree is fully constructed   */ @@ -5404,8 +5842,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)  			continue;  		group = get_group(i, sdd, &sg); -		cpumask_clear(sched_group_cpus(sg)); -		sg->sgp->power = 0;  		cpumask_setall(sched_group_mask(sg));  		for_each_cpu(j, span) { @@ -5428,16 +5864,16 @@ build_sched_groups(struct sched_domain *sd, int cpu)  }  /* - * Initialize sched groups cpu_power. + * Initialize sched groups cpu_capacity.   * - * cpu_power indicates the capacity of sched group, which is used while + * cpu_capacity indicates the capacity of sched group, which is used while   * distributing the load between different sched groups in a sched domain. - * Typically cpu_power for all the groups in a sched domain will be same unless - * there are asymmetries in the topology. If there are asymmetries, group - * having more cpu_power will pickup more load compared to the group having - * less cpu_power. + * Typically cpu_capacity for all the groups in a sched domain will be same + * unless there are asymmetries in the topology. If there are asymmetries, + * group having more cpu_capacity will pickup more load compared to the + * group having less cpu_capacity.   
*/ -static void init_sched_groups_power(int cpu, struct sched_domain *sd) +static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)  {  	struct sched_group *sg = sd->groups; @@ -5451,13 +5887,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)  	if (cpu != group_balance_cpu(sg))  		return; -	update_group_power(sd, cpu); -	atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); -} - -int __weak arch_sd_sibling_asym_packing(void) -{ -       return 0*SD_ASYM_PACKING; +	update_group_capacity(sd, cpu); +	atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);  }  /* @@ -5465,34 +5896,6 @@ int __weak arch_sd_sibling_asym_packing(void)   * Non-inlined to reduce accumulated stack pressure in build_sched_domains()   */ -#ifdef CONFIG_SCHED_DEBUG -# define SD_INIT_NAME(sd, type)		sd->name = #type -#else -# define SD_INIT_NAME(sd, type)		do { } while (0) -#endif - -#define SD_INIT_FUNC(type)						\ -static noinline struct sched_domain *					\ -sd_init_##type(struct sched_domain_topology_level *tl, int cpu) 	\ -{									\ -	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);	\ -	*sd = SD_##type##_INIT;						\ -	SD_INIT_NAME(sd, type);						\ -	sd->private = &tl->data;					\ -	return sd;							\ -} - -SD_INIT_FUNC(CPU) -#ifdef CONFIG_SCHED_SMT - SD_INIT_FUNC(SIBLING) -#endif -#ifdef CONFIG_SCHED_MC - SD_INIT_FUNC(MC) -#endif -#ifdef CONFIG_SCHED_BOOK - SD_INIT_FUNC(BOOK) -#endif -  static int default_relax_domain_level = -1;  int sched_domain_level_max; @@ -5576,100 +5979,158 @@ static void claim_allocations(int cpu, struct sched_domain *sd)  	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))  		*per_cpu_ptr(sdd->sg, cpu) = NULL; -	if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) -		*per_cpu_ptr(sdd->sgp, cpu) = NULL; -} - -#ifdef CONFIG_SCHED_SMT -static const struct cpumask *cpu_smt_mask(int cpu) -{ -	return topology_thread_cpumask(cpu); +	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) +		*per_cpu_ptr(sdd->sgc, cpu) = NULL;  } -#endif - -/* - * Topology list, bottom-up. - */ -static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_SCHED_SMT -	{ sd_init_SIBLING, cpu_smt_mask, }, -#endif -#ifdef CONFIG_SCHED_MC -	{ sd_init_MC, cpu_coregroup_mask, }, -#endif -#ifdef CONFIG_SCHED_BOOK -	{ sd_init_BOOK, cpu_book_mask, }, -#endif -	{ sd_init_CPU, cpu_cpu_mask, }, -	{ NULL, }, -}; - -static struct sched_domain_topology_level *sched_domain_topology = default_topology; - -#define for_each_sd_topology(tl)			\ -	for (tl = sched_domain_topology; tl->init; tl++)  #ifdef CONFIG_NUMA -  static int sched_domains_numa_levels;  static int *sched_domains_numa_distance;  static struct cpumask ***sched_domains_numa_masks;  static int sched_domains_curr_level; +#endif -static inline int sd_local_flags(int level) -{ -	if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) -		return 0; - -	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; -} +/* + * SD_flags allowed in topology descriptions. 
+ * + * SD_SHARE_CPUCAPACITY      - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA                - describes NUMA topologies + * SD_SHARE_POWERDOMAIN   - describes shared power domain + * + * Odd one out: + * SD_ASYM_PACKING        - describes SMT quirks + */ +#define TOPOLOGY_SD_FLAGS		\ +	(SD_SHARE_CPUCAPACITY |		\ +	 SD_SHARE_PKG_RESOURCES |	\ +	 SD_NUMA |			\ +	 SD_ASYM_PACKING |		\ +	 SD_SHARE_POWERDOMAIN)  static struct sched_domain * -sd_numa_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, int cpu)  {  	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); -	int level = tl->numa_level; -	int sd_weight = cpumask_weight( -			sched_domains_numa_masks[level][cpu_to_node(cpu)]); +	int sd_weight, sd_flags = 0; + +#ifdef CONFIG_NUMA +	/* +	 * Ugly hack to pass state to sd_numa_mask()... +	 */ +	sched_domains_curr_level = tl->numa_level; +#endif + +	sd_weight = cpumask_weight(tl->mask(cpu)); + +	if (tl->sd_flags) +		sd_flags = (*tl->sd_flags)(); +	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, +			"wrong sd_flags in topology description\n")) +		sd_flags &= ~TOPOLOGY_SD_FLAGS;  	*sd = (struct sched_domain){  		.min_interval		= sd_weight,  		.max_interval		= 2*sd_weight,  		.busy_factor		= 32,  		.imbalance_pct		= 125, -		.cache_nice_tries	= 2, -		.busy_idx		= 3, -		.idle_idx		= 2, + +		.cache_nice_tries	= 0, +		.busy_idx		= 0, +		.idle_idx		= 0,  		.newidle_idx		= 0,  		.wake_idx		= 0,  		.forkexec_idx		= 0,  		.flags			= 1*SD_LOAD_BALANCE  					| 1*SD_BALANCE_NEWIDLE -					| 0*SD_BALANCE_EXEC -					| 0*SD_BALANCE_FORK +					| 1*SD_BALANCE_EXEC +					| 1*SD_BALANCE_FORK  					| 0*SD_BALANCE_WAKE -					| 0*SD_WAKE_AFFINE -					| 0*SD_SHARE_CPUPOWER +					| 1*SD_WAKE_AFFINE +					| 0*SD_SHARE_CPUCAPACITY  					| 0*SD_SHARE_PKG_RESOURCES -					| 1*SD_SERIALIZE +					| 0*SD_SERIALIZE  					| 0*SD_PREFER_SIBLING -					| sd_local_flags(level) +					| 0*SD_NUMA +					| sd_flags  					, +  		.last_balance		= jiffies,  		.balance_interval	= sd_weight, +		.smt_gain		= 0, +		.max_newidle_lb_cost	= 0, +		.next_decay_max_lb_cost	= jiffies, +#ifdef CONFIG_SCHED_DEBUG +		.name			= tl->name, +#endif  	}; -	SD_INIT_NAME(sd, NUMA); -	sd->private = &tl->data;  	/* -	 * Ugly hack to pass state to sd_numa_mask()... +	 * Convert topological properties into behaviour.  	 */ -	sched_domains_curr_level = tl->numa_level; + +	if (sd->flags & SD_SHARE_CPUCAPACITY) { +		sd->imbalance_pct = 110; +		sd->smt_gain = 1178; /* ~15% */ + +	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) { +		sd->imbalance_pct = 117; +		sd->cache_nice_tries = 1; +		sd->busy_idx = 2; + +#ifdef CONFIG_NUMA +	} else if (sd->flags & SD_NUMA) { +		sd->cache_nice_tries = 2; +		sd->busy_idx = 3; +		sd->idle_idx = 2; + +		sd->flags |= SD_SERIALIZE; +		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { +			sd->flags &= ~(SD_BALANCE_EXEC | +				       SD_BALANCE_FORK | +				       SD_WAKE_AFFINE); +		} + +#endif +	} else { +		sd->flags |= SD_PREFER_SIBLING; +		sd->cache_nice_tries = 1; +		sd->busy_idx = 2; +		sd->idle_idx = 1; +	} + +	sd->private = &tl->data;  	return sd;  } +/* + * Topology list, bottom-up. 
+ */ +static struct sched_domain_topology_level default_topology[] = { +#ifdef CONFIG_SCHED_SMT +	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC +	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif +	{ cpu_cpu_mask, SD_INIT_NAME(DIE) }, +	{ NULL, }, +}; + +struct sched_domain_topology_level *sched_domain_topology = default_topology; + +#define for_each_sd_topology(tl)			\ +	for (tl = sched_domain_topology; tl->mask; tl++) + +void set_sched_topology(struct sched_domain_topology_level *tl) +{ +	sched_domain_topology = tl; +} + +#ifdef CONFIG_NUMA +  static const struct cpumask *sd_numa_mask(int cpu)  {  	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; @@ -5813,7 +6274,10 @@ static void sched_init_numa(void)  		}  	} -	tl = kzalloc((ARRAY_SIZE(default_topology) + level) * +	/* Compute default topology size */ +	for (i = 0; sched_domain_topology[i].mask; i++); + +	tl = kzalloc((i + level + 1) *  			sizeof(struct sched_domain_topology_level), GFP_KERNEL);  	if (!tl)  		return; @@ -5821,18 +6285,19 @@ static void sched_init_numa(void)  	/*  	 * Copy the default topology bits..  	 */ -	for (i = 0; default_topology[i].init; i++) -		tl[i] = default_topology[i]; +	for (i = 0; sched_domain_topology[i].mask; i++) +		tl[i] = sched_domain_topology[i];  	/*  	 * .. and append 'j' levels of NUMA goodness.  	 */  	for (j = 0; j < level; i++, j++) {  		tl[i] = (struct sched_domain_topology_level){ -			.init = sd_numa_init,  			.mask = sd_numa_mask, +			.sd_flags = cpu_numa_flags,  			.flags = SDTL_OVERLAP,  			.numa_level = j, +			SD_INIT_NAME(NUMA)  		};  	} @@ -5917,14 +6382,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)  		if (!sdd->sg)  			return -ENOMEM; -		sdd->sgp = alloc_percpu(struct sched_group_power *); -		if (!sdd->sgp) +		sdd->sgc = alloc_percpu(struct sched_group_capacity *); +		if (!sdd->sgc)  			return -ENOMEM;  		for_each_cpu(j, cpu_map) {  			struct sched_domain *sd;  			struct sched_group *sg; -			struct sched_group_power *sgp; +			struct sched_group_capacity *sgc;  		       	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),  					GFP_KERNEL, cpu_to_node(j)); @@ -5942,12 +6407,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map)  			*per_cpu_ptr(sdd->sg, j) = sg; -			sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), +			sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),  					GFP_KERNEL, cpu_to_node(j)); -			if (!sgp) +			if (!sgc)  				return -ENOMEM; -			*per_cpu_ptr(sdd->sgp, j) = sgp; +			*per_cpu_ptr(sdd->sgc, j) = sgc;  		}  	} @@ -5974,15 +6439,15 @@ static void __sdt_free(const struct cpumask *cpu_map)  			if (sdd->sg)  				kfree(*per_cpu_ptr(sdd->sg, j)); -			if (sdd->sgp) -				kfree(*per_cpu_ptr(sdd->sgp, j)); +			if (sdd->sgc) +				kfree(*per_cpu_ptr(sdd->sgc, j));  		}  		free_percpu(sdd->sd);  		sdd->sd = NULL;  		free_percpu(sdd->sg);  		sdd->sg = NULL; -		free_percpu(sdd->sgp); -		sdd->sgp = NULL; +		free_percpu(sdd->sgc); +		sdd->sgc = NULL;  	}  } @@ -5990,7 +6455,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,  		const struct cpumask *cpu_map, struct sched_domain_attr *attr,  		struct sched_domain *child, int cpu)  { -	struct sched_domain *sd = tl->init(tl, cpu); +	struct sched_domain *sd = sd_init(tl, cpu);  	if (!sd)  		return child; @@ -6052,14 +6517,14 @@ static int build_sched_domains(const struct cpumask *cpu_map,  		}  	} -	/* Calculate CPU power for physical packages and 
nodes */ +	/* Calculate CPU capacity for physical packages and nodes */  	for (i = nr_cpumask_bits-1; i >= 0; i--) {  		if (!cpumask_test_cpu(i, cpu_map))  			continue;  		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {  			claim_allocations(i, sd); -			init_sched_groups_power(i, sd); +			init_sched_groups_capacity(i, sd);  		}  	} @@ -6094,7 +6559,7 @@ static cpumask_var_t fallback_doms;   * cpu core maps. It is supposed to return 1 if the topology changed   * or 0 if it stayed the same.   */ -int __attribute__((weak)) arch_update_cpu_topology(void) +int __weak arch_update_cpu_topology(void)  {  	return 0;  } @@ -6335,14 +6800,17 @@ void __init sched_init_smp(void)  	sched_init_numa(); -	get_online_cpus(); +	/* +	 * There's no userspace yet to cause hotplug operations; hence all the +	 * cpu masks are stable and all blatant races in the below code cannot +	 * happen. +	 */  	mutex_lock(&sched_domains_mutex);  	init_sched_domains(cpu_active_mask);  	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);  	if (cpumask_empty(non_isolated_cpus))  		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);  	mutex_unlock(&sched_domains_mutex); -	put_online_cpus();  	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);  	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); @@ -6357,6 +6825,7 @@ void __init sched_init_smp(void)  	free_cpumask_var(non_isolated_cpus);  	init_sched_rt_class(); +	init_sched_dl_class();  }  #else  void __init sched_init_smp(void) @@ -6426,13 +6895,15 @@ void __init sched_init(void)  #endif /* CONFIG_CPUMASK_OFFSTACK */  	} +	init_rt_bandwidth(&def_rt_bandwidth, +			global_rt_period(), global_rt_runtime()); +	init_dl_bandwidth(&def_dl_bandwidth, +			global_rt_period(), global_rt_runtime()); +  #ifdef CONFIG_SMP  	init_defrootdomain();  #endif -	init_rt_bandwidth(&def_rt_bandwidth, -			global_rt_period(), global_rt_runtime()); -  #ifdef CONFIG_RT_GROUP_SCHED  	init_rt_bandwidth(&root_task_group.rt_bandwidth,  			global_rt_period(), global_rt_runtime()); @@ -6456,6 +6927,7 @@ void __init sched_init(void)  		rq->calc_load_update = jiffies + LOAD_FREQ;  		init_cfs_rq(&rq->cfs);  		init_rt_rq(&rq->rt, rq); +		init_dl_rq(&rq->dl, rq);  #ifdef CONFIG_FAIR_GROUP_SCHED  		root_task_group.shares = ROOT_TASK_GROUP_LOAD;  		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); @@ -6484,7 +6956,6 @@ void __init sched_init(void)  		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;  #ifdef CONFIG_RT_GROUP_SCHED -		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);  		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);  #endif @@ -6496,7 +6967,7 @@ void __init sched_init(void)  #ifdef CONFIG_SMP  		rq->sd = NULL;  		rq->rd = NULL; -		rq->cpu_power = SCHED_POWER_SCALE; +		rq->cpu_capacity = SCHED_CAPACITY_SCALE;  		rq->post_schedule = 0;  		rq->active_balance = 0;  		rq->next_balance = jiffies; @@ -6505,6 +6976,7 @@ void __init sched_init(void)  		rq->online = 0;  		rq->idle_stamp = 0;  		rq->avg_idle = 2*sysctl_sched_migration_cost; +		rq->max_idle_balance_cost = sysctl_sched_migration_cost;  		INIT_LIST_HEAD(&rq->cfs_tasks); @@ -6526,10 +6998,6 @@ void __init sched_init(void)  	INIT_HLIST_HEAD(&init_task.preempt_notifiers);  #endif -#ifdef CONFIG_RT_MUTEXES -	plist_head_init(&init_task.pi_waiters); -#endif -  	/*  	 * The boot idle thread does lazy MMU switching as well:  	 */ @@ -6557,6 +7025,7 @@ void __init sched_init(void)  	if (cpu_isolated_map == NULL)  		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);  	idle_thread_set_boot_cpu(); +	
set_cpu_rq_start_time();  #endif  	init_sched_fair_class(); @@ -6576,7 +7045,8 @@ void __might_sleep(const char *file, int line, int preempt_offset)  	static unsigned long prev_jiffy;	/* ratelimiting */  	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ -	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || +	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && +	     !is_idle_task(current)) ||  	    system_state != SYSTEM_RUNNING || oops_in_progress)  		return;  	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) @@ -6594,6 +7064,13 @@ void __might_sleep(const char *file, int line, int preempt_offset)  	debug_show_held_locks(current);  	if (irqs_disabled())  		print_irqtrace_events(current); +#ifdef CONFIG_DEBUG_PREEMPT +	if (!preempt_count_equals(preempt_offset)) { +		pr_err("Preemption disabled at:"); +		print_ip_sym(current->preempt_disable_ip); +		pr_cont("\n"); +	} +#endif  	dump_stack();  }  EXPORT_SYMBOL(__might_sleep); @@ -6603,13 +7080,16 @@ EXPORT_SYMBOL(__might_sleep);  static void normalize_task(struct rq *rq, struct task_struct *p)  {  	const struct sched_class *prev_class = p->sched_class; +	struct sched_attr attr = { +		.sched_policy = SCHED_NORMAL, +	};  	int old_prio = p->prio;  	int on_rq;  	on_rq = p->on_rq;  	if (on_rq)  		dequeue_task(rq, p, 0); -	__setscheduler(rq, p, SCHED_NORMAL, 0); +	__setscheduler(rq, p, &attr);  	if (on_rq) {  		enqueue_task(rq, p, 0);  		resched_task(rq->curr); @@ -6639,12 +7119,12 @@ void normalize_rt_tasks(void)  		p->se.statistics.block_start	= 0;  #endif -		if (!rt_task(p)) { +		if (!dl_task(p) && !rt_task(p)) {  			/*  			 * Renice negative nice level userspace  			 * tasks back to 0:  			 */ -			if (TASK_NICE(p) < 0 && p->mm) +			if (task_nice(p) < 0 && p->mm)  				set_user_nice(p, 0);  			continue;  		} @@ -6812,7 +7292,7 @@ void sched_move_task(struct task_struct *tsk)  	if (unlikely(running))  		tsk->sched_class->put_prev_task(rq, tsk); -	tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, +	tg = container_of(task_css_check(tsk, cpu_cgrp_id,  				lockdep_is_held(&tsk->sighand->siglock)),  			  struct task_group, css);  	tg = autogroup_task_group(tsk, tg); @@ -6834,16 +7314,6 @@ void sched_move_task(struct task_struct *tsk)  }  #endif /* CONFIG_CGROUP_SCHED */ -#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) -static unsigned long to_ratio(u64 period, u64 runtime) -{ -	if (runtime == RUNTIME_INF) -		return 1ULL << 20; - -	return div64_u64(runtime << 20, period); -} -#endif -  #ifdef CONFIG_RT_GROUP_SCHED  /*   * Ensure that the real time constraints are schedulable. @@ -7017,24 +7487,13 @@ static long sched_group_rt_period(struct task_group *tg)  	do_div(rt_period_us, NSEC_PER_USEC);  	return rt_period_us;  } +#endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_RT_GROUP_SCHED  static int sched_rt_global_constraints(void)  { -	u64 runtime, period;  	int ret = 0; -	if (sysctl_sched_rt_period <= 0) -		return -EINVAL; - -	runtime = global_rt_runtime(); -	period = global_rt_period(); - -	/* -	 * Sanity check on the sysctl variables. 
-	 */ -	if (runtime > period && runtime != RUNTIME_INF) -		return -EINVAL; -  	mutex_lock(&rt_constraints_mutex);  	read_lock(&tasklist_lock);  	ret = __rt_schedulable(NULL, 0, 0); @@ -7057,17 +7516,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)  static int sched_rt_global_constraints(void)  {  	unsigned long flags; -	int i; - -	if (sysctl_sched_rt_period <= 0) -		return -EINVAL; - -	/* -	 * There's always some RT tasks in the root group -	 * -- migration, kstopmachine etc.. -	 */ -	if (sysctl_sched_rt_runtime == 0) -		return -EBUSY; +	int i, ret = 0;  	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);  	for_each_possible_cpu(i) { @@ -7079,36 +7528,91 @@ static int sched_rt_global_constraints(void)  	}  	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); -	return 0; +	return ret;  }  #endif /* CONFIG_RT_GROUP_SCHED */ -int sched_rr_handler(struct ctl_table *table, int write, -		void __user *buffer, size_t *lenp, -		loff_t *ppos) +static int sched_dl_global_constraints(void)  { -	int ret; -	static DEFINE_MUTEX(mutex); +	u64 runtime = global_rt_runtime(); +	u64 period = global_rt_period(); +	u64 new_bw = to_ratio(period, runtime); +	int cpu, ret = 0; +	unsigned long flags; -	mutex_lock(&mutex); -	ret = proc_dointvec(table, write, buffer, lenp, ppos); -	/* make sure that internally we keep jiffies */ -	/* also, writing zero resets timeslice to default */ -	if (!ret && write) { -		sched_rr_timeslice = sched_rr_timeslice <= 0 ? -			RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); +	/* +	 * Here we want to check the bandwidth not being set to some +	 * value smaller than the currently allocated bandwidth in +	 * any of the root_domains. +	 * +	 * FIXME: Cycling on all the CPUs is overdoing, but simpler than +	 * cycling on root_domains... Discussion on different/better +	 * solutions is welcome! +	 */ +	for_each_possible_cpu(cpu) { +		struct dl_bw *dl_b = dl_bw_of(cpu); + +		raw_spin_lock_irqsave(&dl_b->lock, flags); +		if (new_bw < dl_b->total_bw) +			ret = -EBUSY; +		raw_spin_unlock_irqrestore(&dl_b->lock, flags); + +		if (ret) +			break;  	} -	mutex_unlock(&mutex); +  	return ret;  } +static void sched_dl_do_global(void) +{ +	u64 new_bw = -1; +	int cpu; +	unsigned long flags; + +	def_dl_bandwidth.dl_period = global_rt_period(); +	def_dl_bandwidth.dl_runtime = global_rt_runtime(); + +	if (global_rt_runtime() != RUNTIME_INF) +		new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + +	/* +	 * FIXME: As above... 
+	 */ +	for_each_possible_cpu(cpu) { +		struct dl_bw *dl_b = dl_bw_of(cpu); + +		raw_spin_lock_irqsave(&dl_b->lock, flags); +		dl_b->bw = new_bw; +		raw_spin_unlock_irqrestore(&dl_b->lock, flags); +	} +} + +static int sched_rt_global_validate(void) +{ +	if (sysctl_sched_rt_period <= 0) +		return -EINVAL; + +	if ((sysctl_sched_rt_runtime != RUNTIME_INF) && +		(sysctl_sched_rt_runtime > sysctl_sched_rt_period)) +		return -EINVAL; + +	return 0; +} + +static void sched_rt_do_global(void) +{ +	def_rt_bandwidth.rt_runtime = global_rt_runtime(); +	def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); +} +  int sched_rt_handler(struct ctl_table *table, int write,  		void __user *buffer, size_t *lenp,  		loff_t *ppos)  { -	int ret;  	int old_period, old_runtime;  	static DEFINE_MUTEX(mutex); +	int ret;  	mutex_lock(&mutex);  	old_period = sysctl_sched_rt_period; @@ -7117,21 +7621,50 @@ int sched_rt_handler(struct ctl_table *table, int write,  	ret = proc_dointvec(table, write, buffer, lenp, ppos);  	if (!ret && write) { +		ret = sched_rt_global_validate(); +		if (ret) +			goto undo; +  		ret = sched_rt_global_constraints(); -		if (ret) { -			sysctl_sched_rt_period = old_period; -			sysctl_sched_rt_runtime = old_runtime; -		} else { -			def_rt_bandwidth.rt_runtime = global_rt_runtime(); -			def_rt_bandwidth.rt_period = -				ns_to_ktime(global_rt_period()); -		} +		if (ret) +			goto undo; + +		ret = sched_dl_global_constraints(); +		if (ret) +			goto undo; + +		sched_rt_do_global(); +		sched_dl_do_global(); +	} +	if (0) { +undo: +		sysctl_sched_rt_period = old_period; +		sysctl_sched_rt_runtime = old_runtime;  	}  	mutex_unlock(&mutex);  	return ret;  } +int sched_rr_handler(struct ctl_table *table, int write, +		void __user *buffer, size_t *lenp, +		loff_t *ppos) +{ +	int ret; +	static DEFINE_MUTEX(mutex); + +	mutex_lock(&mutex); +	ret = proc_dointvec(table, write, buffer, lenp, ppos); +	/* make sure that internally we keep jiffies */ +	/* also, writing zero resets timeslice to default */ +	if (!ret && write) { +		sched_rr_timeslice = sched_rr_timeslice <= 0 ? 
+			RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); +	} +	mutex_unlock(&mutex); +	return ret; +} +  #ifdef CONFIG_CGROUP_SCHED  static inline struct task_group *css_tg(struct cgroup_subsys_state *css) @@ -7160,7 +7693,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)  static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)  {  	struct task_group *tg = css_tg(css); -	struct task_group *parent = css_tg(css_parent(css)); +	struct task_group *parent = css_tg(css->parent);  	if (parent)  		sched_online_group(tg, parent); @@ -7186,7 +7719,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,  {  	struct task_struct *task; -	cgroup_taskset_for_each(task, css, tset) { +	cgroup_taskset_for_each(task, tset) {  #ifdef CONFIG_RT_GROUP_SCHED  		if (!sched_rt_can_attach(css_tg(css), task))  			return -EINVAL; @@ -7204,7 +7737,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,  {  	struct task_struct *task; -	cgroup_taskset_for_each(task, css, tset) +	cgroup_taskset_for_each(task, tset)  		sched_move_task(task);  } @@ -7277,7 +7810,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)  	runtime_enabled = quota != RUNTIME_INF;  	runtime_was_enabled = cfs_b->quota != RUNTIME_INF; -	account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); +	/* +	 * If we need to toggle cfs_bandwidth_used, off->on must occur +	 * before making related changes, and on->off must occur afterwards +	 */ +	if (runtime_enabled && !runtime_was_enabled) +		cfs_bandwidth_usage_inc();  	raw_spin_lock_irq(&cfs_b->lock);  	cfs_b->period = ns_to_ktime(period);  	cfs_b->quota = quota; @@ -7286,8 +7824,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)  	/* restart the period timer (if active) to handle new period expiry */  	if (runtime_enabled && cfs_b->timer_active) {  		/* force a reprogram */ -		cfs_b->timer_active = 0; -		__start_cfs_bandwidth(cfs_b); +		__start_cfs_bandwidth(cfs_b, true);  	}  	raw_spin_unlock_irq(&cfs_b->lock); @@ -7303,6 +7840,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)  			unthrottle_cfs_rq(cfs_rq);  		raw_spin_unlock_irq(&rq->lock);  	} +	if (runtime_was_enabled && !runtime_enabled) +		cfs_bandwidth_usage_dec();  out_unlock:  	mutex_unlock(&cfs_constraints_mutex); @@ -7457,15 +7996,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)  	return ret;  } -static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, -		struct cgroup_map_cb *cb) +static int cpu_stats_show(struct seq_file *sf, void *v)  { -	struct task_group *tg = css_tg(css); +	struct task_group *tg = css_tg(seq_css(sf));  	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; -	cb->fill(cb, "nr_periods", cfs_b->nr_periods); -	cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); -	cb->fill(cb, "throttled_time", cfs_b->throttled_time); +	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); +	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); +	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);  	return 0;  } @@ -7519,7 +8057,7 @@ static struct cftype cpu_files[] = {  	},  	{  		.name = "stat", -		.read_map = cpu_stats_show, +		.seq_show = cpu_stats_show,  	},  #endif  #ifdef CONFIG_RT_GROUP_SCHED @@ -7537,8 +8075,7 @@ static struct cftype cpu_files[] = {  	{ }	/* terminate */  }; -struct cgroup_subsys cpu_cgroup_subsys = { -	.name		= "cpu", +struct cgroup_subsys cpu_cgrp_subsys = {  	.css_alloc	= cpu_cgroup_css_alloc,  	
.css_free	= cpu_cgroup_css_free,  	.css_online	= cpu_cgroup_css_online, @@ -7546,7 +8083,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {  	.can_attach	= cpu_cgroup_can_attach,  	.attach		= cpu_cgroup_attach,  	.exit		= cpu_cgroup_exit, -	.subsys_id	= cpu_cgroup_subsys_id,  	.base_cftypes	= cpu_files,  	.early_init	= 1,  };
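The topology hunks replace the per-level sd_init_*() constructors with a single sd_init() that derives scheduler behaviour from a whitelisted set of topology flags (TOPOLOGY_SD_FLAGS), and export set_sched_topology() so an architecture can install its own level table instead of default_topology[]. A sketch of what such an override could look like, following the table format used by default_topology[]; the arch_* names are illustrative placeholders, and the flag choice for the SMT level is only an assumption:

/*
 * Hedged sketch: arch_topology[] and arch_smt_flags() are invented here;
 * the mask helpers, SD_INIT_NAME() and set_sched_topology() are the ones
 * introduced or used by the hunks above.
 */
#ifdef CONFIG_SCHED_SMT
static int arch_smt_flags(void)
{
	/* sd_init() warns about and strips anything outside TOPOLOGY_SD_FLAGS */
	return SD_SHARE_CPUCAPACITY | SD_SHARE_POWERDOMAIN;
}
#endif

static struct sched_domain_topology_level arch_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, arch_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

void __init arch_init_sched_topology(void)
{
	/* must run before sched_init_smp() builds the domains */
	set_sched_topology(arch_topology);
}

Because sd_init() now converts the topology flags into the tuning values (imbalance_pct, busy_idx, cache_nice_tries, ...) itself, such a table only has to describe what the hardware shares at each level; NUMA levels are still appended by sched_init_numa().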

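Finally, the bandwidth-control hunks (sched_dl_global_constraints(), sched_dl_do_global(), and the dl_bw check added to sched_cpu_inactive()) all reason about utilization as the fixed-point ratio produced by to_ratio(), whose removed core.c copy is visible above: runtime << 20 divided by period. A small standalone sketch of that arithmetic, using invented task parameters, shows why a root domain answers -EBUSY once the summed ratios would exceed the global rt_runtime/rt_period budget:

/*
 * Hedged sketch of the admission arithmetic only; the kernel does this per
 * root domain with u64/div64_u64 and handles RUNTIME_INF specially, and the
 * task figures below are invented examples.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << 20) / period;	/* utilization in 1/2^20 units */
}

int main(void)
{
	/* global budget, e.g. 950000us of runtime every 1000000us period */
	uint64_t budget = to_ratio(1000000, 950000);

	/* two -deadline tasks already admitted, a third one being requested */
	uint64_t used = to_ratio(100000, 30000) + to_ratio(50000, 20000);
	uint64_t req  = to_ratio(10000, 6000);

	printf("budget=%llu used=%llu request=%llu -> %s\n",
	       (unsigned long long)budget,
	       (unsigned long long)used,
	       (unsigned long long)req,
	       used + req <= budget ? "admit" : "-EBUSY");
	return 0;
}

In the patch the same comparison is made against dl_b->total_bw per root domain, and sched_dl_global_constraints() refuses to shrink the global budget below what has already been allocated.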