diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-11-12 10:20:12 +0900 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-11-12 10:20:12 +0900 |
commit | 39cf275a1a18ba3c7eb9b986c5c9b35b57332798 (patch) | |
tree | 40b119ca9d2fbaf8128d3fa25f4c64669002b0c0 /kernel | |
parent | ad5d69899e52792671c1aa6c7360464c7edfe09c (diff) | |
parent | e5137b50a0640009fd63a3e65c14bc6e1be8796a (diff) |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
"The main changes in this cycle are:
- (much) improved CONFIG_NUMA_BALANCING support from Mel Gorman, Rik
van Riel, Peter Zijlstra et al. Yay!
- optimize preemption counter handling: merge the NEED_RESCHED flag
into the preempt_count variable, by Peter Zijlstra.
- wait.h fixes and code reorganization from Peter Zijlstra
- cfs_bandwidth fixes from Ben Segall
- SMP load-balancer cleanups from Peter Zijstra
- idle balancer improvements from Jason Low
- other fixes and cleanups"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (129 commits)
ftrace, sched: Add TRACE_FLAG_PREEMPT_RESCHED
stop_machine: Fix race between stop_two_cpus() and stop_cpus()
sched: Remove unnecessary iteration over sched domains to update nr_busy_cpus
sched: Fix asymmetric scheduling for POWER7
sched: Move completion code from core.c to completion.c
sched: Move wait code from core.c to wait.c
sched: Move wait.c into kernel/sched/
sched/wait: Fix __wait_event_interruptible_lock_irq_timeout()
sched: Avoid throttle_cfs_rq() racing with period_timer stopping
sched: Guarantee new group-entities always have weight
sched: Fix hrtimer_cancel()/rq->lock deadlock
sched: Fix cfs_bandwidth misuse of hrtimer_expires_remaining
sched: Fix race on toggling cfs_bandwidth_used
sched: Remove extra put_online_cpus() inside sched_setaffinity()
sched/rt: Fix task_tick_rt() comment
sched/wait: Fix build breakage
sched/wait: Introduce prepare_to_wait_event()
sched/wait: Add ___wait_cond_timeout() to wait_event*_timeout() too
sched: Remove get_online_cpus() usage
sched: Fix race in migrate_swap_stop()
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 2 | ||||
-rw-r--r-- | kernel/bounds.c | 4 | ||||
-rw-r--r-- | kernel/context_tracking.c | 2 | ||||
-rw-r--r-- | kernel/cpu.c | 17 | ||||
-rw-r--r-- | kernel/cpu/idle.c | 16 | ||||
-rw-r--r-- | kernel/fork.c | 5 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 15 | ||||
-rw-r--r-- | kernel/sched/Makefile | 1 | ||||
-rw-r--r-- | kernel/sched/completion.c | 299 | ||||
-rw-r--r-- | kernel/sched/core.c | 683 | ||||
-rw-r--r-- | kernel/sched/debug.c | 68 | ||||
-rw-r--r-- | kernel/sched/fair.c | 1397 | ||||
-rw-r--r-- | kernel/sched/features.h | 19 | ||||
-rw-r--r-- | kernel/sched/idle_task.c | 2 | ||||
-rw-r--r-- | kernel/sched/rt.c | 22 | ||||
-rw-r--r-- | kernel/sched/sched.h | 54 | ||||
-rw-r--r-- | kernel/sched/stats.h | 46 | ||||
-rw-r--r-- | kernel/sched/stop_task.c | 2 | ||||
-rw-r--r-- | kernel/sched/wait.c (renamed from kernel/wait.c) | 127 | ||||
-rw-r--r-- | kernel/softirq.c | 16 | ||||
-rw-r--r-- | kernel/stop_machine.c | 303 | ||||
-rw-r--r-- | kernel/sysctl.c | 21 | ||||
-rw-r--r-- | kernel/timer.c | 8 | ||||
-rw-r--r-- | kernel/trace/trace.c | 3 | ||||
-rw-r--r-- | kernel/trace/trace.h | 1 | ||||
-rw-r--r-- | kernel/trace/trace_output.c | 19 |
26 files changed, 2325 insertions, 827 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index f99d908b555..a4d1aa8da9b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ extable.o params.o posix-timers.o \ - kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ + kthread.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o semaphore.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o lglock.o smpboot.o diff --git a/kernel/bounds.c b/kernel/bounds.c index 0c9b862292b..e8ca97b5c38 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -10,6 +10,7 @@ #include <linux/mmzone.h> #include <linux/kbuild.h> #include <linux/page_cgroup.h> +#include <linux/log2.h> void foo(void) { @@ -17,5 +18,8 @@ void foo(void) DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); +#ifdef CONFIG_SMP + DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); +#endif /* End of constants */ } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 859c8dfd78a..e5f3917aa05 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -120,7 +120,7 @@ void context_tracking_user_enter(void) * instead of preempt_schedule() to exit user context if needed before * calling the scheduler. */ -void __sched notrace preempt_schedule_context(void) +asmlinkage void __sched notrace preempt_schedule_context(void) { enum ctx_state prev_ctx; diff --git a/kernel/cpu.c b/kernel/cpu.c index d7f07a2da5a..63aa50d7ce1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -308,6 +308,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) } smpboot_park_threads(cpu); + /* + * By now we've cleared cpu_active_mask, wait for all preempt-disabled + * and RCU users of this state to go away such that all new such users + * will observe it. + * + * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might + * not imply sync_sched(), so explicitly call both. + */ +#ifdef CONFIG_PREEMPT + synchronize_sched(); +#endif + synchronize_rcu(); + + /* + * So now all preempt/rcu users must observe !cpu_active(). + */ + err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index e695c0a0bcb..988573a9a38 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void) rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); - while (!need_resched()) + while (!tif_need_resched()) cpu_relax(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); rcu_idle_exit(); @@ -92,8 +92,7 @@ static void cpu_idle_loop(void) if (cpu_idle_force_poll || tick_check_broadcast_expired()) { cpu_idle_poll(); } else { - current_clr_polling(); - if (!need_resched()) { + if (!current_clr_polling_and_test()) { stop_critical_timings(); rcu_idle_enter(); arch_cpu_idle(); @@ -103,9 +102,16 @@ static void cpu_idle_loop(void) } else { local_irq_enable(); } - current_set_polling(); + __current_set_polling(); } arch_cpu_idle_exit(); + /* + * We need to test and propagate the TIF_NEED_RESCHED + * bit here because we might not have send the + * reschedule IPI to idle tasks. + */ + if (tif_need_resched()) + set_preempt_need_resched(); } tick_nohz_idle_exit(); schedule_preempt_disabled(); @@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state) */ boot_init_stack_canary(); #endif - current_set_polling(); + __current_set_polling(); arch_cpu_idle_prepare(); cpu_idle_loop(); } diff --git a/kernel/fork.c b/kernel/fork.c index 8531609b6a8..f6d11fc67f7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -817,9 +817,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk) #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; #endif -#ifdef CONFIG_NUMA_BALANCING - mm->first_nid = NUMA_PTE_SCAN_INIT; -#endif if (!mm_init(mm, tsk)) goto fail_nomem; @@ -1313,7 +1310,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif /* Perform scheduler related setup. Assign this task to a CPU. */ - sched_fork(p); + sched_fork(clone_flags, p); retval = perf_event_init_task(p); if (retval) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8a2c81e86dd..4c06ddfea7c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -916,6 +916,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp) force_quiescent_state(rsp); /* Kick them all. */ } +/* + * This function really isn't for public consumption, but RCU is special in + * that context switches can allow the state machine to make progress. + */ +extern void resched_cpu(int cpu); + static void print_cpu_stall(struct rcu_state *rsp) { int cpu; @@ -945,7 +951,14 @@ static void print_cpu_stall(struct rcu_state *rsp) 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); - set_need_resched(); /* kick ourselves to get things going. */ + /* + * Attempt to revive the RCU machinery by forcing a context switch. + * + * A context switch would normally allow the RCU state machine to make + * progress and it could be we're stuck in kernel space without context + * switches for an entirely unreasonable amount of time. + */ + resched_cpu(smp_processor_id()); } static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 54adcf35f49..7b621409cf1 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -12,6 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o +obj-y += wait.o completion.o obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c new file mode 100644 index 00000000000..a63f4dc2790 --- /dev/null +++ b/kernel/sched/completion.c @@ -0,0 +1,299 @@ +/* + * Generic wait-for-completion handler; + * + * It differs from semaphores in that their default case is the opposite, + * wait_for_completion default blocks whereas semaphore default non-block. The + * interface also makes it easy to 'complete' multiple waiting threads, + * something which isn't entirely natural for semaphores. + * + * But more importantly, the primitive documents the usage. Semaphores would + * typically be used for exclusion which gives rise to priority inversion. + * Waiting for completion is a typically sync point, but not an exclusion point. + */ + +#include <linux/sched.h> +#include <linux/completion.h> + +/** + * complete: - signals a single thread waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void complete(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done++; + __wake_up_locked(&x->wait, TASK_NORMAL, 1); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete); + +/** + * complete_all: - signals all threads waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void complete_all(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += UINT_MAX/2; + __wake_up_locked(&x->wait, TASK_NORMAL, 0); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_all); + +static inline long __sched +do_wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) +{ + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + __add_wait_queue_tail_exclusive(&x->wait, &wait); + do { + if (signal_pending_state(state, current)) { + timeout = -ERESTARTSYS; + break; + } + __set_current_state(state); + spin_unlock_irq(&x->wait.lock); + timeout = action(timeout); + spin_lock_irq(&x->wait.lock); + } while (!x->done && timeout); + __remove_wait_queue(&x->wait, &wait); + if (!x->done) + return timeout; + } + x->done--; + return timeout ?: 1; +} + +static inline long __sched +__wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) +{ + might_sleep(); + + spin_lock_irq(&x->wait.lock); + timeout = do_wait_for_common(x, action, timeout, state); + spin_unlock_irq(&x->wait.lock); + return timeout; +} + +static long __sched +wait_for_common(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, schedule_timeout, timeout, state); +} + +static long __sched +wait_for_common_io(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, io_schedule_timeout, timeout, state); +} + +/** + * wait_for_completion: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ +void __sched wait_for_completion(struct completion *x) +{ + wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion); + +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + * + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_timeout); + +/** + * wait_for_completion_io: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. The caller is accounted as waiting + * for IO. + */ +void __sched wait_for_completion_io(struct completion *x) +{ + wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io); + +/** + * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. The caller is accounted as waiting for IO. + * + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io_timeout); + +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x: holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + * + * Return: -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_interruptible(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_interruptible); + +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + * + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_interruptible_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); + +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. + * + * Return: -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_killable(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_killable); + +/** + * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be + * signaled or for a specified timeout to expire. It can be + * interrupted by a kill signal. The timeout is in jiffies. + * + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_killable_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL(wait_for_completion_killable_timeout); + +/** + * try_wait_for_completion - try to decrement a completion without blocking + * @x: completion structure + * + * Return: 0 if a decrement cannot be done without blocking + * 1 if a decrement succeeded. + * + * If a completion is being used as a counting completion, + * attempt to decrement the counter without blocking. This + * enables us to avoid waiting if the resource the completion + * is protecting is not available. + */ +bool try_wait_for_completion(struct completion *x) +{ + unsigned long flags; + int ret = 1; + + spin_lock_irqsave(&x->wait.lock, flags); + if (!x->done) + ret = 0; + else + x->done--; + spin_unlock_irqrestore(&x->wait.lock, flags); + return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + * completion_done - Test to see if a completion has any waiters + * @x: completion structure + * + * Return: 0 if there are waiters (wait_for_completion() in progress) + * 1 if there are no waiters. + * + */ +bool completion_done(struct completion *x) +{ + unsigned long flags; + int ret = 1; + + spin_lock_irqsave(&x->wait.lock, flags); + if (!x->done) + ret = 0; + spin_unlock_irqrestore(&x->wait.lock, flags); + return ret; +} +EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5ac63c9a995..1deccd78be9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -513,12 +513,11 @@ static inline void init_hrtick(void) * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -#ifdef CONFIG_SMP void resched_task(struct task_struct *p) { int cpu; - assert_raw_spin_locked(&task_rq(p)->lock); + lockdep_assert_held(&task_rq(p)->lock); if (test_tsk_need_resched(p)) return; @@ -526,8 +525,10 @@ void resched_task(struct task_struct *p) set_tsk_need_resched(p); cpu = task_cpu(p); - if (cpu == smp_processor_id()) + if (cpu == smp_processor_id()) { + set_preempt_need_resched(); return; + } /* NEED_RESCHED must be visible before we test polling */ smp_mb(); @@ -546,6 +547,7 @@ void resched_cpu(int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); } +#ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy cpu for migrating timers @@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq) } } -#else /* !CONFIG_SMP */ -void resched_task(struct task_struct *p) -{ - assert_raw_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); -} #endif /* CONFIG_SMP */ #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ @@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p) static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_queued(p); + sched_info_queued(rq, p); p->sched_class->enqueue_task(rq, p, flags); } static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_dequeued(p); + sched_info_dequeued(rq, p); p->sched_class->dequeue_task(rq, p, flags); } @@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * ttwu() will sort out the placement. */ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); + !(task_preempt_count(p) & PREEMPT_ACTIVE)); #ifdef CONFIG_LOCKDEP /* @@ -1017,6 +1013,107 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } +static void __migrate_swap_task(struct task_struct *p, int cpu) +{ + if (p->on_rq) { + struct rq *src_rq, *dst_rq; + + src_rq = task_rq(p); + dst_rq = cpu_rq(cpu); + + deactivate_task(src_rq, p, 0); + set_task_cpu(p, cpu); + activate_task(dst_rq, p, 0); + check_preempt_curr(dst_rq, p, 0); + } else { + /* + * Task isn't running anymore; make it appear like we migrated + * it before it went to sleep. This means on wakeup we make the + * previous cpu our targer instead of where it really is. + */ + p->wake_cpu = cpu; + } +} + +struct migration_swap_arg { + struct task_struct *src_task, *dst_task; + int src_cpu, dst_cpu; +}; + +static int migrate_swap_stop(void *data) +{ + struct migration_swap_arg *arg = data; + struct rq *src_rq, *dst_rq; + int ret = -EAGAIN; + + src_rq = cpu_rq(arg->src_cpu); + dst_rq = cpu_rq(arg->dst_cpu); + + double_raw_lock(&arg->src_task->pi_lock, + &arg->dst_task->pi_lock); + double_rq_lock(src_rq, dst_rq); + if (task_cpu(arg->dst_task) != arg->dst_cpu) + goto unlock; + + if (task_cpu(arg->src_task) != arg->src_cpu) + goto unlock; + + if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) + goto unlock; + + if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) + goto unlock; + + __migrate_swap_task(arg->src_task, arg->dst_cpu); + __migrate_swap_task(arg->dst_task, arg->src_cpu); + + ret = 0; + +unlock: + double_rq_unlock(src_rq, dst_rq); + raw_spin_unlock(&arg->dst_task->pi_lock); + raw_spin_unlock(&arg->src_task->pi_lock); + + return ret; +} + +/* + * Cross migrate two tasks + */ +int migrate_swap(struct task_struct *cur, struct task_struct *p) +{ + struct migration_swap_arg arg; + int ret = -EINVAL; + + arg = (struct migration_swap_arg){ + .src_task = cur, + .src_cpu = task_cpu(cur), + .dst_task = p, + .dst_cpu = task_cpu(p), + }; + + if (arg.src_cpu == arg.dst_cpu) + goto out; + + /* + * These three tests are all lockless; this is OK since all of them + * will be re-checked with proper locks held further down the line. + */ + if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) + goto out; + + if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) + goto out; + + if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) + goto out; + + ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); + +out: + return ret; +} + struct migration_arg { struct task_struct *task; int dest_cpu; @@ -1236,9 +1333,9 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ static inline -int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { - int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1330,12 +1427,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) if (rq->idle_stamp) { u64 delta = rq_clock(rq) - rq->idle_stamp; - u64 max = 2*sysctl_sched_migration_cost; + u64 max = 2*rq->max_idle_balance_cost; + + update_avg(&rq->avg_idle, delta); - if (delta > max) + if (rq->avg_idle > max) rq->avg_idle = max; - else - update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; } #endif @@ -1396,6 +1494,14 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { + /* + * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting + * TIF_NEED_RESCHED remotely (for the first time) will also send + * this IPI. + */ + if (tif_need_resched()) + set_preempt_need_resched(); + if (llist_empty(&this_rq()->wake_list) && !tick_nohz_full_cpu(smp_processor_id()) && !got_nohz_idle_kick()) @@ -1513,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (p->sched_class->task_waking) p->sched_class->task_waking(p); - cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); @@ -1595,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) * * __sched_fork() is basic setup used by init_idle() too: */ -static void __sched_fork(struct task_struct *p) +static void __sched_fork(unsigned long clone_flags, struct task_struct *p) { p->on_rq = 0; @@ -1619,16 +1725,24 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_NUMA_BALANCING if (p->mm && atomic_read(&p->mm->mm_users) == 1) { - p->mm->numa_next_scan = jiffies; - p->mm->numa_next_reset = jiffies; + p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); p->mm->numa_scan_seq = 0; } + if (clone_flags & CLONE_VM) + p->numa_preferred_nid = current->numa_preferred_nid; + else + p->numa_preferred_nid = -1; + p->node_stamp = 0ULL; p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; - p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; + p->numa_faults = NULL; + p->numa_faults_buffer = NULL; + + INIT_LIST_HEAD(&p->numa_entry); + p->numa_group = NULL; #endif /* CONFIG_NUMA_BALANCING */ } @@ -1654,12 +1768,12 @@ void set_numabalancing_state(bool enabled) /* * fork()/clone()-time setup: */ -void sched_fork(struct task_struct *p) +void sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; int cpu = get_cpu(); - __sched_fork(p); + __sched_fork(clone_flags, p); /* * We mark the process as running here. This guarantees that * nobody will actually run it, and a signal or other external @@ -1717,10 +1831,7 @@ void sched_fork(struct task_struct *p) #if defined(CONFIG_SMP) p->on_cpu = 0; #endif -#ifdef CONFIG_PREEMPT_COUNT - /* Want to start with kernel preemption disabled. */ - task_thread_info(p)->preempt_count = 1; -#endif + init_task_preempt_count(p); #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); #endif @@ -1747,7 +1858,7 @@ void wake_up_new_task(struct task_struct *p) * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug */ - set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); + set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif /* Initialize new task's runnable average */ @@ -1838,7 +1949,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { trace_sched_switch(prev, next); - sched_info_switch(prev, next); + sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); @@ -1890,6 +2001,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { + task_numa_free(prev); + /* * Remove function-return probe instances associated with this * task and put them back on the free list. @@ -2073,7 +2186,7 @@ void sched_exec(void) int dest_cpu; raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); if (dest_cpu == smp_processor_id()) goto unlock; @@ -2215,7 +2328,7 @@ notrace unsigned long get_parent_ip(unsigned long addr) #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) -void __kprobes add_preempt_count(int val) +void __kprobes preempt_count_add(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2224,7 +2337,7 @@ void __kprobes add_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON((pr |