aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c362
1 files changed, 273 insertions, 89 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda71..7f12624a393 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -72,6 +72,7 @@
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/binfmts.h>
+#include <linux/context_tracking.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -82,7 +83,7 @@
#endif
#include "sched.h"
-#include "../workqueue_sched.h"
+#include "../workqueue_internal.h"
#include "../smpboot.h"
#define CREATE_TRACE_POINTS
@@ -192,23 +193,10 @@ static void sched_feat_disable(int i) { };
static void sched_feat_enable(int i) { };
#endif /* HAVE_JUMP_LABEL */
-static ssize_t
-sched_feat_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static int sched_feat_set(char *cmp)
{
- char buf[64];
- char *cmp;
- int neg = 0;
int i;
-
- if (cnt > 63)
- cnt = 63;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
- cmp = strstrip(buf);
+ int neg = 0;
if (strncmp(cmp, "NO_", 3) == 0) {
neg = 1;
@@ -228,6 +216,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
}
}
+ return i;
+}
+
+static ssize_t
+sched_feat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ char *cmp;
+ int i;
+
+ if (cnt > 63)
+ cnt = 63;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+ cmp = strstrip(buf);
+
+ i = sched_feat_set(cmp);
if (i == __SCHED_FEAT_NR)
return -EINVAL;
@@ -922,6 +931,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
rq->skip_clock_update = 1;
}
+static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
+
+void register_task_migration_notifier(struct notifier_block *n)
+{
+ atomic_notifier_chain_register(&task_migration_notifier, n);
+}
+
#ifdef CONFIG_SMP
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
@@ -952,8 +968,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
trace_sched_migrate_task(p, new_cpu);
if (task_cpu(p) != new_cpu) {
+ struct task_migration_notifier tmn;
+
+ if (p->sched_class->migrate_task_rq)
+ p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
+
+ tmn.task = p;
+ tmn.from_cpu = task_cpu(p);
+ tmn.to_cpu = new_cpu;
+
+ atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
}
__set_task_cpu(p, new_cpu);
@@ -1106,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process);
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
- const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+ int nid = cpu_to_node(cpu);
+ const struct cpumask *nodemask = NULL;
enum { cpuset, possible, fail } state = cpuset;
int dest_cpu;
- /* Look for allowed, online CPU in same node. */
- for_each_cpu(dest_cpu, nodemask) {
- if (!cpu_online(dest_cpu))
- continue;
- if (!cpu_active(dest_cpu))
- continue;
- if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- return dest_cpu;
+ /*
+ * If the node that the cpu is on has been offlined, cpu_to_node()
+ * will return -1. There is no cpu on the node, and we should
+ * select the cpu on the other node.
+ */
+ if (nid != -1) {
+ nodemask = cpumask_of_node(nid);
+
+ /* Look for allowed, online CPU in same node. */
+ for_each_cpu(dest_cpu, nodemask) {
+ if (!cpu_online(dest_cpu))
+ continue;
+ if (!cpu_active(dest_cpu))
+ continue;
+ if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ return dest_cpu;
+ }
}
for (;;) {
@@ -1497,7 +1533,8 @@ out:
*/
int wake_up_process(struct task_struct *p)
{
- return try_to_wake_up(p, TASK_ALL, 0);
+ WARN_ON(task_is_stopped_or_traced(p));
+ return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);
@@ -1524,6 +1561,15 @@ static void __sched_fork(struct task_struct *p)
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+ p->se.avg.runnable_avg_period = 0;
+ p->se.avg.runnable_avg_sum = 0;
+#endif
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
@@ -1533,7 +1579,40 @@ static void __sched_fork(struct task_struct *p)
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+ if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+ p->mm->numa_next_scan = jiffies;
+ p->mm->numa_next_reset = jiffies;
+ p->mm->numa_scan_seq = 0;
+ }
+
+ p->node_stamp = 0ULL;
+ p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+ p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_work.next = &p->numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+#ifdef CONFIG_SCHED_DEBUG
+void set_numabalancing_state(bool enabled)
+{
+ if (enabled)
+ sched_feat_set("NUMA");
+ else
+ sched_feat_set("NO_NUMA");
+}
+#else
+__read_mostly bool numabalancing_enabled;
+
+void set_numabalancing_state(bool enabled)
+{
+ numabalancing_enabled = enabled;
}
+#endif /* CONFIG_SCHED_DEBUG */
+#endif /* CONFIG_NUMA_BALANCING */
/*
* fork()/clone()-time setup:
@@ -1673,9 +1752,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
struct preempt_notifier *notifier;
- struct hlist_node *node;
- hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
@@ -1684,9 +1762,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
struct preempt_notifier *notifier;
- struct hlist_node *node;
- hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
notifier->ops->sched_out(notifier, next);
}
@@ -1886,8 +1963,8 @@ context_switch(struct rq *rq, struct task_struct *prev,
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif
+ context_tracking_task_switch(prev, next);
/* Here we just switch the register state and the stack. */
- rcu_switch(prev, next);
switch_to(prev, next, prev);
barrier();
@@ -1900,11 +1977,10 @@ context_switch(struct rq *rq, struct task_struct *prev,
}
/*
- * nr_running, nr_uninterruptible and nr_context_switches:
+ * nr_running and nr_context_switches:
*
* externally visible scheduler statistics: current number of runnable
- * threads, current number of uninterruptible-sleeping threads, total
- * number of context switches performed since bootup.
+ * threads, total number of context switches performed since bootup.
*/
unsigned long nr_running(void)
{
@@ -1916,23 +1992,6 @@ unsigned long nr_running(void)
return sum;
}
-unsigned long nr_uninterruptible(void)
-{
- unsigned long i, sum = 0;
-
- for_each_possible_cpu(i)
- sum += cpu_rq(i)->nr_uninterruptible;
-
- /*
- * Since we read the counters lockless, it might be slightly
- * inaccurate. Do not allow it to go below zero though:
- */
- if (unlikely((long)sum < 0))
- sum = 0;
-
- return sum;
-}
-
unsigned long long nr_context_switches(void)
{
int i;
@@ -2717,7 +2776,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
if (irqs_disabled())
print_irqtrace_events(prev);
dump_stack();
- add_taint(TAINT_WARN);
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
/*
@@ -2911,7 +2970,7 @@ asmlinkage void __sched schedule(void)
}
EXPORT_SYMBOL(schedule);
-#ifdef CONFIG_RCU_USER_QS
+#ifdef CONFIG_CONTEXT_TRACKING
asmlinkage void __sched schedule_user(void)
{
/*
@@ -2920,9 +2979,9 @@ asmlinkage void __sched schedule_user(void)
* we haven't yet exited the RCU idle mode. Do it here manually until
* we find a better solution.
*/
- rcu_user_exit();
+ user_exit();
schedule();
- rcu_user_enter();
+ user_enter();
}
#endif
@@ -3027,7 +3086,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
/* Catch callers which need to be fixed */
BUG_ON(ti->preempt_count || !irqs_disabled());
- rcu_user_exit();
+ user_exit();
do {
add_preempt_count(PREEMPT_ACTIVE);
local_irq_enable();
@@ -3199,7 +3258,8 @@ void complete_all(struct completion *x)
EXPORT_SYMBOL(complete_all);
static inline long __sched
-do_wait_for_common(struct completion *x, long timeout, int state)
+do_wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
{
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);
@@ -3212,7 +3272,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
}
__set_current_state(state);
spin_unlock_irq(&x->wait.lock);
- timeout = schedule_timeout(timeout);
+ timeout = action(timeout);
spin_lock_irq(&x->wait.lock);
} while (!x->done && timeout);
__remove_wait_queue(&x->wait, &wait);
@@ -3223,17 +3283,30 @@ do_wait_for_common(struct completion *x, long timeout, int state)
return timeout ?: 1;
}
-static long __sched
-wait_for_common(struct completion *x, long timeout, int state)
+static inline long __sched
+__wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
{
might_sleep();
spin_lock_irq(&x->wait.lock);
- timeout = do_wait_for_common(x, timeout, state);
+ timeout = do_wait_for_common(x, action, timeout, state);
spin_unlock_irq(&x->wait.lock);
return timeout;
}
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, schedule_timeout, timeout, state);
+}
+
+static long __sched
+wait_for_common_io(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, io_schedule_timeout, timeout, state);
+}
+
/**
* wait_for_completion: - waits for completion of a task
* @x: holds the state of this particular completion
@@ -3270,6 +3343,39 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
EXPORT_SYMBOL(wait_for_completion_timeout);
/**
+ * wait_for_completion_io: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout. The caller is accounted as waiting
+ * for IO.
+ */
+void __sched wait_for_completion_io(struct completion *x)
+{
+ wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io);
+
+/**
+ * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible. The caller is accounted as waiting for IO.
+ *
+ * The return value is 0 if timed out, and positive (at least 1, or number of
+ * jiffies left till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io_timeout);
+
+/**
* wait_for_completion_interruptible: - waits for completion of a task (w/intr)
* @x: holds the state of this particular completion
*
@@ -4029,8 +4135,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
goto out_free_cpus_allowed;
}
retval = -EPERM;
- if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
- goto out_unlock;
+ if (!check_same_owner(p)) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
+ rcu_read_unlock();
+ }
retval = security_task_setscheduler(p);
if (retval)
@@ -4289,20 +4401,32 @@ EXPORT_SYMBOL(yield);
* It's the caller's job to ensure that the target task struct
* can't go away on us before we can do any checks.
*
- * Returns true if we indeed boosted the target task.
+ * Returns:
+ * true (>0) if we indeed boosted the target task.
+ * false (0) if we failed to boost the target.
+ * -ESRCH if there's no task to yield to.
*/
bool __sched yield_to(struct task_struct *p, bool preempt)
{
struct task_struct *curr = current;
struct rq *rq, *p_rq;
unsigned long flags;
- bool yielded = 0;
+ int yielded = 0;
local_irq_save(flags);
rq = this_rq();
again:
p_rq = task_rq(p);
+ /*
+ * If we're the only runnable task on the rq and target rq also
+ * has only one task, there's absolutely no point in yielding.
+ */
+ if (rq->nr_running == 1 && p_rq->nr_running == 1) {
+ yielded = -ESRCH;
+ goto out_irq;
+ }
+
double_rq_lock(rq, p_rq);
while (task_rq(p) != p_rq) {
double_rq_unlock(rq, p_rq);
@@ -4310,13 +4434,13 @@ again:
}
if (!curr->sched_class->yield_to_task)
- goto out;
+ goto out_unlock;
if (curr->sched_class != p->sched_class)
- goto out;
+ goto out_unlock;
if (task_running(p_rq, p) || p->state)
- goto out;
+ goto out_unlock;
yielded = curr->sched_class->yield_to_task(rq, p, preempt);
if (yielded) {
@@ -4329,11 +4453,12 @@ again:
resched_task(p_rq->curr);
}
-out:
+out_unlock:
double_rq_unlock(rq, p_rq);
+out_irq:
local_irq_restore(flags);
- if (yielded)
+ if (yielded > 0)
schedule();
return yielded;
@@ -4474,6 +4599,7 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
+ int ppid;
unsigned state;
state = p->state ? __ffs(p->state) + 1 : 0;
@@ -4493,8 +4619,11 @@ void sched_show_task(struct task_struct *p)
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
+ rcu_read_lock();
+ ppid = task_pid_nr(rcu_dereference(p->real_parent));
+ rcu_read_unlock();
printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
- task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
+ task_pid_nr(p), ppid,
(unsigned long)task_thread_info(p)->flags);
show_stack(p, NULL);
@@ -4588,6 +4717,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
*/
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
+ vtime_init_idle(idle);
#if defined(CONFIG_SMP)
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
@@ -7081,7 +7211,6 @@ static void free_sched_group(struct task_group *tg)
struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
- unsigned long flags;
tg = kzalloc(sizeof(*tg), GFP_KERNEL);
if (!tg)
@@ -7093,6 +7222,17 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
+ return tg;
+
+err:
+ free_sched_group(tg);
+ return ERR_PTR(-ENOMEM);
+}
+
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+ unsigned long flags;
+
spin_lock_irqsave(&task_group_lock, flags);
list_add_rcu(&tg->list, &task_groups);
@@ -7102,12 +7242,6 @@ struct task_group *sched_create_group(struct task_group *parent)
INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
-
- return tg;
-
-err:
- free_sched_group(tg);
- return ERR_PTR(-ENOMEM);
}
/* rcu callback to free various structures associated with a task group */
@@ -7120,6 +7254,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
/* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg)
{
+ /* wait for possible concurrent references to cfs_rqs complete */
+ call_rcu(&tg->rcu, free_sched_group_rcu);
+}
+
+void sched_offline_group(struct task_group *tg)
+{
unsigned long flags;
int i;
@@ -7131,9 +7271,6 @@ void sched_destroy_group(struct task_group *tg)
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
-
- /* wait for possible concurrent references to cfs_rqs complete */
- call_rcu(&tg->rcu, free_sched_group_rcu);
}
/* change task's runqueue when it moves between groups.
@@ -7429,6 +7566,25 @@ static int sched_rt_global_constraints(void)
}
#endif /* CONFIG_RT_GROUP_SCHED */
+int sched_rr_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ static DEFINE_MUTEX(mutex);
+
+ mutex_lock(&mutex);
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ /* make sure that internally we keep jiffies */
+ /* also, writing zero resets timeslice to default */
+ if (!ret && write) {
+ sched_rr_timeslice = sched_rr_timeslice <= 0 ?
+ RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+ }
+ mutex_unlock(&mutex);
+ return ret;
+}
+
int sched_rt_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -7468,7 +7624,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
struct task_group, css);
}
-static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
+static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
{
struct task_group *tg, *parent;
@@ -7485,13 +7641,33 @@ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
return &tg->css;
}
-static void cpu_cgroup_destroy(struct cgroup *cgrp)
+static int cpu_cgroup_css_online(struct cgroup *cgrp)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *parent;
+
+ if (!cgrp->parent)
+ return 0;
+
+ parent = cgroup_tg(cgrp->parent);
+ sched_online_group(tg, parent);
+ return 0;
+}
+
+static void cpu_cgroup_css_free(struct cgroup *cgrp)
{
struct task_group *tg = cgroup_tg(cgrp);
sched_destroy_group(tg);
}
+static void cpu_cgroup_css_offline(struct cgroup *cgrp)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+
+ sched_offline_group(tg);
+}
+
static int cpu_cgroup_can_attach(struct cgroup *cgrp,
struct cgroup_taskset *tset)
{
@@ -7845,8 +8021,10 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgroup_subsys = {
.name = "cpu",
- .create = cpu_cgroup_create,
- .destroy = cpu_cgroup_destroy,
+ .css_alloc = cpu_cgroup_css_alloc,
+ .css_free = cpu_cgroup_css_free,
+ .css_online = cpu_cgroup_css_online,
+ .css_offline = cpu_cgroup_css_offline,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
@@ -7869,7 +8047,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
struct cpuacct root_cpuacct;
/* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
+static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
{
struct cpuacct *ca;
@@ -7899,7 +8077,7 @@ out:
}
/* destroy an existing cpu accounting group */
-static void cpuacct_destroy(struct cgroup *cgrp)
+static void cpuacct_css_free(struct cgroup *cgrp)
{
struct cpuacct *ca = cgroup_ca(cgrp);
@@ -8070,9 +8248,15 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
struct cgroup_subsys cpuacct_subsys = {
.name = "cpuacct",
- .create = cpuacct_create,
- .destroy = cpuacct_destroy,
+ .css_alloc = cpuacct_css_alloc,
+ .css_free = cpuacct_css_free,
.subsys_id = cpuacct_subsys_id,
.base_cftypes = files,
};
#endif /* CONFIG_CGROUP_CPUACCT */
+
+void dump_cpu_task(int cpu)
+{
+ pr_info("Task dump for CPU %d:\n", cpu);
+ sched_show_task(cpu_curr(cpu));
+}