diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cpu.c | 24 | ||||
-rw-r--r-- | kernel/cpuset.c | 2 | ||||
-rw-r--r-- | kernel/rcuclassic.c | 337 | ||||
-rw-r--r-- | kernel/rcupreempt.c | 8 | ||||
-rw-r--r-- | kernel/rcupreempt_trace.c | 7 | ||||
-rw-r--r-- | kernel/sched.c | 377 | ||||
-rw-r--r-- | kernel/sched_fair.c | 234 | ||||
-rw-r--r-- | kernel/sched_features.h | 1 | ||||
-rw-r--r-- | kernel/sched_idletask.c | 6 | ||||
-rw-r--r-- | kernel/sched_rt.c | 57 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 11 | ||||
-rw-r--r-- | kernel/user.c | 4 |
12 files changed, 645 insertions, 423 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c index f17e9854c24..86d49045dae 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param) struct take_cpu_down_param *param = _param; int err; - raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, - param->hcpu); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; + raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, + param->hcpu); + /* Force idle task to run as soon as we yield: it should immediately notice cpu is offline and die quickly. */ sched_idle_next(); @@ -453,6 +454,25 @@ out: } #endif /* CONFIG_PM_SLEEP_SMP */ +/** + * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * @cpu: cpu that just started + * + * This function calls the cpu_chain notifiers with CPU_STARTING. + * It must be called by the arch code on the new cpu, before the new cpu + * enables interrupts and before the "boot" cpu returns from __cpu_up(). + */ +void notify_cpu_starting(unsigned int cpu) +{ + unsigned long val = CPU_STARTING; + +#ifdef CONFIG_PM_SLEEP_SMP + if (cpu_isset(cpu, frozen_cpus)) + val = CPU_STARTING_FROZEN; +#endif /* CONFIG_PM_SLEEP_SMP */ + raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); +} + #endif /* CONFIG_SMP */ /* diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 827cd9adccb..eab7bd6628e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1921,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * that has tasks along with an empty 'mems'. But if we did see such * a cpuset, we'd handle it just like we do if its 'cpus' was empty. */ -static void scan_for_empty_cpusets(const struct cpuset *root) +static void scan_for_empty_cpusets(struct cpuset *root) { LIST_HEAD(queue); struct cpuset *cp; /* scans cpusets being updated */ diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index aad93cdc9f6..37f72e55154 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -47,6 +47,7 @@ #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/mutex.h> +#include <linux/time.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -60,12 +61,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map); static struct rcu_ctrlblk rcu_ctrlblk = { .cur = -300, .completed = -300, + .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, + .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; @@ -83,7 +86,10 @@ static void force_quiescent_state(struct rcu_data *rdp, { int cpu; cpumask_t cpumask; + unsigned long flags; + set_need_resched(); + spin_lock_irqsave(&rcp->lock, flags); if (unlikely(!rcp->signaled)) { rcp->signaled = 1; /* @@ -109,6 +115,7 @@ static void force_quiescent_state(struct rcu_data *rdp, for_each_cpu_mask_nr(cpu, cpumask) smp_send_reschedule(cpu); } + spin_unlock_irqrestore(&rcp->lock, flags); } #else static inline void force_quiescent_state(struct rcu_data *rdp, @@ -118,6 +125,126 @@ static inline void force_quiescent_state(struct rcu_data *rdp, } #endif +static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + long batch; + + head->next = NULL; + smp_mb(); /* Read of rcu->cur must happen after any change by caller. */ + + /* + * Determine the batch number of this callback. + * + * Using ACCESS_ONCE to avoid the following error when gcc eliminates + * local variable "batch" and emits codes like this: + * 1) rdp->batch = rcp->cur + 1 # gets old value + * ...... + * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value + * then [*nxttail[0], *nxttail[1]) may contain callbacks + * that batch# = rdp->batch, see the comment of struct rcu_data. + */ + batch = ACCESS_ONCE(rcp->cur) + 1; + + if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) { + /* process callbacks */ + rdp->nxttail[0] = rdp->nxttail[1]; + rdp->nxttail[1] = rdp->nxttail[2]; + if (rcu_batch_after(batch - 1, rdp->batch)) + rdp->nxttail[0] = rdp->nxttail[2]; + } + + rdp->batch = batch; + *rdp->nxttail[2] = head; + rdp->nxttail[2] = &head->next; + + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_ctrlblk); + } +} + +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; +} + +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) +{ + int cpu; + long delta; + unsigned long flags; + + /* Only let one CPU complain about others per time interval. */ + + spin_lock_irqsave(&rcp->lock, flags); + delta = jiffies - rcp->jiffies_stall; + if (delta < 2 || rcp->cur != rcp->completed) { + spin_unlock_irqrestore(&rcp->lock, flags); + return; + } + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "RCU detected CPU stalls:"); + for_each_possible_cpu(cpu) { + if (cpu_isset(cpu, rcp->cpumask)) + printk(" %d", cpu); + } + printk(" (detected by %d, t=%ld jiffies)\n", + smp_processor_id(), (long)(jiffies - rcp->gp_start)); +} + +static void print_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", + smp_processor_id(), jiffies, + jiffies - rcp->gp_start); + dump_stack(); + spin_lock_irqsave(&rcp->lock, flags); + if ((long)(jiffies - rcp->jiffies_stall) >= 0) + rcp->jiffies_stall = + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + long delta; + + delta = jiffies - rcp->jiffies_stall; + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(rcp); + + } else if (rcp->cur != rcp->completed && delta >= 2) { + + /* They had two seconds to dump stack, so complain. */ + print_other_cpu_stall(rcp); + } +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ +} + +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -133,18 +260,10 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; - struct rcu_data *rdp; head->func = func; - head->next = NULL; local_irq_save(flags); - rdp = &__get_cpu_var(rcu_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_ctrlblk); - } + __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data)); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(call_rcu); @@ -169,20 +288,10 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; - struct rcu_data *rdp; head->func = func; - head->next = NULL; local_irq_save(flags); - rdp = &__get_cpu_var(rcu_bh_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_bh_ctrlblk); - } - + __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(call_rcu_bh); @@ -211,12 +320,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); static inline void raise_rcu_softirq(void) { raise_softirq(RCU_SOFTIRQ); - /* - * The smp_mb() here is required to ensure that this cpu's - * __rcu_process_callbacks() reads the most recently updated - * value of rcu->cur. - */ - smp_mb(); } /* @@ -225,6 +328,7 @@ static inline void raise_rcu_softirq(void) */ static void rcu_do_batch(struct rcu_data *rdp) { + unsigned long flags; struct rcu_head *next, *list; int count = 0; @@ -239,9 +343,9 @@ static void rcu_do_batch(struct rcu_data *rdp) } rdp->donelist = list; - local_irq_disable(); + local_irq_save(flags); rdp->qlen -= count; - local_irq_enable(); + local_irq_restore(flags); if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; @@ -269,6 +373,7 @@ static void rcu_do_batch(struct rcu_data *rdp) * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace * period (if necessary). */ + /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. @@ -276,15 +381,10 @@ static void rcu_do_batch(struct rcu_data *rdp) */ static void rcu_start_batch(struct rcu_ctrlblk *rcp) { - if (rcp->next_pending && + if (rcp->cur != rcp->pending && rcp->completed == rcp->cur) { - rcp->next_pending = 0; - /* - * next_pending == 0 must be visible in - * __rcu_process_callbacks() before it can see new value of cur. - */ - smp_wmb(); rcp->cur++; + record_gp_stall_check_time(rcp); /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a @@ -322,6 +422,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; + if (rdp->quiescbatch != rcp->cur) { /* start new grace period: */ rdp->qs_pending = 1; @@ -345,7 +447,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, return; rdp->qs_pending = 0; - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); /* * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync * during cpu startup. Ignore the quiescent state. @@ -353,7 +455,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, if (likely(rdp->quiescbatch == rcp->cur)) cpu_quiet(rdp->cpu, rcp); - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } @@ -364,33 +466,38 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, * which is dead and hence not processing interrupts. */ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, - struct rcu_head **tail) + struct rcu_head **tail, long batch) { - local_irq_disable(); - *this_rdp->nxttail = list; - if (list) - this_rdp->nxttail = tail; - local_irq_enable(); + unsigned long flags; + + if (list) { + local_irq_save(flags); + this_rdp->batch = batch; + *this_rdp->nxttail[2] = list; + this_rdp->nxttail[2] = tail; + local_irq_restore(flags); + } } static void __rcu_offline_cpu(struct rcu_data *this_rdp, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - /* if the cpu going offline owns the grace period + unsigned long flags; + + /* + * if the cpu going offline owns the grace period * we can block indefinitely waiting for it, so flush * it here */ - spin_lock_bh(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); if (rcp->cur != rcp->completed) cpu_quiet(rdp->cpu, rcp); - spin_unlock_bh(&rcp->lock); - rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); - rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); - rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); + rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); + rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); + spin_unlock(&rcp->lock); - local_irq_disable(); this_rdp->qlen += rdp->qlen; - local_irq_enable(); + local_irq_restore(flags); } static void rcu_offline_cpu(int cpu) @@ -420,38 +527,52 @@ static void rcu_offline_cpu(int cpu) static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { - *rdp->donetail = rdp->curlist; - rdp->donetail = rdp->curtail; - rdp->curlist = NULL; - rdp->curtail = &rdp->curlist; - } + unsigned long flags; + long completed_snap; - if (rdp->nxtlist && !rdp->curlist) { - local_irq_disable(); - rdp->curlist = rdp->nxtlist; - rdp->curtail = rdp->nxttail; - rdp->nxtlist = NULL; - rdp->nxttail = &rdp->nxtlist; - local_irq_enable(); + if (rdp->nxtlist) { + local_irq_save(flags); + completed_snap = ACCESS_ONCE(rcp->completed); /* - * start the next batch of callbacks + * move the other grace-period-completed entries to + * [rdp->nxtlist, *rdp->nxttail[0]) temporarily */ + if (!rcu_batch_before(completed_snap, rdp->batch)) + rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2]; + else if (!rcu_batch_before(completed_snap, rdp->batch - 1)) + rdp->nxttail[0] = rdp->nxttail[1]; - /* determine batch number */ - rdp->batch = rcp->cur + 1; - /* see the comment and corresponding wmb() in - * the rcu_start_batch() + /* + * the grace period for entries in + * [rdp->nxtlist, *rdp->nxttail[0]) has completed and + * move these entries to donelist */ - smp_rmb(); + if (rdp->nxttail[0] != &rdp->nxtlist) { + *rdp->donetail = rdp->nxtlist; + rdp->donetail = rdp->nxttail[0]; + rdp->nxtlist = *rdp->nxttail[0]; + *rdp->donetail = NULL; + + if (rdp->nxttail[1] == rdp->nxttail[0]) + rdp->nxttail[1] = &rdp->nxtlist; + if (rdp->nxttail[2] == rdp->nxttail[0]) + rdp->nxttail[2] = &rdp->nxtlist; + rdp->nxttail[0] = &rdp->nxtlist; + } + + local_irq_restore(flags); + + if (rcu_batch_after(rdp->batch, rcp->pending)) { + unsigned long flags2; - if (!rcp->next_pending) { /* and start it/schedule start if it's a new batch */ - spin_lock(&rcp->lock); - rcp->next_pending = 1; - rcu_start_batch(rcp); - spin_unlock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags2); + if (rcu_batch_after(rdp->batch, rcp->pending)) { + rcp->pending = rdp->batch; + rcu_start_batch(rcp); + } + spin_unlock_irqrestore(&rcp->lock, flags2); } } @@ -462,21 +583,53 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, static void rcu_process_callbacks(struct softirq_action *unused) { + /* + * Memory references from any prior RCU read-side critical sections + * executed by the interrupted code must be see before any RCU + * grace-period manupulations below. + */ + + smp_mb(); /* See above block comment. */ + __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); + + /* + * Memory references from any later RCU read-side critical sections + * executed by the interrupted code must be see after any RCU + * grace-period manupulations above. + */ + + smp_mb(); /* See above block comment. */ } static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - /* This cpu has pending rcu entries and the grace period - * for them has completed. - */ - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) - return 1; + /* Check for CPU stalls, if enabled. */ + check_cpu_stall(rcp); - /* This cpu has no pending entries, but there are new entries */ - if (!rdp->curlist && rdp->nxtlist) - return 1; + if (rdp->nxtlist) { + long completed_snap = ACCESS_ONCE(rcp->completed); + + /* + * This cpu has pending rcu entries and the grace period + * for them has completed. + */ + if (!rcu_batch_before(completed_snap, rdp->batch)) + return 1; + if (!rcu_batch_before(completed_snap, rdp->batch - 1) && + rdp->nxttail[0] != rdp->nxttail[1]) + return 1; + if (rdp->nxttail[0] != &rdp->nxtlist) + return 1; + + /* + * This cpu has pending rcu entries and the new batch + * for then hasn't been started nor scheduled start + */ + if (rcu_batch_after(rdp->batch, rcp->pending)) + return 1; + } /* This cpu has finished callbacks to invoke */ if (rdp->donelist) @@ -512,9 +665,15 @@ int rcu_needs_cpu(int cpu) struct rcu_data *rdp = &per_cpu(rcu_data, cpu); struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); - return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); + return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu); } +/* + * Top-level function driving RCU grace-period detection, normally + * invoked from the scheduler-clock interrupt. This function simply + * increments counters that are read only from softirq by this same + * CPU, so there are no memory barriers required. + */ void rcu_check_callbacks(int cpu, int user) { if (user || @@ -558,14 +717,17 @@ void rcu_check_callbacks(int cpu, int user) static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; + + spin_lock_irqsave(&rcp->lock, flags); memset(rdp, 0, sizeof(*rdp)); - rdp->curtail = &rdp->curlist; - rdp->nxttail = &rdp->nxtlist; + rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; rdp->donetail = &rdp->donelist; rdp->quiescbatch = rcp->completed; rdp->qs_pending = 0; rdp->cpu = cpu; rdp->blimit = blimit; + spin_unlock_irqrestore(&rcp->lock, flags); } static void __cpuinit rcu_online_cpu(int cpu) @@ -610,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = { */ void __init __rcu_init(void) { +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)smp_processor_id()); /* Register notifier for non-boot CPUs */ diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 27827931ca0..ca4bbbe04aa 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -59,14 +59,6 @@ #include <linux/rcupreempt_trace.h> /* - * Macro that prevents the compiler from reordering accesses, but does - * absolutely -nothing- to prevent CPUs from reordering. This is used - * only to mediate communication between mainline code and hardware - * interrupt and NMI handlers. - */ -#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) - -/* * PREEMPT_RCU data structures. */ diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c index 5edf82c34bb..35c2d3360ec 100644 --- a/kernel/rcupreempt_trace.c +++ b/kernel/rcupreempt_trace.c @@ -308,11 +308,16 @@ out: static int __init rcupreempt_trace_init(void) { + int ret; + mutex_init(&rcupreempt_trace_mutex); rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); if (!rcupreempt_trace_buf) return 1; - return rcupreempt_debugfs_init(); + ret = rcupreempt_debugfs_init(); + if (ret) + kfree(rcupreempt_trace_buf); + return ret; } static void __exit rcupreempt_trace_cleanup(void) diff --git a/kernel/sched.c b/kernel/sched.c index ad1962dc0aa..6f230596bd0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -204,11 +204,16 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; } +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (rt_b->rt_runtime == RUNTIME_INF) + if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) @@ -298,9 +303,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_USER_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -604,9 +609,9 @@ struct rq { static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) { - rq->curr->sched_class->check_preempt_curr(rq, p); + rq->curr->sched_class->check_preempt_curr(rq, p, sync); } static inline int cpu_of(struct rq *rq) @@ -1102,7 +1107,7 @@ static void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); } -static void init_hrtick(void) +static inline void init_hrtick(void) { } #endif /* CONFIG_SMP */ @@ -1121,7 +1126,7 @@ static void init_rq_hrtick(struct rq *rq) rq->hrtick_timer.function = hrtick; rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } -#else +#else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) { } @@ -1133,7 +1138,7 @@ static inline void init_rq_hrtick(struct rq *rq) static inline void init_hrtick(void) { } -#endif +#endif /* CONFIG_SCHED_HRTICK */ /* * resched_task - mark a task 'to be rescheduled now'. @@ -1380,38 +1385,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (rq->nr_running) - rq->avg_load_per_task = rq->load.weight / rq->nr_running; - - return rq->avg_load_per_task; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED - -typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) +typedef int (*tg_visitor)(struct task_group *, void *); /* * Iterate the full tree, calling @down when first entering a node and @up when * leaving it for the final time. */ -static void -walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) +static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; + int ret; rcu_read_lock(); parent = &root_task_group; down: - (*down)(parent, cpu, sd); + ret = (*down)(parent, data); + if (ret) + goto out_unlock; list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1419,15 +1410,43 @@ down: up: continue; } - (*up)(parent, cpu, sd); + ret = (*up)(parent, data); + if (ret) + goto out_unlock; child = parent; parent = parent->parent; if (parent) goto up; +out_unlock: rcu_read_unlock(); + + return ret; } +static int tg_nop(struct task_group *tg, void *data) +{ + return 0; +} +#endif + +#ifdef CONFIG_SMP +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->nr_running) + rq->avg_load_per_task = rq->load.weight / rq->nr_running; + + return rq->avg_load_per_task; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED + static void __set_se_shares(struct sched_entity *se, unsigned long shares); /* @@ -1486,11 +1505,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * This needs to be done in a bottom-up fashion because the rq weight of a * parent group depends on the shares of its child groups. */ -static void -tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_shares_up(struct task_group *tg, void *data) { unsigned long rq_weight = 0; unsigned long shares = 0; + struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { @@ -1515,6 +1534,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) __update_group_shares_cpu(tg, i, shares, rq_weight); spin_unlock_irqrestore(&rq->lock, flags); } + + return 0; } /* @@ -1522,10 +1543,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. */ -static void -tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_load_down(struct task_group *tg, void *data) { unsigned long load; + long cpu = (long)data; if (!tg->parent) { load = cpu_rq(cpu)->load.weight; @@ -1536,11 +1557,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) } tg->cfs_rq[cpu]->h_load = load; -} -static void -tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) -{ + return 0; } static void update_shares(struct sched_domain *sd) @@ -1550,7 +1568,7 @@ static void update_shares(struct sched_domain *sd) if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + walk_tg_tree(tg_nop, tg_shares_up, sd); } } @@ -1561,9 +1579,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) spin_lock(&rq->lock); } -static void update_h_load(int cpu) +static void update_h_load(long cpu) { - walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } #else @@ -1921,11 +1939,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) running = task_running(rq, p); on_rq = p->se.on_rq; ncsw = 0; - if (!match_state || p->state == match_state) { - ncsw = p->nivcsw + p->nvcsw; - if (unlikely(!ncsw)) - ncsw = 1; - } + if (!match_state || p->state == match_state) + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, &flags); /* @@ -2285,7 +2300,7 @@ out_running: trace_mark(kernel_sched_wakeup, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -2420,7 +2435,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) trace_mark(kernel_sched_wakeup_new, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@ -2880,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - check_preempt_curr(this_rq, p); + check_preempt_curr(this_rq, p, 0); } /* @@ -4627,6 +4642,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ +/** + * complete: - signals a single thread waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + */ void complete(struct completion *x) { unsigned long flags; @@ -4638,6 +4662,12 @@ void complete(struct completion *x) } EXPORT_SYMBOL(complete); +/** + * complete_all: - signals all threads waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + */ void complete_all(struct completion *x) { unsigned long flags; @@ -4658,10 +4688,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { - if ((state == TASK_INTERRUPTIBLE && - signal_pending(current)) || - (state == TASK_KILLABLE && - fatal_signal_pending(current))) { + if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } @@ -4689,12 +4716,31 @@ wait_for_common(struct completion *x, long timeout, int state) return timeout; } +/** + * wait_for_completion: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ void __sched wait_for_completion(struct completion *x) { wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_for_completion); +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) { @@ -4702,6 +4748,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) } EXPORT_SYMBOL(wait_for_completion_timeout); +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x: holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + */ int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); @@ -4711,6 +4764,14 @@ int __sched wait_for_completion_interruptible(struct completion *x) } EXPORT_SYMBOL(wait_for_completion_interruptible); +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a |