From af446b702c58b700cc5fa99f6edc78b99e55b995 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 10 Sep 2011 21:54:08 -0700 Subject: rcu: ->signaled better named ->fqs_state The ->signaled field was named before complications in the form of dyntick-idle mode and offlined CPUs. These complications have required that force_quiescent_state() be implemented as a state machine, instead of simply unconditionally sending reschedule IPIs. Therefore, this commit renames ->signaled to ->fqs_state to catch up with the new force_quiescent_state() reality. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 16 ++++++++-------- kernel/rcutree.h | 4 ++-- kernel/rcutree_trace.c | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6b76d812740..5d0b55a3a8c 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -69,7 +69,7 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; NUM_RCU_LVL_3, \ NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ }, \ - .signaled = RCU_GP_IDLE, \ + .fqs_state = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ @@ -866,8 +866,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* Advance to a new grace period and initialize state. */ rsp->gpnum++; trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); - WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); - rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ + WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); + rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); @@ -877,7 +877,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; - rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ + rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */ rcu_start_gp_per_cpu(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); trace_rcu_grace_period_init(rsp->name, rnp->gpnum, @@ -927,7 +927,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rnp = rcu_get_root(rsp); raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ + rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } @@ -991,7 +991,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ trace_rcu_grace_period(rsp->name, rsp->completed, "end"); - rsp->signaled = RCU_GP_IDLE; + rsp->fqs_state = RCU_GP_IDLE; rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ } @@ -1457,7 +1457,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) goto unlock_fqs_ret; /* no GP in progress, time updated. 
*/ } rsp->fqs_active = 1; - switch (rsp->signaled) { + switch (rsp->fqs_state) { case RCU_GP_IDLE: case RCU_GP_INIT: @@ -1473,7 +1473,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) force_qs_rnp(rsp, dyntick_save_progress_counter); raw_spin_lock(&rnp->lock); /* irqs already disabled */ if (rcu_gp_in_progress(rsp)) - rsp->signaled = RCU_FORCE_QS; + rsp->fqs_state = RCU_FORCE_QS; break; case RCU_FORCE_QS: diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 849ce9ec51f..517f2f89a29 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -302,7 +302,7 @@ struct rcu_data { struct rcu_state *rsp; }; -/* Values for signaled field in struct rcu_state. */ +/* Values for fqs_state field in struct rcu_state. */ #define RCU_GP_IDLE 0 /* No grace period in progress. */ #define RCU_GP_INIT 1 /* Grace period being initialized. */ #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ @@ -361,7 +361,7 @@ struct rcu_state { /* The following fields are guarded by the root rcu_node's lock. */ - u8 signaled ____cacheline_internodealigned_in_smp; + u8 fqs_state ____cacheline_internodealigned_in_smp; /* Force QS state. */ u8 fqs_active; /* force_quiescent_state() */ /* is running. */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 9feffa4c069..59c7bee4ce0 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -278,7 +278,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) gpnum = rsp->gpnum; seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", - rsp->completed, gpnum, rsp->signaled, + rsp->completed, gpnum, rsp->fqs_state, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), rsp->n_force_qs, rsp->n_force_qs_ngp, -- cgit v1.2.3-18-g5258 From 389abd48efe1ceacb141b2fd151263b1bc432dbc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 21 Sep 2011 14:41:37 -0700 Subject: rcu: Avoid RCU-preempt expedited grace-period botch Because rcu_read_unlock_special() samples rcu_preempted_readers_exp(rnp) after dropping rnp->lock, the following sequence of events is possible: 1. Task A exits its RCU read-side critical section, and removes itself from the ->blkd_tasks list, releases rnp->lock, and is then preempted. Task B remains on the ->blkd_tasks list, and blocks the current expedited grace period. 2. Task B exits from its RCU read-side critical section and removes itself from the ->blkd_tasks list. Because it is the last task blocking the current expedited grace period, it ends that expedited grace period. 3. Task A resumes, and samples rcu_preempted_readers_exp(rnp) which of course indicates that nothing is blocking the nonexistent expedited grace period. Task A is again preempted. 4. Some other CPU starts an expedited grace period. There are several tasks blocking this expedited grace period queued on the same rcu_node structure that Task A was using in step 1 above. 5. Task A examines its state and incorrectly concludes that it was the last task blocking the expedited grace period on the current rcu_node structure. It therefore reports completion up the rcu_node tree. 6. The expedited grace period can then incorrectly complete before the tasks blocked on this same rcu_node structure exit their RCU read-side critical sections. Arbitrarily bad things happen. This commit therefore takes a snapshot of rcu_preempted_readers_exp(rnp) prior to dropping the lock, so that only the last task thinks that it is the last task, thus avoiding the failure scenario laid out above. 
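To make the pattern of this fix easier to follow before the diff below, here is a minimal user-space sketch (illustrative only; the lock, counter, and helper names are hypothetical and are not RCU internals, and this is not part of the patch): the "am I the last blocker?" condition is sampled while the lock is still held, and only that snapshot is consulted after the lock is dropped, so a later grace period's state cannot be mistaken for the current one's.

/*
 * Illustrative sketch only -- hypothetical names, not kernel code.
 * Re-reading nr_exp_blocked after unlocking re-opens the race in
 * steps 1-6 above; acting on the snapshot taken under the lock does not.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int nr_exp_blocked;	/* readers blocking the current expedited GP */

static void report_exp_done(void)
{
	printf("expedited grace period complete\n");
}

static void reader_done(void)
{
	bool last;

	pthread_mutex_lock(&lock);
	nr_exp_blocked--;
	last = (nr_exp_blocked == 0);	/* snapshot taken under the lock */
	pthread_mutex_unlock(&lock);

	/*
	 * Re-reading nr_exp_blocked here could observe a later expedited
	 * grace period's count; the snapshot taken above cannot.
	 */
	if (last)
		report_exp_done();
}

int main(void)
{
	nr_exp_blocked = 1;	/* pretend one reader blocks the expedited GP */
	reader_done();
	return 0;
}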
Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 4b9b9f8a418..79860531716 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -312,6 +312,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; + int empty_exp_now; unsigned long flags; struct list_head *np; #ifdef CONFIG_RCU_BOOST @@ -382,8 +383,10 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) /* * If this was the last task on the current list, and if * we aren't waiting on any CPUs, report the quiescent state. - * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. + * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, + * so we must take a snapshot of the expedited state. */ + empty_exp_now = !rcu_preempted_readers_exp(rnp); if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report("preempt_rcu", rnp->gpnum, @@ -406,7 +409,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) * If this was the last task on the expedited lists, * then we need to report up the rcu_node hierarchy. */ - if (!empty_exp && !rcu_preempted_readers_exp(rnp)) + if (!empty_exp && empty_exp_now) rcu_report_exp_rnp(&rcu_preempt_state, rnp); } else { local_irq_restore(flags); -- cgit v1.2.3-18-g5258 From 7077714ec4940a6c5b1189c3afb4f47bf49ad877 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Sep 2011 13:18:44 -0700 Subject: rcu: Make synchronize_sched_expedited() better at work sharing When synchronize_sched_expedited() takes its second and subsequent snapshots of sync_sched_expedited_started, it subtracts 1. This means that even though the concurrent caller of synchronize_sched_expedited() that incremented to that value sees our successful completion, it will not be able to take advantage of it. This restriction is pointless, given that our full expedited grace period would have happened after the other guy started, and thus should be able to serve as a proxy for the other guy successfully executing try_stop_cpus(). This commit therefore removes the subtraction of 1. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 79860531716..708dc579634 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1910,7 +1910,7 @@ void synchronize_sched_expedited(void) * grace period works for us. */ get_online_cpus(); - snap = atomic_read(&sync_sched_expedited_started) - 1; + snap = atomic_read(&sync_sched_expedited_started); smp_mb(); /* ensure read is before try_stop_cpus(). */ } -- cgit v1.2.3-18-g5258 From 9b2e4f1880b789be1f24f9684f7a54b90310b5c0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 30 Sep 2011 12:10:22 -0700 Subject: rcu: Track idleness independent of idle tasks Earlier versions of RCU used the scheduling-clock tick to detect idleness by checking for the idle task, but handled idleness differently for CONFIG_NO_HZ=y. But there are now a number of uses of RCU read-side critical sections in the idle task, for example, for tracing. A more fine-grained detection of idleness is therefore required. 
This commit presses the old dyntick-idle code into full-time service, so that rcu_idle_enter(), previously known as rcu_enter_nohz(), is always invoked at the beginning of an idle loop iteration. Similarly, rcu_idle_exit(), previously known as rcu_exit_nohz(), is always invoked at the end of an idle-loop iteration. This allows the idle task to use RCU everywhere except between consecutive rcu_idle_enter() and rcu_idle_exit() calls, in turn allowing architecture maintainers to specify exactly where in the idle loop that RCU may be used. Because some of the userspace upcall uses can result in what looks to RCU like half of an interrupt, it is not possible to expect that the irq_enter() and irq_exit() hooks will give exact counts. This patch therefore expands the ->dynticks_nesting counter to 64 bits and uses two separate bitfields to count process/idle transitions and interrupt entry/exit transitions. It is presumed that userspace upcalls do not happen in the idle loop or from usermode execution (though usermode might do a system call that results in an upcall). The counter is hard-reset on each process/idle transition, which avoids the interrupt entry/exit error from accumulating. Overflow is avoided by the 64-bitness of the ->dyntick_nesting counter. This commit also adds warnings if a non-idle task asks RCU to enter idle state (and these checks will need some adjustment before applying Frederic's OS-jitter patches (http://lkml.org/lkml/2011/10/7/246). In addition, validation of ->dynticks and ->dynticks_nesting is added. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny.c | 124 +++++++++++++++++++++---- kernel/rcutree.c | 229 +++++++++++++++++++++++++++++++++-------------- kernel/rcutree.h | 15 +--- kernel/rcutree_trace.c | 10 +-- kernel/time/tick-sched.c | 6 +- 5 files changed, 278 insertions(+), 106 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 636af6d9c6e..3ab77bdc90c 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -53,31 +53,122 @@ static void __call_rcu(struct rcu_head *head, #include "rcutiny_plugin.h" -#ifdef CONFIG_NO_HZ +static long long rcu_dynticks_nesting = LLONG_MAX / 2; -static long rcu_dynticks_nesting = 1; +/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ +static void rcu_idle_enter_common(void) +{ + if (rcu_dynticks_nesting) { + RCU_TRACE(trace_rcu_dyntick("--=", rcu_dynticks_nesting)); + return; + } + RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting)); + if (!idle_cpu(smp_processor_id())) { + WARN_ON_ONCE(1); /* must be idle task! */ + RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", + rcu_dynticks_nesting)); + ftrace_dump(DUMP_ALL); + } + rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ +} /* - * Enter dynticks-idle mode, which is an extended quiescent state - * if we have fully entered that mode (i.e., if the new value of - * dynticks_nesting is zero). + * Enter idle, which is an extended quiescent state if we have fully + * entered that mode (i.e., if the new value of dynticks_nesting is zero). */ -void rcu_enter_nohz(void) +void rcu_idle_enter(void) { - if (--rcu_dynticks_nesting == 0) - rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ + unsigned long flags; + + local_irq_save(flags); + rcu_dynticks_nesting = 0; + rcu_idle_enter_common(); + local_irq_restore(flags); } /* - * Exit dynticks-idle mode, so that we are no longer in an extended - * quiescent state. 
+ * Exit an interrupt handler towards idle. + */ +void rcu_irq_exit(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_dynticks_nesting--; + WARN_ON_ONCE(rcu_dynticks_nesting < 0); + rcu_idle_enter_common(); + local_irq_restore(flags); +} + +/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ +static void rcu_idle_exit_common(long long oldval) +{ + if (oldval) { + RCU_TRACE(trace_rcu_dyntick("++=", rcu_dynticks_nesting)); + return; + } + RCU_TRACE(trace_rcu_dyntick("End", oldval)); + if (!idle_cpu(smp_processor_id())) { + WARN_ON_ONCE(1); /* must be idle task! */ + RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", + oldval)); + ftrace_dump(DUMP_ALL); + } +} + +/* + * Exit idle, so that we are no longer in an extended quiescent state. */ -void rcu_exit_nohz(void) +void rcu_idle_exit(void) { + unsigned long flags; + long long oldval; + + local_irq_save(flags); + oldval = rcu_dynticks_nesting; + WARN_ON_ONCE(oldval != 0); + rcu_dynticks_nesting = LLONG_MAX / 2; + rcu_idle_exit_common(oldval); + local_irq_restore(flags); +} + +/* + * Enter an interrupt handler, moving away from idle. + */ +void rcu_irq_enter(void) +{ + unsigned long flags; + long long oldval; + + local_irq_save(flags); + oldval = rcu_dynticks_nesting; rcu_dynticks_nesting++; + WARN_ON_ONCE(rcu_dynticks_nesting == 0); + rcu_idle_exit_common(oldval); + local_irq_restore(flags); +} + +#ifdef CONFIG_PROVE_RCU + +/* + * Test whether RCU thinks that the current CPU is idle. + */ +int rcu_is_cpu_idle(void) +{ + return !rcu_dynticks_nesting; } -#endif /* #ifdef CONFIG_NO_HZ */ +#endif /* #ifdef CONFIG_PROVE_RCU */ + +/* + * Test whether the current CPU was interrupted from idle. Nested + * interrupts don't count, we must be running at the first interrupt + * level. + */ +int rcu_is_cpu_rrupt_from_idle(void) +{ + return rcu_dynticks_nesting <= 0; +} /* * Helper function for rcu_sched_qs() and rcu_bh_qs(). @@ -126,14 +217,13 @@ void rcu_bh_qs(int cpu) /* * Check to see if the scheduling-clock interrupt came from an extended - * quiescent state, and, if so, tell RCU about it. + * quiescent state, and, if so, tell RCU about it. This function must + * be called from hardirq context. It is normally called from the + * scheduling-clock interrupt. */ void rcu_check_callbacks(int cpu, int user) { - if (user || - (idle_cpu(cpu) && - !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) + if (user || rcu_is_cpu_rrupt_from_idle()) rcu_sched_qs(cpu); else if (!in_softirq()) rcu_bh_qs(cpu); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5d0b55a3a8c..1c40326724f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -195,12 +195,10 @@ void rcu_note_context_switch(int cpu) } EXPORT_SYMBOL_GPL(rcu_note_context_switch); -#ifdef CONFIG_NO_HZ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = 1, + .dynticks_nesting = LLONG_MAX / 2, .dynticks = ATOMIC_INIT(1), }; -#endif /* #ifdef CONFIG_NO_HZ */ static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static int qhimark = 10000; /* If this many pending, ignore blimit. */ @@ -328,11 +326,11 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) return 1; } - /* If preemptible RCU, no point in sending reschedule IPI. */ - if (rdp->preemptible) - return 0; - - /* The CPU is online, so send it a reschedule IPI. */ + /* + * The CPU is online, so send it a reschedule IPI. 
This forces + * it through the scheduler, and (inefficiently) also handles cases + * where idle loops fail to inform RCU about the CPU being idle. + */ if (rdp->cpu != smp_processor_id()) smp_send_reschedule(rdp->cpu); else @@ -343,51 +341,97 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) #endif /* #ifdef CONFIG_SMP */ -#ifdef CONFIG_NO_HZ +/* + * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle + * + * If the new value of the ->dynticks_nesting counter now is zero, + * we really have entered idle, and must do the appropriate accounting. + * The caller must have disabled interrupts. + */ +static void rcu_idle_enter_common(struct rcu_dynticks *rdtp) +{ + if (rdtp->dynticks_nesting) { + trace_rcu_dyntick("--=", rdtp->dynticks_nesting); + return; + } + trace_rcu_dyntick("Start", rdtp->dynticks_nesting); + if (!idle_cpu(smp_processor_id())) { + WARN_ON_ONCE(1); /* must be idle task! */ + trace_rcu_dyntick("Error on entry: not idle task", + rdtp->dynticks_nesting); + ftrace_dump(DUMP_ALL); + } + /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ + smp_mb__before_atomic_inc(); /* See above. */ + atomic_inc(&rdtp->dynticks); + smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ + WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); +} /** - * rcu_enter_nohz - inform RCU that current CPU is entering nohz + * rcu_idle_enter - inform RCU that current CPU is entering idle * - * Enter nohz mode, in other words, -leave- the mode in which RCU + * Enter idle mode, in other words, -leave- the mode in which RCU * read-side critical sections can occur. (Though RCU read-side - * critical sections can occur in irq handlers in nohz mode, a possibility - * handled by rcu_irq_enter() and rcu_irq_exit()). + * critical sections can occur in irq handlers in idle, a possibility + * handled by irq_enter() and irq_exit().) + * + * We crowbar the ->dynticks_nesting field to zero to allow for + * the possibility of usermode upcalls having messed up our count + * of interrupt nesting level during the prior busy period. */ -void rcu_enter_nohz(void) +void rcu_idle_enter(void) { unsigned long flags; struct rcu_dynticks *rdtp; local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - if (--rdtp->dynticks_nesting) { - local_irq_restore(flags); - return; - } - trace_rcu_dyntick("Start"); - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic_inc(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ - WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + rdtp->dynticks_nesting = 0; + rcu_idle_enter_common(rdtp); local_irq_restore(flags); } -/* - * rcu_exit_nohz - inform RCU that current CPU is leaving nohz +/** + * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle + * + * Exit from an interrupt handler, which might possibly result in entering + * idle mode, in other words, leaving the mode in which read-side critical + * sections can occur. * - * Exit nohz mode, in other words, -enter- the mode in which RCU - * read-side critical sections normally occur. + * This code assumes that the idle loop never does anything that might + * result in unbalanced calls to irq_enter() and irq_exit(). If your + * architecture violates this assumption, RCU will give you what you + * deserve, good and hard. But very infrequently and irreproducibly. + * + * Use things like work queues to work around this limitation. + * + * You have been warned. 
*/ -void rcu_exit_nohz(void) +void rcu_irq_exit(void) { unsigned long flags; struct rcu_dynticks *rdtp; local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - if (rdtp->dynticks_nesting++) { - local_irq_restore(flags); + rdtp->dynticks_nesting--; + WARN_ON_ONCE(rdtp->dynticks_nesting < 0); + rcu_idle_enter_common(rdtp); + local_irq_restore(flags); +} + +/* + * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle + * + * If the new value of the ->dynticks_nesting counter was previously zero, + * we really have exited idle, and must do the appropriate accounting. + * The caller must have disabled interrupts. + */ +static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) +{ + if (oldval) { + trace_rcu_dyntick("++=", rdtp->dynticks_nesting); return; } smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ @@ -395,7 +439,71 @@ void rcu_exit_nohz(void) /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ smp_mb__after_atomic_inc(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); - trace_rcu_dyntick("End"); + trace_rcu_dyntick("End", oldval); + if (!idle_cpu(smp_processor_id())) { + WARN_ON_ONCE(1); /* must be idle task! */ + trace_rcu_dyntick("Error on exit: not idle task", oldval); + ftrace_dump(DUMP_ALL); + } +} + +/** + * rcu_idle_exit - inform RCU that current CPU is leaving idle + * + * Exit idle mode, in other words, -enter- the mode in which RCU + * read-side critical sections can occur. + * + * We crowbar the ->dynticks_nesting field to LLONG_MAX/2 to allow for + * the possibility of usermode upcalls messing up our count + * of interrupt nesting level during the busy period that is just + * now starting. + */ +void rcu_idle_exit(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + long long oldval; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + oldval = rdtp->dynticks_nesting; + WARN_ON_ONCE(oldval != 0); + rdtp->dynticks_nesting = LLONG_MAX / 2; + rcu_idle_exit_common(rdtp, oldval); + local_irq_restore(flags); +} + +/** + * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle + * + * Enter an interrupt handler, which might possibly result in exiting + * idle mode, in other words, entering the mode in which read-side critical + * sections can occur. + * + * Note that the Linux kernel is fully capable of entering an interrupt + * handler that it never exits, for example when doing upcalls to + * user mode! This code assumes that the idle loop never does upcalls to + * user mode. If your architecture does do upcalls from the idle loop (or + * does anything else that results in unbalanced calls to the irq_enter() + * and irq_exit() functions), RCU will give you what you deserve, good + * and hard. But very infrequently and irreproducibly. + * + * Use things like work queues to work around this limitation. + * + * You have been warned. 
+ */ +void rcu_irq_enter(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + long long oldval; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + oldval = rdtp->dynticks_nesting; + rdtp->dynticks_nesting++; + WARN_ON_ONCE(rdtp->dynticks_nesting == 0); + rcu_idle_exit_common(rdtp, oldval); local_irq_restore(flags); } @@ -442,27 +550,32 @@ void rcu_nmi_exit(void) WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); } +#ifdef CONFIG_PROVE_RCU + /** - * rcu_irq_enter - inform RCU of entry to hard irq context + * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle * - * If the CPU was idle with dynamic ticks active, this updates the - * rdtp->dynticks to let the RCU handling know that the CPU is active. + * If the current CPU is in its idle loop and is neither in an interrupt + * or NMI handler, return true. The caller must have at least disabled + * preemption. */ -void rcu_irq_enter(void) +int rcu_is_cpu_idle(void) { - rcu_exit_nohz(); + return (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; } +#endif /* #ifdef CONFIG_PROVE_RCU */ + /** - * rcu_irq_exit - inform RCU of exit from hard irq context + * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle * - * If the CPU was idle with dynamic ticks active, update the rdp->dynticks - * to put let the RCU handling be aware that the CPU is going back to idle - * with no ticks. + * If the current CPU is idle or running at a first-level (not nested) + * interrupt from idle, return true. The caller must have at least + * disabled preemption. */ -void rcu_irq_exit(void) +int rcu_is_cpu_rrupt_from_idle(void) { - rcu_enter_nohz(); + return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; } #ifdef CONFIG_SMP @@ -512,24 +625,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #endif /* #ifdef CONFIG_SMP */ -#else /* #ifdef CONFIG_NO_HZ */ - -#ifdef CONFIG_SMP - -static int dyntick_save_progress_counter(struct rcu_data *rdp) -{ - return 0; -} - -static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) -{ - return rcu_implicit_offline_qs(rdp); -} - -#endif /* #ifdef CONFIG_SMP */ - -#endif /* #else #ifdef CONFIG_NO_HZ */ - int rcu_cpu_stall_suppress __read_mostly; static void record_gp_stall_check_time(struct rcu_state *rsp) @@ -1334,16 +1429,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). * Also schedule RCU core processing. * - * This function must be called with hardirqs disabled. It is normally + * This function must be called from hardirq context. It is normally * invoked from the scheduling-clock interrupt. If rcu_pending returns * false, there is no point in invoking rcu_check_callbacks(). 
*/ void rcu_check_callbacks(int cpu, int user) { trace_rcu_utilization("Start scheduler-tick"); - if (user || - (idle_cpu(cpu) && rcu_scheduler_active && - !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + if (user || rcu_is_cpu_rrupt_from_idle()) { /* * Get here if this CPU took its interrupt from user @@ -1913,9 +2006,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; rdp->qlen = 0; -#ifdef CONFIG_NO_HZ rdp->dynticks = &per_cpu(rcu_dynticks, cpu); -#endif /* #ifdef CONFIG_NO_HZ */ + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != LLONG_MAX / 2); + WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1942,6 +2035,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != LLONG_MAX / 2); + WARN_ON_ONCE((atomic_read(&rdp->dynticks->dynticks) & 0x1) != 1); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 517f2f89a29..0963fa1541a 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -84,9 +84,10 @@ * Dynticks per-CPU state. */ struct rcu_dynticks { - int dynticks_nesting; /* Track irq/process nesting level. */ - int dynticks_nmi_nesting; /* Track NMI nesting level. */ - atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ + long long dynticks_nesting; /* Track irq/process nesting level. */ + /* Process level is worth LLONG_MAX/2. */ + int dynticks_nmi_nesting; /* Track NMI nesting level. */ + atomic_t dynticks; /* Even value for idle, else odd. */ }; /* RCU's kthread states for tracing. */ @@ -274,16 +275,12 @@ struct rcu_data { /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ -#ifdef CONFIG_NO_HZ /* 3) dynticks interface. */ struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ int dynticks_snap; /* Per-GP tracking for dynticks. */ -#endif /* #ifdef CONFIG_NO_HZ */ /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ -#ifdef CONFIG_NO_HZ unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ -#endif /* #ifdef CONFIG_NO_HZ */ unsigned long offline_fqs; /* Kicked due to being offline. */ unsigned long resched_ipi; /* Sent a resched IPI. */ @@ -307,11 +304,7 @@ struct rcu_data { #define RCU_GP_INIT 1 /* Grace period being initialized. */ #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ #define RCU_FORCE_QS 3 /* Need to force quiescent state. 
*/ -#ifdef CONFIG_NO_HZ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK -#else /* #ifdef CONFIG_NO_HZ */ -#define RCU_SIGNAL_INIT RCU_FORCE_QS -#endif /* #else #ifdef CONFIG_NO_HZ */ #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 59c7bee4ce0..654cfe67f0d 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -67,13 +67,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->completed, rdp->gpnum, rdp->passed_quiesce, rdp->passed_quiesce_gpnum, rdp->qs_pending); -#ifdef CONFIG_NO_HZ - seq_printf(m, " dt=%d/%d/%d df=%lu", + seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); -#endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); seq_printf(m, " ql=%ld qs=%c%c%c%c", rdp->qlen, @@ -141,13 +139,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->completed, rdp->gpnum, rdp->passed_quiesce, rdp->passed_quiesce_gpnum, rdp->qs_pending); -#ifdef CONFIG_NO_HZ - seq_printf(m, ",%d,%d,%d,%lu", + seq_printf(m, ",%d,%llx,%d,%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); -#endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != @@ -171,9 +167,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) static int show_rcudata_csv(struct seq_file *m, void *unused) { seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); -#ifdef CONFIG_NO_HZ seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); -#endif /* #ifdef CONFIG_NO_HZ */ seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); #ifdef CONFIG_RCU_BOOST seq_puts(m, "\"kt\",\"ktl\""); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 40420644d0b..5d9d23665f1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -434,7 +434,6 @@ void tick_nohz_stop_sched_tick(int inidle) ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; - rcu_enter_nohz(); } ts->idle_sleeps++; @@ -473,6 +472,8 @@ out: ts->last_jiffies = last_jiffies; ts->sleep_length = ktime_sub(dev->next_event, now); end: + if (inidle) + rcu_idle_enter(); local_irq_restore(flags); } @@ -529,6 +530,7 @@ void tick_nohz_restart_sched_tick(void) ktime_t now; local_irq_disable(); + rcu_idle_exit(); if (ts->idle_active || (ts->inidle && ts->tick_stopped)) now = ktime_get(); @@ -543,8 +545,6 @@ void tick_nohz_restart_sched_tick(void) ts->inidle = 0; - rcu_exit_nohz(); - /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); -- cgit v1.2.3-18-g5258 From a8eecf2248a45bf69f0625b23c003ad2ccd765ee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 2 Oct 2011 11:01:15 -0700 Subject: trace: Allow ftrace_dump() to be called from modules Add an EXPORT_SYMBOL_GPL() so that rcutorture can dump the trace buffer upon detection of an RCU error. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f2bd275bb60..a043d224adf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4775,6 +4775,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { __ftrace_dump(true, oops_dump_mode); } +EXPORT_SYMBOL_GPL(ftrace_dump); __init static int tracer_alloc_buffers(void) { -- cgit v1.2.3-18-g5258 From 91afaf300269aa99a4d646969b3258b74294ac4d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 2 Oct 2011 07:44:32 -0700 Subject: rcu: Add failure tracing to rcutorture Trace the rcutorture RCU accesses and dump the trace buffer when the first failure is detected. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcupdate.c | 10 ++++++++++ kernel/rcutorture.c | 18 ++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c5b98e565ae..92e771d7b44 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -316,3 +316,13 @@ struct debug_obj_descr rcuhead_debug_descr = { }; EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) +void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) +{ + trace_rcu_torture_read(rcutorturename, rhp); +} +EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); +#else +#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) +#endif diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 764825c2685..df35228e743 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -913,6 +913,18 @@ rcu_torture_fakewriter(void *arg) return 0; } +void rcutorture_trace_dump(void) +{ + static atomic_t beenhere = ATOMIC_INIT(0); + + if (atomic_read(&beenhere)) + return; + if (atomic_xchg(&beenhere, 1) != 0) + return; + do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL); + ftrace_dump(DUMP_ALL); +} + /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -934,6 +946,7 @@ static void rcu_torture_timer(unsigned long unused) rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(&srcu_ctl)); + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p == NULL) { /* Leave because rcu_torture_writer is not yet underway */ cur_ops->readunlock(idx); @@ -951,6 +964,8 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } + if (pipe_count > 1) + rcutorture_trace_dump(); __this_cpu_inc(rcu_torture_count[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { @@ -994,6 +1009,7 @@ rcu_torture_reader(void *arg) rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(&srcu_ctl)); + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); @@ -1009,6 +1025,8 @@ rcu_torture_reader(void *arg) /* Should not happen, but... 
*/ pipe_count = RCU_TORTURE_PIPE_LEN; } + if (pipe_count > 1) + rcutorture_trace_dump(); __this_cpu_inc(rcu_torture_count[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { -- cgit v1.2.3-18-g5258 From 34240697d619c439c55f21989680024dcb604aab Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Oct 2011 11:38:52 -0700 Subject: rcu: Disable preemption in rcu_is_cpu_idle() Because rcu_is_cpu_idle() is to be used to check for extended quiescent states in RCU-preempt read-side critical sections, it cannot assume that preemption is disabled. And preemption must be disabled when accessing the dyntick-idle state, because otherwise the following sequence of events could occur: 1. Task A on CPU 1 enters rcu_is_cpu_idle() and picks up the pointer to CPU 1's per-CPU variables. 2. Task B preempts Task A and starts running on CPU 1. 3. Task A migrates to CPU 2. 4. Task B blocks, leaving CPU 1 idle. 5. Task A continues execution on CPU 2, accessing CPU 1's dyntick-idle information using the pointer fetched in step 1 above, and finds that CPU 1 is idle. 6. Task A therefore incorrectly concludes that it is executing in an extended quiescent state, possibly issuing a spurious splat. Therefore, this commit disables preemption within the rcu_is_cpu_idle() function. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1c40326724f..69b6cdd4f94 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -556,12 +556,16 @@ void rcu_nmi_exit(void) * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle * * If the current CPU is in its idle loop and is neither in an interrupt - * or NMI handler, return true. The caller must have at least disabled - * preemption. + * or NMI handler, return true. */ int rcu_is_cpu_idle(void) { - return (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; + int ret; + + preempt_disable(); + ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; + preempt_enable(); + return ret; } #endif /* #ifdef CONFIG_PROVE_RCU */ -- cgit v1.2.3-18-g5258 From b40d293eb36ba40cd428b6d178db911174689702 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 22 Oct 2011 07:12:34 -0700 Subject: rcu: Omit self-awaken when setting up expedited grace period When setting up an expedited grace period, if there were no readers, the task will awaken itself. This commit removes this useless self-awakening. Signed-off-by: Thomas Gleixner Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 2 +- kernel/rcutree.h | 3 ++- kernel/rcutree_plugin.h | 16 +++++++++++----- 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 69b6cdd4f94..8afb2e89745 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1320,7 +1320,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) else raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) - rcu_report_exp_rnp(rsp, rnp); + rcu_report_exp_rnp(rsp, rnp, true); rcu_node_kthread_setaffinity(rnp, -1); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 0963fa1541a..fd2f87db2ab 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -444,7 +444,8 @@ static void rcu_preempt_check_callbacks(int cpu); static void rcu_preempt_process_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake); #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ static int rcu_preempt_pending(int cpu); static int rcu_preempt_needs_cpu(int cpu); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 708dc579634..0f095d1cc16 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -410,7 +410,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) * then we need to report up the rcu_node hierarchy. */ if (!empty_exp && empty_exp_now) - rcu_report_exp_rnp(&rcu_preempt_state, rnp); + rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); } else { local_irq_restore(flags); } @@ -732,9 +732,13 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * + * Most callers will set the "wake" flag, but the task initiating the + * expedited grace period need not wake itself. + * * Caller must hold sync_rcu_preempt_exp_mutex. */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake) { unsigned long flags; unsigned long mask; @@ -747,7 +751,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) } if (rnp->parent == NULL) { raw_spin_unlock_irqrestore(&rnp->lock, flags); - wake_up(&sync_rcu_preempt_exp_wq); + if (wake) + wake_up(&sync_rcu_preempt_exp_wq); break; } mask = rnp->grpmask; @@ -780,7 +785,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) must_wait = 1; } if (!must_wait) - rcu_report_exp_rnp(rsp, rnp); + rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ } /* @@ -1072,7 +1077,8 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); * report on tasks preempted in RCU read-side critical sections during * expedited RCU grace periods. */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake) { return; } -- cgit v1.2.3-18-g5258 From a0f8eefb127f5be07628954f310a7fc8c82b2fc3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Nov 2011 12:08:17 -0700 Subject: rcu: Remove redundant return from rcu_report_exp_rnp() Empty void functions do not need "return", so this commit removes it from rcu_report_exp_rnp(). 
Signed-off-by: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0f095d1cc16..7a7961feeec 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1080,7 +1080,6 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake) { - return; } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3-18-g5258 From e6b80a3b0994ea6c3d876d72464f2debbfcfeb05 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 16:25:18 -0700 Subject: rcu: Detect illegal rcu dereference in extended quiescent state Report that none of the rcu read lock maps are held while in an RCU extended quiescent state (the section between rcu_idle_enter() and rcu_idle_exit()). This helps detect any use of rcu_dereference() and friends from within the section in idle where RCU is not allowed. This way we can guarantee an extended quiescent window where the CPU can be put in dyntick idle mode or can simply avoid being part of any global grace period completion while in the idle loop. Uses of RCU from such mode are totally ignored by RCU, hence the importance of these checks. Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Lai Jiangshan Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcupdate.c | 2 ++ kernel/rcutiny.c | 1 + kernel/rcutree.c | 1 + 3 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 92e771d7b44..2bc4e135ff2 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -93,6 +93,8 @@ int rcu_read_lock_bh_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; + if (rcu_is_cpu_idle()) + return 0; return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 3ab77bdc90c..b4e0b498176 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -157,6 +157,7 @@ int rcu_is_cpu_idle(void) { return !rcu_dynticks_nesting; } +EXPORT_SYMBOL(rcu_is_cpu_idle); #endif /* #ifdef CONFIG_PROVE_RCU */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 8afb2e89745..489b62a67d3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -567,6 +567,7 @@ int rcu_is_cpu_idle(void) preempt_enable(); return ret; } +EXPORT_SYMBOL(rcu_is_cpu_idle); #endif /* #ifdef CONFIG_PROVE_RCU */ -- cgit v1.2.3-18-g5258 From 0464e937485f15d2add78e3b0f498469f4e6600d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 18:22:01 +0200 Subject: rcu: Inform the user about extended quiescent state on PROVE_RCU warning Inform the user if an RCU usage error is detected by lockdep while in an extended quiescent state (in this case, the RCU-free window in idle). This is accomplished by adding a line to the RCU lockdep splat indicating whether or not the splat occurred in extended quiescent state. Uses of RCU from within extended quiescent state mode are totally ignored by RCU, hence the importance of this diagnostic. Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Lai Jiangshan Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/lockdep.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b2e08c932d9..f45c6817770 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -4170,6 +4170,28 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) printk("%s:%d %s!\n", file, line, s); printk("\nother info that might help us debug this:\n\n"); printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); + + /* + * If a CPU is in the RCU-free window in idle (ie: in the section + * between rcu_idle_enter() and rcu_idle_exit(), then RCU + * considers that CPU to be in an "extended quiescent state", + * which means that RCU will be completely ignoring that CPU. + * Therefore, rcu_read_lock() and friends have absolutely no + * effect on a CPU running in that state. In other words, even if + * such an RCU-idle CPU has called rcu_read_lock(), RCU might well + * delete data structures out from under it. RCU really has no + * choice here: we need to keep an RCU-free window in idle where + * the CPU may possibly enter into low power mode. This way we can + * notice an extended quiescent state to other CPUs that started a grace + * period. Otherwise we would delay any grace period as long as we run + * in the idle task. + * + * So complain bitterly if someone does call rcu_read_lock(), + * rcu_read_lock_bh() and so on from extended quiescent states. + */ + if (rcu_is_cpu_idle()) + printk("RCU used illegally from extended quiescent state!\n"); + lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); dump_stack(); -- cgit v1.2.3-18-g5258 From 280f06774afedf849f0b34248ed6aff57d0f6908 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 18:22:06 +0200 Subject: nohz: Separate out irq exit and idle loop dyntick logic The tick_nohz_stop_sched_tick() function, which tries to delay the next timer tick as long as possible, can be called from two places: - From the idle loop to start the dytick idle mode - From interrupt exit if we have interrupted the dyntick idle mode, so that we reprogram the next tick event in case the irq changed some internal state that requires this action. There are only few minor differences between both that are handled by that function, driven by the ts->inidle cpu variable and the inidle parameter. The whole guarantees that we only update the dyntick mode on irq exit if we actually interrupted the dyntick idle mode, and that we enter in RCU extended quiescent state from idle loop entry only. Split this function into: - tick_nohz_idle_enter(), which sets ts->inidle to 1, enters dynticks idle mode unconditionally if it can, and enters into RCU extended quiescent state. - tick_nohz_irq_exit() which only updates the dynticks idle mode when ts->inidle is set (ie: if tick_nohz_idle_enter() has been called). To maintain symmetry, tick_nohz_restart_sched_tick() has been renamed into tick_nohz_idle_exit(). This simplifies the code and micro-optimize the irq exit path (no need for local_irq_save there). This also prepares for the split between dynticks and rcu extended quiescent state logics. We'll need this split to further fix illegal uses of RCU in extended quiescent states in the idle loop. Signed-off-by: Frederic Weisbecker Cc: Mike Frysinger Cc: Guan Xuetao Cc: David Miller Cc: Chris Metcalf Cc: Hans-Christian Egtvedt Cc: Ralf Baechle Cc: Paul E. 
McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Russell King Cc: Paul Mackerras Cc: Heiko Carstens Cc: Paul Mundt Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/softirq.c | 2 +- kernel/time/tick-sched.c | 93 +++++++++++++++++++++++++++++------------------- 2 files changed, 58 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 2c71d91efff..f9f2aa81ce5 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -351,7 +351,7 @@ void irq_exit(void) #ifdef CONFIG_NO_HZ /* Make sure that timer wheel updates are propagated */ if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) - tick_nohz_stop_sched_tick(0); + tick_nohz_irq_exit(); #endif preempt_enable_no_resched(); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 5d9d23665f1..266c242dc35 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -275,42 +275,17 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -/** - * tick_nohz_stop_sched_tick - stop the idle tick from the idle task - * - * When the next event is more than a tick into the future, stop the idle tick - * Called either from the idle loop or from irq_exit() when an idle period was - * just interrupted by an interrupt which did not cause a reschedule. - */ -void tick_nohz_stop_sched_tick(int inidle) +static void tick_nohz_stop_sched_tick(struct tick_sched *ts) { - unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; - struct tick_sched *ts; + unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; int cpu; - local_irq_save(flags); - cpu = smp_processor_id(); ts = &per_cpu(tick_cpu_sched, cpu); - /* - * Call to tick_nohz_start_idle stops the last_update_time from being - * updated. Thus, it must not be called in the event we are called from - * irq_exit() with the prior state different than idle. - */ - if (!inidle && !ts->inidle) - goto end; - - /* - * Set ts->inidle unconditionally. Even if the system did not - * switch to NOHZ mode the cpu frequency governers rely on the - * update of the idle time accounting in tick_nohz_start_idle(). - */ - ts->inidle = 1; - now = tick_nohz_start_idle(cpu, ts); /* @@ -326,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle) } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) - goto end; + return; if (need_resched()) - goto end; + return; if (unlikely(local_softirq_pending() && cpu_online(cpu))) { static int ratelimit; @@ -339,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle) (unsigned int) local_softirq_pending()); ratelimit++; } - goto end; + return; } ts->idle_calls++; @@ -471,10 +446,54 @@ out: ts->next_jiffies = next_jiffies; ts->last_jiffies = last_jiffies; ts->sleep_length = ktime_sub(dev->next_event, now); -end: - if (inidle) - rcu_idle_enter(); - local_irq_restore(flags); +} + +/** + * tick_nohz_idle_enter - stop the idle tick from the idle task + * + * When the next event is more than a tick into the future, stop the idle tick + * Called when we start the idle loop. + * This also enters into RCU extended quiescent state so that this CPU doesn't + * need anymore to be part of any global grace period completion. This way + * the tick can be stopped safely as we don't need to report quiescent states. 
+ */ +void tick_nohz_idle_enter(void) +{ + struct tick_sched *ts; + + WARN_ON_ONCE(irqs_disabled()); + + local_irq_disable(); + + ts = &__get_cpu_var(tick_cpu_sched); + /* + * set ts->inidle unconditionally. even if the system did not + * switch to nohz mode the cpu frequency governers rely on the + * update of the idle time accounting in tick_nohz_start_idle(). + */ + ts->inidle = 1; + tick_nohz_stop_sched_tick(ts); + rcu_idle_enter(); + + local_irq_enable(); +} + +/** + * tick_nohz_irq_exit - update next tick event from interrupt exit + * + * When an interrupt fires while we are idle and it doesn't cause + * a reschedule, it may still add, modify or delete a timer, enqueue + * an RCU callback, etc... + * So we need to re-calculate and reprogram the next tick event. + */ +void tick_nohz_irq_exit(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + if (!ts->inidle) + return; + + tick_nohz_stop_sched_tick(ts); } /** @@ -516,11 +535,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) } /** - * tick_nohz_restart_sched_tick - restart the idle tick from the idle task + * tick_nohz_idle_exit - restart the idle tick from the idle task * * Restart the idle tick when the CPU is woken up from idle + * This also exit the RCU extended quiescent state. The CPU + * can use RCU again after this function is called. */ -void tick_nohz_restart_sched_tick(void) +void tick_nohz_idle_exit(void) { int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); -- cgit v1.2.3-18-g5258 From 2bbb6817c0ac1b5f2a68d720f364f98eeb1ac4fd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Oct 2011 16:01:00 +0200 Subject: nohz: Allow rcu extended quiescent state handling seperately from tick stop It is assumed that rcu won't be used once we switch to tickless mode and until we restart the tick. However this is not always true, as in x86-64 where we dereference the idle notifiers after the tick is stopped. To prepare for fixing this, add two new APIs: tick_nohz_idle_enter_norcu() and tick_nohz_idle_exit_norcu(). If no use of RCU is made in the idle loop between tick_nohz_enter_idle() and tick_nohz_exit_idle() calls, the arch must instead call the new *_norcu() version such that the arch doesn't need to call rcu_idle_enter() and rcu_idle_exit(). Otherwise the arch must call tick_nohz_enter_idle() and tick_nohz_exit_idle() and also call explicitly: - rcu_idle_enter() after its last use of RCU before the CPU is put to sleep. - rcu_idle_exit() before the first use of RCU after the CPU is woken up. Signed-off-by: Frederic Weisbecker Cc: Mike Frysinger Cc: Guan Xuetao Cc: David Miller Cc: Chris Metcalf Cc: Hans-Christian Egtvedt Cc: Ralf Baechle Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Russell King Cc: Paul Mackerras Cc: Heiko Carstens Cc: Paul Mundt Signed-off-by: Paul E. McKenney --- kernel/time/tick-sched.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 266c242dc35..c76aefe764b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -453,18 +453,22 @@ out: * * When the next event is more than a tick into the future, stop the idle tick * Called when we start the idle loop. - * This also enters into RCU extended quiescent state so that this CPU doesn't - * need anymore to be part of any global grace period completion. 
This way - * the tick can be stopped safely as we don't need to report quiescent states. + * + * If no use of RCU is made in the idle loop between + * tick_nohz_idle_enter() and tick_nohz_idle_exit() calls, then + * tick_nohz_idle_enter_norcu() should be called instead and the arch + * doesn't need to call rcu_idle_enter() and rcu_idle_exit() explicitly. + * + * Otherwise the arch is responsible of calling: + * + * - rcu_idle_enter() after its last use of RCU before the CPU is put + * to sleep. + * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. */ -void tick_nohz_idle_enter(void) +void __tick_nohz_idle_enter(void) { struct tick_sched *ts; - WARN_ON_ONCE(irqs_disabled()); - - local_irq_disable(); - ts = &__get_cpu_var(tick_cpu_sched); /* * set ts->inidle unconditionally. even if the system did not @@ -473,9 +477,6 @@ void tick_nohz_idle_enter(void) */ ts->inidle = 1; tick_nohz_stop_sched_tick(ts); - rcu_idle_enter(); - - local_irq_enable(); } /** @@ -551,7 +552,7 @@ void tick_nohz_idle_exit(void) ktime_t now; local_irq_disable(); - rcu_idle_exit(); + if (ts->idle_active || (ts->inidle && ts->tick_stopped)) now = ktime_get(); -- cgit v1.2.3-18-g5258 From 416eb33cd60ef405e2860a186364e57bcb2d89f6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 16:31:02 -0700 Subject: rcu: Fix early call to rcu_idle_enter() On the irq exit path, tick_nohz_irq_exit() may raise a softirq, which action leads to the wake up path and select_task_rq_fair() that makes use of rcu to iterate the domains. This is an illegal use of RCU because we may be in RCU extended quiescent state if we interrupted an RCU-idle window in the idle loop: [ 132.978883] =============================== [ 132.978883] [ INFO: suspicious RCU usage. ] [ 132.978883] ------------------------------- [ 132.978883] kernel/sched_fair.c:1707 suspicious rcu_dereference_check() usage! [ 132.978883] [ 132.978883] other info that might help us debug this: [ 132.978883] [ 132.978883] [ 132.978883] rcu_scheduler_active = 1, debug_locks = 0 [ 132.978883] RCU used illegally from extended quiescent state! [ 132.978883] 2 locks held by swapper/0: [ 132.978883] #0: (&p->pi_lock){-.-.-.}, at: [] try_to_wake_up+0x39/0x2f0 [ 132.978883] #1: (rcu_read_lock){.+.+..}, at: [] select_task_rq_fair+0x6a/0xec0 [ 132.978883] [ 132.978883] stack backtrace: [ 132.978883] Pid: 0, comm: swapper Tainted: G W 3.0.0+ #178 [ 132.978883] Call Trace: [ 132.978883] [] lockdep_rcu_suspicious+0xe6/0x100 [ 132.978883] [] select_task_rq_fair+0x749/0xec0 [ 132.978883] [] ? select_task_rq_fair+0x6a/0xec0 [ 132.978883] [] ? do_raw_spin_lock+0x54/0x150 [ 132.978883] [] ? trace_hardirqs_on+0xd/0x10 [ 132.978883] [] try_to_wake_up+0xd3/0x2f0 [ 132.978883] [] ? ktime_get+0x68/0xf0 [ 132.978883] [] wake_up_process+0x15/0x20 [ 132.978883] [] raise_softirq_irqoff+0x65/0x110 [ 132.978883] [] __hrtimer_start_range_ns+0x415/0x5a0 [ 132.978883] [] ? do_raw_spin_unlock+0x5e/0xb0 [ 132.978883] [] hrtimer_start+0x18/0x20 [ 132.978883] [] tick_nohz_stop_sched_tick+0x393/0x450 [ 132.978883] [] irq_exit+0xd2/0x100 [ 132.978883] [] do_IRQ+0x66/0xe0 [ 132.978883] [] common_interrupt+0x13/0x13 [ 132.978883] [] ? native_safe_halt+0xb/0x10 [ 132.978883] [] ? trace_hardirqs_on+0xd/0x10 [ 132.978883] [] default_idle+0xba/0x370 [ 132.978883] [] amd_e400_idle+0x5e/0x130 [ 132.978883] [] cpu_idle+0xb6/0x120 [ 132.978883] [] rest_init+0xef/0x150 [ 132.978883] [] ? 
rest_init+0x52/0x150 [ 132.978883] [] start_kernel+0x3da/0x3e5 [ 132.978883] [] x86_64_start_reservations+0x131/0x135 [ 132.978883] [] x86_64_start_kernel+0x103/0x112 Fix this by calling rcu_idle_enter() after tick_nohz_irq_exit(). Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index f9f2aa81ce5..4eb3a0fa351 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -347,12 +347,12 @@ void irq_exit(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); - rcu_irq_exit(); #ifdef CONFIG_NO_HZ /* Make sure that timer wheel updates are propagated */ if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) tick_nohz_irq_exit(); #endif + rcu_irq_exit(); preempt_enable_no_resched(); } -- cgit v1.2.3-18-g5258 From 4145fa7fbee3ec1e61c52825b146192885d9759f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 31 Oct 2011 15:01:54 -0700 Subject: rcu: Deconfuse dynticks entry-exit tracing The trace_rcu_dyntick() trace event did not print both the old and the new value of the nesting level, and furthermore printed only the low-order 32 bits of it. This could result in some confusion when interpreting trace-event dumps, so this commit prints both the old and the new value, prints the full 64 bits, and also selects the process-entry/exit increment to print nicely in hexadecimal. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney
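As a closing aside for readers decoding these trace events, here is a small user-space sketch (illustrative only; it is not taken from this last patch, whose diff is not shown above) of why printing only the low-order 32 bits of the ->dynticks_nesting counter was confusing: the process-level value of LLONG_MAX/2 established by the earlier patches truncates to an uninformative 0xffffffff, whereas the full 64-bit value makes the process/idle and interrupt contributions visible.

/*
 * Illustrative user-space sketch only -- not part of any patch above.
 * Prints the process-level nesting value used by the earlier patches,
 * first truncated to 32 bits and then in full.
 */
#include <limits.h>
#include <stdio.h>

int main(void)
{
	long long nesting = LLONG_MAX / 2;	/* process-level value used above */

	printf("low 32 bits        : %#x\n",
	       (unsigned int)nesting);			/* 0xffffffff */
	printf("full 64 bits       : %#llx\n",
	       (unsigned long long)nesting);		/* 0x3fffffffffffffff */
	printf("after irq_enter()  : %#llx\n",
	       (unsigned long long)(nesting + 1));	/* 0x4000000000000000 */
	return 0;
}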