From 2fdbb31b662787f78bb78b3e4e18f1a072058ffc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 23 Feb 2012 15:58:29 -0800 Subject: rcu: Add RCU_FAST_NO_HZ tracing for idle exit Traces of rcu_prep_idle events can be confusing because rcu_cleanup_after_idle() does no tracing. This commit therefore adds this tracing. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c023464816b..1e561ab952a 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2085,6 +2085,7 @@ static void rcu_prepare_for_idle_init(int cpu) static void rcu_cleanup_after_idle(int cpu) { hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); + trace_rcu_prep_idle("Cleanup after idle"); } /* -- cgit v1.2.3-70-g09d2 From 2ee3dc80660ac8285a37e662fd91b2e45c46f06a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 23 Feb 2012 17:13:19 -0800 Subject: rcu: Make RCU_FAST_NO_HZ use timer rather than hrtimer The RCU_FAST_NO_HZ facility uses an hrtimer to wake up a CPU when it is allowed to go into dyntick-idle mode, which is almost always cancelled soon after. This is not what hrtimers are good at, so this commit switches to the timer wheel. Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 40 ++++++++++++---------------------------- 1 file changed, 12 insertions(+), 28 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 1e561ab952a..0f007b363db 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1980,9 +1980,7 @@ static void rcu_prepare_for_idle(int cpu) static DEFINE_PER_CPU(int, rcu_dyntick_drain); static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); -static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); -static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ -static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ +static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no @@ -2051,10 +2049,9 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) * real work is done upon re-entry to idle, or by the next scheduling-clock * interrupt should idle not be re-entered. */ -static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) +static void rcu_idle_gp_timer_func(unsigned long unused) { trace_rcu_prep_idle("Timer"); - return HRTIMER_NORESTART; } /* @@ -2062,19 +2059,8 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) */ static void rcu_prepare_for_idle_init(int cpu) { - static int firsttime = 1; - struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); - - hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtp->function = rcu_idle_gp_timer_func; - if (firsttime) { - unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); - - rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); - upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY); - rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000); - firsttime = 0; - } + setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), + rcu_idle_gp_timer_func, 0); } /* @@ -2084,7 +2070,7 @@ static void rcu_prepare_for_idle_init(int cpu) */ static void rcu_cleanup_after_idle(int cpu) { - hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); + del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); trace_rcu_prep_idle("Cleanup after idle"); } @@ -2141,11 +2127,11 @@ static void rcu_prepare_for_idle(int cpu) per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; if (rcu_cpu_has_nonlazy_callbacks(cpu)) - hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_gp_wait, HRTIMER_MODE_REL); + mod_timer(&per_cpu(rcu_idle_gp_timer, cpu), + jiffies + RCU_IDLE_GP_DELAY); else - hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); + mod_timer(&per_cpu(rcu_idle_gp_timer, cpu), + jiffies + RCU_IDLE_LAZY_GP_DELAY); return; /* Nothing more to do immediately. */ } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ @@ -2193,14 +2179,12 @@ static void rcu_prepare_for_idle(int cpu) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { - struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); + struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); - sprintf(cp, "drain=%d %c timer=%lld", + sprintf(cp, "drain=%d %c timer=%lu", per_cpu(rcu_dyntick_drain, cpu), per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', - hrtimer_active(hrtp) - ? ktime_to_us(hrtimer_get_remaining(hrtp)) - : -1); + timer_pending(tltp) ? tltp->expires - jiffies : -1); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ -- cgit v1.2.3-70-g09d2 From c57afe80db4e169135eb675acc2d241e26cc064e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 28 Feb 2012 11:02:21 -0800 Subject: rcu: Make RCU_FAST_NO_HZ account for pauses out of idle Both Steven Rostedt's new idle-capable trace macros and the RCU_NONIDLE() macro can cause RCU to momentarily pause out of idle without the rest of the system being involved. This can cause rcu_prepare_for_idle() to run through its state machine too quickly, which can in turn result in needless scheduling-clock interrupts. This commit therefore adds code to enable rcu_prepare_for_idle() to distinguish between an initial entry to idle on the one hand (which needs to advance the rcu_prepare_for_idle() state machine) and an idle reentry due to idle-capable trace macros and RCU_NONIDLE() on the other hand (which should avoid advancing the rcu_prepare_for_idle() state machine). Additional state is maintained to allow the timer to be correctly reposted when returning after a momentary pause out of idle, and even more state is maintained to detect when new non-lazy callbacks have been enqueued (which may require re-evaluation of the approach to idleness). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 ++ kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 57 +++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 56 insertions(+), 4 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1050d6d3922..403306b86e7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1829,6 +1829,8 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp->qlen++; if (lazy) rdp->qlen_lazy++; + else + rcu_idle_count_callbacks_posted(); if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, diff --git a/kernel/rcutree.h b/kernel/rcutree.h index cdd1be0a407..36ca28ecedc 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -471,6 +471,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu); static void rcu_prepare_for_idle_init(int cpu); static void rcu_cleanup_after_idle(int cpu); static void rcu_prepare_for_idle(int cpu); +static void rcu_idle_count_callbacks_posted(void); static void print_cpu_stall_info_begin(void); static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0f007b363db..50c17975d4f 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1938,6 +1938,14 @@ static void rcu_prepare_for_idle(int cpu) { } +/* + * Don't bother keeping a running count of the number of RCU callbacks + * posted because CONFIG_RCU_FAST_NO_HZ=n. + */ +static void rcu_idle_count_callbacks_posted(void) +{ +} + #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ /* @@ -1981,6 +1989,10 @@ static void rcu_prepare_for_idle(int cpu) static DEFINE_PER_CPU(int, rcu_dyntick_drain); static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); +static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); +static DEFINE_PER_CPU(bool, rcu_idle_first_pass); +static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); +static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no @@ -1993,6 +2005,8 @@ static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); */ int rcu_needs_cpu(int cpu) { + /* Flag a new idle sojourn to the idle-entry state machine. */ + per_cpu(rcu_idle_first_pass, cpu) = 1; /* If no callbacks, RCU doesn't need the CPU. */ if (!rcu_cpu_has_callbacks(cpu)) return 0; @@ -2095,6 +2109,26 @@ static void rcu_cleanup_after_idle(int cpu) */ static void rcu_prepare_for_idle(int cpu) { + /* + * If this is an idle re-entry, for example, due to use of + * RCU_NONIDLE() or the new idle-loop tracing API within the idle + * loop, then don't take any state-machine actions, unless the + * momentary exit from idle queued additional non-lazy callbacks. + * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks + * pending. + */ + if (!per_cpu(rcu_idle_first_pass, cpu) && + (per_cpu(rcu_nonlazy_posted, cpu) == + per_cpu(rcu_nonlazy_posted_snap, cpu))) { + if (rcu_cpu_has_callbacks(cpu)) + mod_timer(&per_cpu(rcu_idle_gp_timer, cpu), + per_cpu(rcu_idle_gp_timer_expires, cpu)); + return; + } + per_cpu(rcu_idle_first_pass, cpu) = 0; + per_cpu(rcu_nonlazy_posted_snap, cpu) = + per_cpu(rcu_nonlazy_posted, cpu) - 1; + /* * If there are no callbacks on this CPU, enter dyntick-idle mode. * Also reset state to avoid prejudicing later attempts. @@ -2127,11 +2161,15 @@ static void rcu_prepare_for_idle(int cpu) per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; if (rcu_cpu_has_nonlazy_callbacks(cpu)) - mod_timer(&per_cpu(rcu_idle_gp_timer, cpu), - jiffies + RCU_IDLE_GP_DELAY); + per_cpu(rcu_idle_gp_timer_expires, cpu) = + jiffies + RCU_IDLE_GP_DELAY; else - mod_timer(&per_cpu(rcu_idle_gp_timer, cpu), - jiffies + RCU_IDLE_LAZY_GP_DELAY); + per_cpu(rcu_idle_gp_timer_expires, cpu) = + jiffies + RCU_IDLE_LAZY_GP_DELAY; + mod_timer(&per_cpu(rcu_idle_gp_timer, cpu), + per_cpu(rcu_idle_gp_timer_expires, cpu)); + per_cpu(rcu_nonlazy_posted_snap, cpu) = + per_cpu(rcu_nonlazy_posted, cpu); return; /* Nothing more to do immediately. */ } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ @@ -2171,6 +2209,17 @@ static void rcu_prepare_for_idle(int cpu) trace_rcu_prep_idle("Callbacks drained"); } +/* + * Keep a running count of callbacks posted so that rcu_prepare_for_idle() + * can detect when something out of the idle loop posts a callback. + * Of course, it had better do so either from a trace event designed to + * be called from idle or from within RCU_NONIDLE(). + */ +static void rcu_idle_count_callbacks_posted(void) +{ + __this_cpu_add(rcu_nonlazy_posted, 1); +} + #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ #ifdef CONFIG_RCU_CPU_STALL_INFO -- cgit v1.2.3-70-g09d2 From 79b9a75fb703b6a2670e46b9dc495af5bc7029b3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Apr 2012 10:06:39 -0700 Subject: rcu: Add warning for RCU_FAST_NO_HZ timer firing RCU_FAST_NO_HZ uses a timer to limit the time that a CPU with callbacks can remain in dyntick-idle mode. This timer is cancelled when the CPU exits idle, and therefore should never fire. However, if the timer were migrated to some other CPU for whatever reason (1) the timer could actually fire and (2) firing on some other CPU would fail to wake up the CPU with callbacks, possibly resulting in sluggishness or a system hang. This commit therfore adds a WARN_ON_ONCE() to the timer handler in order to detect this condition. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 50c17975d4f..ad61da79b31 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2065,6 +2065,7 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) */ static void rcu_idle_gp_timer_func(unsigned long unused) { + WARN_ON_ONCE(1); /* Getting here can hang the system... */ trace_rcu_prep_idle("Timer"); } -- cgit v1.2.3-70-g09d2 From f511fc624642f0bb8cf65aaa28979737514d4746 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Mar 2012 12:16:26 -0700 Subject: rcu: Ensure that RCU_FAST_NO_HZ timers expire on correct CPU Timers are subject to migration, which can lead to the following system-hang scenario when CONFIG_RCU_FAST_NO_HZ=y: 1. CPU 0 executes synchronize_rcu(), which posts an RCU callback. 2. CPU 0 then goes idle. It cannot immediately invoke the callback, but there is nothing RCU needs from ti, so it enters dyntick-idle mode after posting a timer. 3. The timer gets migrated to CPU 1. 4. CPU 0 never wakes up, so the synchronize_rcu() never returns, so the system hangs. This commit fixes this problem by using mod_timer_pinned(), as suggested by Peter Zijlstra, to ensure that the timer is actually posted on the running CPU. Reported-by: Dipankar Sarma Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index ad61da79b31..d01e26df55a 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2110,6 +2110,8 @@ static void rcu_cleanup_after_idle(int cpu) */ static void rcu_prepare_for_idle(int cpu) { + struct timer_list *tp; + /* * If this is an idle re-entry, for example, due to use of * RCU_NONIDLE() or the new idle-loop tracing API within the idle @@ -2121,9 +2123,10 @@ static void rcu_prepare_for_idle(int cpu) if (!per_cpu(rcu_idle_first_pass, cpu) && (per_cpu(rcu_nonlazy_posted, cpu) == per_cpu(rcu_nonlazy_posted_snap, cpu))) { - if (rcu_cpu_has_callbacks(cpu)) - mod_timer(&per_cpu(rcu_idle_gp_timer, cpu), - per_cpu(rcu_idle_gp_timer_expires, cpu)); + if (rcu_cpu_has_callbacks(cpu)) { + tp = &per_cpu(rcu_idle_gp_timer, cpu); + mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); + } return; } per_cpu(rcu_idle_first_pass, cpu) = 0; @@ -2167,8 +2170,8 @@ static void rcu_prepare_for_idle(int cpu) else per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies + RCU_IDLE_LAZY_GP_DELAY; - mod_timer(&per_cpu(rcu_idle_gp_timer, cpu), - per_cpu(rcu_idle_gp_timer_expires, cpu)); + tp = &per_cpu(rcu_idle_gp_timer, cpu); + mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); per_cpu(rcu_nonlazy_posted_snap, cpu) = per_cpu(rcu_nonlazy_posted, cpu); return; /* Nothing more to do immediately. */ -- cgit v1.2.3-70-g09d2 From 616c310e83b872024271c915c1b9ab505b9efad9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 27 Mar 2012 16:02:08 -0700 Subject: rcu: Move PREEMPT_RCU preemption to switch_to() invocation Currently, PREEMPT_RCU readers are enqueued upon entry to the scheduler. This is inefficient because enqueuing is required only if there is a context switch, and entry to the scheduler does not guarantee a context switch. The commit therefore moves the enqueuing to immediately precede the call to switch_to() from the scheduler. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Linus Torvalds --- arch/um/drivers/mconsole_kern.c | 1 + include/linux/rcupdate.h | 1 + include/linux/rcutiny.h | 6 ------ include/linux/sched.h | 10 ++++++++++ kernel/rcutree.c | 1 - kernel/rcutree.h | 1 - kernel/rcutree_plugin.h | 14 +++----------- kernel/sched/core.c | 1 + 8 files changed, 16 insertions(+), 19 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 43b39d61b53..88e466b159d 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -705,6 +705,7 @@ static void stack_proc(void *arg) struct task_struct *from = current, *to = arg; to->thread.saved_task = from; + rcu_switch_from(from); switch_to(from, to, from); } diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 20fb776a1d4..bbfe7854a6a 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -184,6 +184,7 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); +extern void rcu_preempt_note_context_switch(void); extern void rcu_check_callbacks(int cpu, int user); struct notifier_block; extern void rcu_idle_enter(void); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index e93df77176d..080b5bdda28 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -87,10 +87,6 @@ static inline void kfree_call_rcu(struct rcu_head *head, #ifdef CONFIG_TINY_RCU -static inline void rcu_preempt_note_context_switch(void) -{ -} - static inline void exit_rcu(void) { } @@ -102,7 +98,6 @@ static inline int rcu_needs_cpu(int cpu) #else /* #ifdef CONFIG_TINY_RCU */ -void rcu_preempt_note_context_switch(void); extern void exit_rcu(void); int rcu_preempt_needs_cpu(void); @@ -116,7 +111,6 @@ static inline int rcu_needs_cpu(int cpu) static inline void rcu_note_context_switch(int cpu) { rcu_sched_qs(cpu); - rcu_preempt_note_context_switch(); } /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 81a173c0897..8f3fd945070 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1905,12 +1905,22 @@ static inline void rcu_copy_process(struct task_struct *p) INIT_LIST_HEAD(&p->rcu_node_entry); } +static inline void rcu_switch_from(struct task_struct *prev) +{ + if (prev->rcu_read_lock_nesting != 0) + rcu_preempt_note_context_switch(); +} + #else static inline void rcu_copy_process(struct task_struct *p) { } +static inline void rcu_switch_from(struct task_struct *prev) +{ +} + #endif #ifdef CONFIG_SMP diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1050d6d3922..61351505ec7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -192,7 +192,6 @@ void rcu_note_context_switch(int cpu) { trace_rcu_utilization("Start context switch"); rcu_sched_qs(cpu); - rcu_preempt_note_context_switch(cpu); trace_rcu_utilization("End context switch"); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index cdd1be0a407..d6b70b08a01 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -423,7 +423,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); long rcu_batches_completed(void); -static void rcu_preempt_note_context_switch(int cpu); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c023464816b..b1ac22e6fa3 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu) * * Caller must disable preemption. */ -static void rcu_preempt_note_context_switch(int cpu) +void rcu_preempt_note_context_switch(void) { struct task_struct *t = current; unsigned long flags; @@ -164,7 +164,7 @@ static void rcu_preempt_note_context_switch(int cpu) (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); + rdp = __this_cpu_ptr(rcu_preempt_state.rda); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -228,7 +228,7 @@ static void rcu_preempt_note_context_switch(int cpu) * means that we continue to block the current grace period. */ local_irq_save(flags); - rcu_preempt_qs(cpu); + rcu_preempt_qs(smp_processor_id()); local_irq_restore(flags); } @@ -1017,14 +1017,6 @@ void rcu_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); -/* - * Because preemptible RCU does not exist, we never have to check for - * CPUs being in quiescent states. - */ -static void rcu_preempt_note_context_switch(int cpu) -{ -} - /* * Because preemptible RCU does not exist, there are never any preempted * RCU readers. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4603b9d8f30..5d89eb93f7e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2083,6 +2083,7 @@ context_switch(struct rq *rq, struct task_struct *prev, #endif /* Here we just switch the register state and the stack. */ + rcu_switch_from(prev); switch_to(prev, next, prev); barrier(); -- cgit v1.2.3-70-g09d2 From 9dd8fb16c36178df2066387d2abd44d8b4dca8c8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Apr 2012 12:54:22 -0700 Subject: rcu: Make exit_rcu() more precise and consolidate When running preemptible RCU, if a task exits in an RCU read-side critical section having blocked within that same RCU read-side critical section, the task must be removed from the list of tasks blocking a grace period (perhaps the current grace period, perhaps the next grace period, depending on timing). The exit() path invokes exit_rcu() to do this cleanup. However, the current implementation of exit_rcu() needlessly does the cleanup even if the task did not block within the current RCU read-side critical section, which wastes time and needlessly increases the size of the state space. Fix this by only doing the cleanup if the current task is actually on the list of tasks blocking some grace period. While we are at it, consolidate the two identical exit_rcu() functions into a single function. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Linus Torvalds Conflicts: kernel/rcupdate.c --- include/linux/rcupdate.h | 1 + include/linux/rcutiny.h | 5 ----- include/linux/rcutree.h | 12 ------------ kernel/rcupdate.c | 28 ++++++++++++++++++++++++++++ kernel/rcutiny_plugin.h | 16 ---------------- kernel/rcutree_plugin.h | 16 ---------------- 6 files changed, 29 insertions(+), 49 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index bbfe7854a6a..29665a3b3ac 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -191,6 +191,7 @@ extern void rcu_idle_enter(void); extern void rcu_idle_exit(void); extern void rcu_irq_enter(void); extern void rcu_irq_exit(void); +extern void exit_rcu(void); /** * RCU_NONIDLE - Indicate idle-loop code that needs RCU readers diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 080b5bdda28..adb5e5a38ca 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -87,10 +87,6 @@ static inline void kfree_call_rcu(struct rcu_head *head, #ifdef CONFIG_TINY_RCU -static inline void exit_rcu(void) -{ -} - static inline int rcu_needs_cpu(int cpu) { return 0; @@ -98,7 +94,6 @@ static inline int rcu_needs_cpu(int cpu) #else /* #ifdef CONFIG_TINY_RCU */ -extern void exit_rcu(void); int rcu_preempt_needs_cpu(void); static inline int rcu_needs_cpu(int cpu) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index e8ee5dd0854..782a8ab51bc 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -45,18 +45,6 @@ static inline void rcu_virt_note_context_switch(int cpu) rcu_note_context_switch(cpu); } -#ifdef CONFIG_TREE_PREEMPT_RCU - -extern void exit_rcu(void); - -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - -static inline void exit_rcu(void) -{ -} - -#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ - extern void synchronize_rcu_bh(void); extern void synchronize_sched_expedited(void); extern void synchronize_rcu_expedited(void); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a86f1741cc2..95cba41ce1e 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -51,6 +51,34 @@ #include "rcu.h" +#ifdef CONFIG_PREEMPT_RCU + +/* + * Check for a task exiting while in a preemptible-RCU read-side + * critical section, clean up if so. No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. + */ +void exit_rcu(void) +{ + struct task_struct *t = current; + + if (likely(list_empty(¤t->rcu_node_entry))) + return; + t->rcu_read_lock_nesting = 1; + barrier(); + t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; + __rcu_read_unlock(); +} + +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +void exit_rcu(void) +{ +} + +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; struct lockdep_map rcu_lock_map = diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 22ecea0dfb6..fc31a2d6510 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void) return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; } -/* - * Check for a task exiting while in a preemptible -RCU read-side - * critical section, clean up if so. No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. - */ -void exit_rcu(void) -{ - struct task_struct *t = current; - - if (t->rcu_read_lock_nesting == 0) - return; - t->rcu_read_lock_nesting = 1; - __rcu_read_unlock(); -} - #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ #ifdef CONFIG_RCU_TRACE diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index b1ac22e6fa3..4936fff820e 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void) rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); } -/* - * Check for a task exiting while in a preemptible-RCU read-side - * critical section, clean up if so. No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. - */ -void exit_rcu(void) -{ - struct task_struct *t = current; - - if (t->rcu_read_lock_nesting == 0) - return; - t->rcu_read_lock_nesting = 1; - __rcu_read_unlock(); -} - #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ static struct rcu_state *rcu_state = &rcu_sched_state; -- cgit v1.2.3-70-g09d2 From 21e52e15666323078b8517a4312712579176b56f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 30 Apr 2012 14:16:19 -0700 Subject: rcu: Make RCU_FAST_NO_HZ handle timer migration The current RCU_FAST_NO_HZ assumes that timers do not migrate unless a CPU goes offline, in which case it assumes that the CPU will have to come out of dyntick-idle mode (cancelling the timer) in order to go offline. This is important because when RCU_FAST_NO_HZ permits a CPU to enter dyntick-idle mode despite having RCU callbacks pending, it posts a timer on that CPU to force a wakeup on that CPU. This wakeup ensures that the CPU will eventually handle the end of the grace period, including invoking its RCU callbacks. However, Pascal Chapperon's test setup shows that the timer handler rcu_idle_gp_timer_func() really does get invoked in some cases. This is problematic because this can cause the CPU that entered dyntick-idle mode despite still having RCU callbacks pending to remain in dyntick-idle mode indefinitely, which means that its RCU callbacks might never be invoked. This situation can result in grace-period delays or even system hangs, which matches Pascal's observations of slow boot-up and shutdown (https://lkml.org/lkml/2012/4/5/142). See also the bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=806548 This commit therefore causes the "should never be invoked" timer handler rcu_idle_gp_timer_func() to use smp_call_function_single() to wake up the CPU for which the timer was intended, allowing that CPU to invoke its RCU callbacks in a timely manner. Reported-by: Pascal Chapperon Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 1 + kernel/rcutree_plugin.h | 24 +++++++++++++++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index aaa55e1b8c4..1480900c511 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -292,6 +292,7 @@ TRACE_EVENT(rcu_dyntick, * "More callbacks": Still more callbacks, try again to clear them out. * "Callbacks drained": All callbacks processed, off to dyntick idle! * "Timer": Timer fired to cause CPU to continue processing callbacks. + * "Demigrate": Timer fired on wrong CPU, woke up correct CPU. * "Cleanup after idle": Idle exited, timer canceled. */ TRACE_EVENT(rcu_prep_idle, diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index d01e26df55a..bbb43cad755 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2056,17 +2056,35 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) rcu_preempt_cpu_has_nonlazy_callbacks(cpu); } +/* + * Handler for smp_call_function_single(). The only point of this + * handler is to wake the CPU up, so the handler does only tracing. + */ +void rcu_idle_demigrate(void *unused) +{ + trace_rcu_prep_idle("Demigrate"); +} + /* * Timer handler used to force CPU to start pushing its remaining RCU * callbacks in the case where it entered dyntick-idle mode with callbacks * pending. The hander doesn't really need to do anything because the * real work is done upon re-entry to idle, or by the next scheduling-clock * interrupt should idle not be re-entered. + * + * One special case: the timer gets migrated without awakening the CPU + * on which the timer was scheduled on. In this case, we must wake up + * that CPU. We do so with smp_call_function_single(). */ -static void rcu_idle_gp_timer_func(unsigned long unused) +static void rcu_idle_gp_timer_func(unsigned long cpu_in) { - WARN_ON_ONCE(1); /* Getting here can hang the system... */ + int cpu = (int)cpu_in; + trace_rcu_prep_idle("Timer"); + if (cpu != smp_processor_id()) + smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0); + else + WARN_ON_ONCE(1); /* Getting here can hang the system... */ } /* @@ -2075,7 +2093,7 @@ static void rcu_idle_gp_timer_func(unsigned long unused) static void rcu_prepare_for_idle_init(int cpu) { setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_gp_timer_func, 0); + rcu_idle_gp_timer_func, cpu); } /* -- cgit v1.2.3-70-g09d2 From 98248a0e24327bc64eb7518145c44bff7bebebc3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 May 2012 15:38:10 -0700 Subject: rcu: Explicitly initialize RCU_FAST_NO_HZ per-CPU variables The current initialization of the RCU_FAST_NO_HZ per-CPU variables makes needless and fragile assumptions about the initial value of things like the jiffies counter. This commit therefore explicitly initializes all of them that are better started with a non-zero value. It also adds some comments describing the per-CPU state variables. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel/rcutree_plugin.h') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index bbb43cad755..7082ea93566 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1986,12 +1986,19 @@ static void rcu_idle_count_callbacks_posted(void) #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ +/* Loop counter for rcu_prepare_for_idle(). */ static DEFINE_PER_CPU(int, rcu_dyntick_drain); +/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); +/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */ static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); +/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */ static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); +/* Enable special processing on first attempt to enter dyntick-idle mode. */ static DEFINE_PER_CPU(bool, rcu_idle_first_pass); +/* Running count of non-lazy callbacks posted, never decremented. */ static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); +/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */ static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); /* @@ -2092,8 +2099,11 @@ static void rcu_idle_gp_timer_func(unsigned long cpu_in) */ static void rcu_prepare_for_idle_init(int cpu) { + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), rcu_idle_gp_timer_func, cpu); + per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; + per_cpu(rcu_idle_first_pass, cpu) = 1; } /* @@ -2232,10 +2242,12 @@ static void rcu_prepare_for_idle(int cpu) } /* - * Keep a running count of callbacks posted so that rcu_prepare_for_idle() - * can detect when something out of the idle loop posts a callback. - * Of course, it had better do so either from a trace event designed to - * be called from idle or from within RCU_NONIDLE(). + * Keep a running count of the number of non-lazy callbacks posted + * on this CPU. This running counter (which is never decremented) allows + * rcu_prepare_for_idle() to detect when something out of the idle loop + * posts a callback, even if an equal number of callbacks are invoked. + * Of course, callbacks should only be posted from within a trace event + * designed to be called from idle or from within RCU_NONIDLE(). */ static void rcu_idle_count_callbacks_posted(void) { -- cgit v1.2.3-70-g09d2