aboutsummaryrefslogtreecommitdiff
path: root/kernel/rcutree.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/rcutree.c')
-rw-r--r--kernel/rcutree.c1010
1 files changed, 680 insertions, 330 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 7717b95c202..53ae9598f79 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -25,7 +25,7 @@
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
*
* For detailed explanation of Read-Copy Update mechanism see -
- * Documentation/RCU
+ * Documentation/RCU
*/
#include <linux/types.h>
#include <linux/kernel.h>
@@ -35,6 +35,7 @@
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
+#include <linux/nmi.h>
#include <asm/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
@@ -45,57 +46,78 @@
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/time.h>
+#include <linux/kernel_stat.h>
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static struct lock_class_key rcu_lock_key;
-struct lockdep_map rcu_lock_map =
- STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
-EXPORT_SYMBOL_GPL(rcu_lock_map);
-#endif
+#include "rcutree.h"
/* Data structures. */
+static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
+
#define RCU_STATE_INITIALIZER(name) { \
.level = { &name.node[0] }, \
.levelcnt = { \
NUM_RCU_LVL_0, /* root of hierarchy. */ \
NUM_RCU_LVL_1, \
NUM_RCU_LVL_2, \
- NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
+ NUM_RCU_LVL_3, \
+ NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
}, \
- .signaled = RCU_SIGNAL_INIT, \
+ .signaled = RCU_GP_IDLE, \
.gpnum = -300, \
.completed = -300, \
.onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
+ .orphan_cbs_list = NULL, \
+ .orphan_cbs_tail = &name.orphan_cbs_list, \
+ .orphan_qlen = 0, \
.fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
.n_force_qs = 0, \
.n_force_qs_ngp = 0, \
}
-struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
-DEFINE_PER_CPU(struct rcu_data, rcu_data);
+struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+static int rcu_scheduler_active __read_mostly;
+
+
+/*
+ * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
+ * permit this function to be invoked without holding the root rcu_node
+ * structure's ->lock, but of course results can be subject to change.
+ */
+static int rcu_gp_in_progress(struct rcu_state *rsp)
+{
+ return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
+}
+
/*
- * Increment the quiescent state counter.
- * The counter is a bit degenerated: We do not need to know
+ * Note a quiescent state. Because we do not need to know
* how many quiescent states passed, just if there was at least
- * one since the start of the grace period. Thus just a flag.
+ * one since the start of the grace period, this just sets a flag.
*/
-void rcu_qsctr_inc(int cpu)
+void rcu_sched_qs(int cpu)
{
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+ struct rcu_data *rdp;
+
+ rdp = &per_cpu(rcu_sched_data, cpu);
+ rdp->passed_quiesc_completed = rdp->gpnum - 1;
+ barrier();
rdp->passed_quiesc = 1;
- rdp->passed_quiesc_completed = rdp->completed;
+ rcu_preempt_note_context_switch(cpu);
}
-void rcu_bh_qsctr_inc(int cpu)
+void rcu_bh_qs(int cpu)
{
- struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
+ struct rcu_data *rdp;
+
+ rdp = &per_cpu(rcu_bh_data, cpu);
+ rdp->passed_quiesc_completed = rdp->gpnum - 1;
+ barrier();
rdp->passed_quiesc = 1;
- rdp->passed_quiesc_completed = rdp->completed;
}
#ifdef CONFIG_NO_HZ
@@ -109,16 +131,21 @@ static int blimit = 10; /* Maximum callbacks per softirq. */
static int qhimark = 10000; /* If this many pending, ignore blimit. */
static int qlowmark = 100; /* Once only this many pending, use blimit. */
+module_param(blimit, int, 0);
+module_param(qhimark, int, 0);
+module_param(qlowmark, int, 0);
+
static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
+static int rcu_pending(int cpu);
/*
- * Return the number of RCU batches processed thus far for debug & stats.
+ * Return the number of RCU-sched batches processed thus far for debug & stats.
*/
-long rcu_batches_completed(void)
+long rcu_batches_completed_sched(void)
{
- return rcu_state.completed;
+ return rcu_sched_state.completed;
}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
+EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
/*
* Return the number of RCU BH batches processed thus far for debug & stats.
@@ -144,9 +171,7 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
static int
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
- /* ACCESS_ONCE() because we are accessing outside of lock. */
- return *rdp->nxttail[RCU_DONE_TAIL] &&
- ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
+ return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp);
}
/*
@@ -181,6 +206,10 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
return 1;
}
+ /* If preemptable RCU, no point in sending reschedule IPI. */
+ if (rdp->preemptable)
+ return 0;
+
/* The CPU is online, so send it a reschedule IPI. */
if (rdp->cpu != smp_processor_id())
smp_send_reschedule(rdp->cpu);
@@ -193,7 +222,6 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
#endif /* #ifdef CONFIG_SMP */
#ifdef CONFIG_NO_HZ
-static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
/**
* rcu_enter_nohz - inform RCU that current CPU is entering nohz
@@ -213,7 +241,7 @@ void rcu_enter_nohz(void)
rdtp = &__get_cpu_var(rcu_dynticks);
rdtp->dynticks++;
rdtp->dynticks_nesting--;
- WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
+ WARN_ON_ONCE(rdtp->dynticks & 0x1);
local_irq_restore(flags);
}
@@ -232,7 +260,7 @@ void rcu_exit_nohz(void)
rdtp = &__get_cpu_var(rcu_dynticks);
rdtp->dynticks++;
rdtp->dynticks_nesting++;
- WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
+ WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
local_irq_restore(flags);
smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
}
@@ -251,7 +279,7 @@ void rcu_nmi_enter(void)
if (rdtp->dynticks & 0x1)
return;
rdtp->dynticks_nmi++;
- WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs);
+ WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1));
smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
}
@@ -270,7 +298,7 @@ void rcu_nmi_exit(void)
return;
smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
rdtp->dynticks_nmi++;
- WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs);
+ WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1);
}
/**
@@ -286,7 +314,7 @@ void rcu_irq_enter(void)
if (rdtp->dynticks_nesting++)
return;
rdtp->dynticks++;
- WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
+ WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
}
@@ -305,39 +333,20 @@ void rcu_irq_exit(void)
return;
smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
rdtp->dynticks++;
- WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
+ WARN_ON_ONCE(rdtp->dynticks & 0x1);
/* If the interrupt queued a callback, get out of dyntick mode. */
- if (__get_cpu_var(rcu_data).nxtlist ||
+ if (__get_cpu_var(rcu_sched_data).nxtlist ||
__get_cpu_var(rcu_bh_data).nxtlist)
set_need_resched();
}
-/*
- * Record the specified "completed" value, which is later used to validate
- * dynticks counter manipulations. Specify "rsp->completed - 1" to
- * unconditionally invalidate any future dynticks manipulations (which is
- * useful at the beginning of a grace period).
- */
-static void dyntick_record_completed(struct rcu_state *rsp, long comp)
-{
- rsp->dynticks_completed = comp;
-}
-
#ifdef CONFIG_SMP
/*
- * Recall the previously recorded value of the completion for dynticks.
- */
-static long dyntick_recall_completed(struct rcu_state *rsp)
-{
- return rsp->dynticks_completed;
-}
-
-/*
* Snapshot the specified CPU's dynticks counter so that we can later
* credit them with an implicit quiescent state. Return 1 if this CPU
- * is already in a quiescent state courtesy of dynticks idle mode.
+ * is in dynticks idle mode, which is an extended quiescent state.
*/
static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
@@ -397,24 +406,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
#else /* #ifdef CONFIG_NO_HZ */
-static void dyntick_record_completed(struct rcu_state *rsp, long comp)
-{
-}
-
#ifdef CONFIG_SMP
-/*
- * If there are no dynticks, then the only way that a CPU can passively
- * be in a quiescent state is to be offline. Unlike dynticks idle, which
- * is a point in time during the prior (already finished) grace period,
- * an offline CPU is always in a quiescent state, and thus can be
- * unconditionally applied. So just return the current value of completed.
- */
-static long dyntick_recall_completed(struct rcu_state *rsp)
-{
- return rsp->completed;
-}
-
static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
return 0;
@@ -443,32 +436,39 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
long delta;
unsigned long flags;
struct rcu_node *rnp = rcu_get_root(rsp);
- struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
- struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
/* Only let one CPU complain about others per time interval. */
spin_lock_irqsave(&rnp->lock, flags);
delta = jiffies - rsp->jiffies_stall;
- if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) {
+ if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+
+ /*
+ * Now rat on any tasks that got kicked up to the root rcu_node
+ * due to CPU offlining.
+ */
+ rcu_print_task_stall(rnp);
spin_unlock_irqrestore(&rnp->lock, flags);
/* OK, time to rat on our buddy... */
printk(KERN_ERR "INFO: RCU detected CPU stalls:");
- for (; rnp_cur < rnp_end; rnp_cur++) {
- if (rnp_cur->qsmask == 0)
+ rcu_for_each_leaf_node(rsp, rnp) {
+ rcu_print_task_stall(rnp);
+ if (rnp->qsmask == 0)
continue;
- for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
- if (rnp_cur->qsmask & (1UL << cpu))
- printk(" %d", rnp_cur->grplo + cpu);
+ for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
+ if (rnp->qsmask & (1UL << cpu))
+ printk(" %d", rnp->grplo + cpu);
}
printk(" (detected by %d, t=%ld jiffies)\n",
smp_processor_id(), (long)(jiffies - rsp->gp_start));
+ trigger_all_cpu_backtrace();
+
force_quiescent_state(rsp, 0); /* Kick them all. */
}
@@ -479,12 +479,14 @@ static void print_cpu_stall(struct rcu_state *rsp)
printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
smp_processor_id(), jiffies - rsp->gp_start);
- dump_stack();
+ trigger_all_cpu_backtrace();
+
spin_lock_irqsave(&rnp->lock, flags);
if ((long)(jiffies - rsp->jiffies_stall) >= 0)
rsp->jiffies_stall =
jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
spin_unlock_irqrestore(&rnp->lock, flags);
+
set_need_resched(); /* kick ourselves to get things going. */
}
@@ -500,8 +502,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
/* We haven't checked in, so go dump stack. */
print_cpu_stall(rsp);
- } else if (rsp->gpnum != rsp->completed &&
- delta >= RCU_STALL_RAT_DELAY) {
+ } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) {
/* They had two time units to dump stack, so complain. */
print_other_cpu_stall(rsp);
@@ -523,13 +524,33 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
/*
* Update CPU-local rcu_data state to record the newly noticed grace period.
* This is used both when we started the grace period and when we notice
- * that someone else started the grace period.
+ * that someone else started the grace period. The caller must hold the
+ * ->lock of the leaf rcu_node structure corresponding to the current CPU,
+ * and must have irqs disabled.
*/
+static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
+{
+ if (rdp->gpnum != rnp->gpnum) {
+ rdp->qs_pending = 1;
+ rdp->passed_quiesc = 0;
+ rdp->gpnum = rnp->gpnum;
+ }
+}
+
static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
{
- rdp->qs_pending = 1;
- rdp->passed_quiesc = 0;
- rdp->gpnum = rsp->gpnum;
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ local_irq_save(flags);
+ rnp = rdp->mynode;
+ if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
+ !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
+ local_irq_restore(flags);
+ return;
+ }
+ __note_new_gpnum(rsp, rnp, rdp);
+ spin_unlock_irqrestore(&rnp->lock, flags);
}
/*
@@ -553,6 +574,79 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
}
/*
+ * Advance this CPU's callbacks, but only if the current grace period
+ * has ended. This may be called only from the CPU to whom the rdp
+ * belongs. In addition, the corresponding leaf rcu_node structure's
+ * ->lock must be held by the caller, with irqs disabled.
+ */
+static void
+__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
+{
+ /* Did another grace period end? */
+ if (rdp->completed != rnp->completed) {
+
+ /* Advance callbacks. No harm if list empty. */
+ rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
+ rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
+ rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+ /* Remember that we saw this grace-period completion. */
+ rdp->completed = rnp->completed;
+ }
+}
+
+/*
+ * Advance this CPU's callbacks, but only if the current grace period
+ * has ended. This may be called only from the CPU to whom the rdp
+ * belongs.
+ */
+static void
+rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ local_irq_save(flags);
+ rnp = rdp->mynode;
+ if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
+ !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
+ local_irq_restore(flags);
+ return;
+ }
+ __rcu_process_gp_end(rsp, rnp, rdp);
+ spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * Do per-CPU grace-period initialization for running CPU. The caller
+ * must hold the lock of the leaf rcu_node structure corresponding to
+ * this CPU.
+ */
+static void
+rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
+{
+ /* Prior grace period ended, so advance callbacks for current CPU. */
+ __rcu_process_gp_end(rsp, rnp, rdp);
+
+ /*
+ * Because this CPU just now started the new grace period, we know
+ * that all of its callbacks will be covered by this upcoming grace
+ * period, even the ones that were registered arbitrarily recently.
+ * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
+ *
+ * Other CPUs cannot be sure exactly when the grace period started.
+ * Therefore, their recently registered callbacks must pass through
+ * an additional RCU_NEXT_READY stage, so that they will be handled
+ * by the next RCU grace period.
+ */
+ rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+ rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+ /* Set state so that this CPU will detect the next quiescent state. */
+ __note_new_gpnum(rsp, rnp, rdp);
+}
+
+/*
* Start a new RCU grace period if warranted, re-initializing the hierarchy
* in preparation for detecting the next grace period. The caller must hold
* the root node's ->lock, which is released before return. Hard irqs must
@@ -564,34 +658,43 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
{
struct rcu_data *rdp = rsp->rda[smp_processor_id()];
struct rcu_node *rnp = rcu_get_root(rsp);
- struct rcu_node *rnp_cur;
- struct rcu_node *rnp_end;
if (!cpu_needs_another_gp(rsp, rdp)) {
- spin_unlock_irqrestore(&rnp->lock, flags);
+ if (rnp->completed == rsp->completed) {
+ spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ spin_unlock(&rnp->lock); /* irqs remain disabled. */
+
+ /*
+ * Propagate new ->completed value to rcu_node structures
+ * so that other CPUs don't have to wait until the start
+ * of the next grace period to process their callbacks.
+ */
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ spin_lock(&rnp->lock); /* irqs already disabled. */
+ rnp->completed = rsp->completed;
+ spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ }
+ local_irq_restore(flags);
return;
}
/* Advance to a new grace period and initialize state. */
rsp->gpnum++;
+ WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
record_gp_stall_check_time(rsp);
- dyntick_record_completed(rsp, rsp->completed - 1);
- note_new_gpnum(rsp, rdp);
-
- /*
- * Because we are first, we know that all our callbacks will
- * be covered by this upcoming grace period, even the ones
- * that were registered arbitrarily recently.
- */
- rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
- rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
/* Special-case the common single-level case. */
if (NUM_RCU_NODES == 1) {
+ rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
+ rnp->gpnum = rsp->gpnum;
+ rnp->completed = rsp->completed;
rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
+ rcu_start_gp_per_cpu(rsp, rnp, rdp);
spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
@@ -603,88 +706,71 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
spin_lock(&rsp->onofflock); /* irqs already disabled. */
/*
- * Set the quiescent-state-needed bits in all the non-leaf RCU
- * nodes for all currently online CPUs. This operation relies
- * on the layout of the hierarchy within the rsp->node[] array.
- * Note that other CPUs will access only the leaves of the
- * hierarchy, which still indicate that no grace period is in
- * progress. In addition, we have excluded CPU-hotplug operations.
- *
- * We therefore do not need to hold any locks. Any required
- * memory barriers will be supplied by the locks guarding the
- * leaf rcu_nodes in the hierarchy.
- */
-
- rnp_end = rsp->level[NUM_RCU_LVLS - 1];
- for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
- rnp_cur->qsmask = rnp_cur->qsmaskinit;
-
- /*
- * Now set up the leaf nodes. Here we must be careful. First,
- * we need to hold the lock in order to exclude other CPUs, which
- * might be contending for the leaf nodes' locks. Second, as
- * soon as we initialize a given leaf node, its CPUs might run
- * up the rest of the hierarchy. We must therefore acquire locks
- * for each node that we touch during this stage. (But we still
- * are excluding CPU-hotplug operations.)
+ * Set the quiescent-state-needed bits in all the rcu_node
+ * structures for all currently online CPUs in breadth-first
+ * order, starting from the root rcu_node structure. This
+ * operation relies on the layout of the hierarchy within the
+ * rsp->node[] array. Note that other CPUs will access only
+ * the leaves of the hierarchy, which still indicate that no
+ * grace period is in progress, at least until the corresponding
+ * leaf node has been initialized. In addition, we have excluded
+ * CPU-hotplug operations.
*
* Note that the grace period cannot complete until we finish
* the initialization process, as there will be at least one
* qsmask bit set in the root node until that time, namely the
- * one corresponding to this CPU.
+ * one corresponding to this CPU, due to the fact that we have
+ * irqs disabled.
*/
- rnp_end = &rsp->node[NUM_RCU_NODES];
- rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
- for (; rnp_cur < rnp_end; rnp_cur++) {
- spin_lock(&rnp_cur->lock); /* irqs already disabled. */
- rnp_cur->qsmask = rnp_cur->qsmaskinit;
- spin_unlock(&rnp_cur->lock); /* irqs already disabled. */
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ spin_lock(&rnp->lock); /* irqs already disabled. */
+ rcu_preempt_check_blocked_tasks(rnp);
+ rnp->qsmask = rnp->qsmaskinit;
+ rnp->gpnum = rsp->gpnum;
+ rnp->completed = rsp->completed;
+ if (rnp == rdp->mynode)
+ rcu_start_gp_per_cpu(rsp, rnp, rdp);
+ spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
+ rnp = rcu_get_root(rsp);
+ spin_lock(&rnp->lock); /* irqs already disabled. */
rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
+ spin_unlock(&rnp->lock); /* irqs remain disabled. */
spin_unlock_irqrestore(&rsp->onofflock, flags);
}
/*
- * Advance this CPU's callbacks, but only if the current grace period
- * has ended. This may be called only from the CPU to whom the rdp
- * belongs.
+ * Report a full set of quiescent states to the specified rcu_state
+ * data structure. This involves cleaning up after the prior grace
+ * period and letting rcu_start_gp() start up the next grace period
+ * if one is needed. Note that the caller must hold rnp->lock, as
+ * required by rcu_start_gp(), which will release it.
*/
-static void
-rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
+static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
+ __releases(rcu_get_root(rsp)->lock)
{
- long completed_snap;
- unsigned long flags;
-
- local_irq_save(flags);
- completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */
-
- /* Did another grace period end? */
- if (rdp->completed != completed_snap) {
-
- /* Advance callbacks. No harm if list empty. */
- rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
- rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
- rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-
- /* Remember that we saw this grace-period completion. */
- rdp->completed = completed_snap;
- }
- local_irq_restore(flags);
+ WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
+ rsp->completed = rsp->gpnum;
+ rsp->signaled = RCU_GP_IDLE;
+ rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
}
/*
- * Similar to cpu_quiet(), for which it is a helper function. Allows
- * a group of CPUs to be quieted at one go, though all the CPUs in the
- * group must be represented by the same leaf rcu_node structure.
- * That structure's lock must be held upon entry, and it is released
- * before return.
+ * Similar to rcu_report_qs_rdp(), for which it is a helper function.
+ * Allows quiescent states for a group of CPUs to be reported at one go
+ * to the specified rcu_node structure, though all the CPUs in the group
+ * must be represented by the same rcu_node structure (which need not be
+ * a leaf rcu_node structure, though it often will be). That structure's
+ * lock must be held upon entry, and it is released before return.
*/
static void
-cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
- unsigned long flags)
+rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
+ struct rcu_node *rnp, unsigned long flags)
__releases(rnp->lock)
{
+ struct rcu_node *rnp_c;
+
/* Walk up the rcu_node hierarchy. */
for (;;) {
if (!(rnp->qsmask & mask)) {
@@ -694,7 +780,7 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
return;
}
rnp->qsmask &= ~mask;
- if (rnp->qsmask != 0) {
+ if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
/* Other bits still set at this level, so done. */
spin_unlock_irqrestore(&rnp->lock, flags);
@@ -708,31 +794,31 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
break;
}
spin_unlock_irqrestore(&rnp->lock, flags);
+ rnp_c = rnp;
rnp = rnp->parent;
spin_lock_irqsave(&rnp->lock, flags);
+ WARN_ON_ONCE(rnp_c->qsmask);
}
/*
* Get here if we are the last CPU to pass through a quiescent
- * state for this grace period. Clean up and let rcu_start_gp()
- * start up the next grace period if one is needed. Note that
- * we still hold rnp->lock, as required by rcu_start_gp(), which
- * will release it.
+ * state for this grace period. Invoke rcu_report_qs_rsp()
+ * to clean up and start the next grace period if one is needed.
*/
- rsp->completed = rsp->gpnum;
- rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
- rcu_start_gp(rsp, flags); /* releases rnp->lock. */
+ rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
}
/*
- * Record a quiescent state for the specified CPU, which must either be
- * the current CPU or an offline CPU. The lastcomp argument is used to
- * make sure we are still in the grace period of interest. We don't want
- * to end the current grace period based on quiescent states detected in
- * an earlier grace period!
+ * Record a quiescent state for the specified CPU to that CPU's rcu_data
+ * structure. This must be either called from the specified CPU, or
+ * called when the specified CPU is known to be offline (and when it is
+ * also known that no other CPU is concurrently trying to help the offline
+ * CPU). The lastcomp argument is used to make sure we are still in the
+ * grace period of interest. We don't want to end the current grace period
+ * based on quiescent states detected in an earlier grace period!
*/
static void
-cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
+rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
{
unsigned long flags;
unsigned long mask;
@@ -740,15 +826,15 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
rnp = rdp->mynode;
spin_lock_irqsave(&rnp->lock, flags);
- if (lastcomp != ACCESS_ONCE(rsp->completed)) {
+ if (lastcomp != rnp->completed) {
/*
* Someone beat us to it for this grace period, so leave.
* The race with GP start is resolved by the fact that we
* hold the leaf rcu_node lock, so that the per-CPU bits
* cannot yet be initialized -- so we would simply find our
- * CPU's bit already cleared in cpu_quiet_msk() if this race
- * occurred.
+ * CPU's bit already cleared in rcu_report_qs_rnp() if this
+ * race occurred.
*/
rdp->passed_quiesc = 0; /* try again later! */
spin_unlock_irqrestore(&rnp->lock, flags);
@@ -764,10 +850,9 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
*/
- rdp = rsp->rda[smp_processor_id()];
rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
- cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
+ rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
}
}
@@ -798,74 +883,113 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
if (!rdp->passed_quiesc)
return;
- /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */
- cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
+ /*
+ * Tell RCU we are done (but rcu_report_qs_rdp() will be the
+ * judge of that).
+ */
+ rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
}
#ifdef CONFIG_HOTPLUG_CPU
/*
+ * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
+ * specified flavor of RCU. The callbacks will be adopted by the next
+ * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
+ * comes first. Because this is invoked from the CPU_DYING notifier,
+ * irqs are already disabled.
+ */
+static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
+{
+ int i;
+ struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+
+ if (rdp->nxtlist == NULL)
+ return; /* irqs disabled, so comparison is stable. */
+ spin_lock(&rsp->onofflock); /* irqs already disabled. */
+ *rsp->orphan_cbs_tail = rdp->nxtlist;
+ rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
+ rdp->nxtlist = NULL;
+ for (i = 0; i < RCU_NEXT_SIZE; i++)
+ rdp->nxttail[i] = &rdp->nxtlist;
+ rsp->orphan_qlen += rdp->qlen;
+ rdp->qlen = 0;
+ spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
+}
+
+/*
+ * Adopt previously orphaned RCU callbacks.
+ */
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+
+ spin_lock_irqsave(&rsp->onofflock, flags);
+ rdp = rsp->rda[smp_processor_id()];
+ if (rsp->orphan_cbs_list == NULL) {
+ spin_unlock_irqrestore(&rsp->onofflock, flags);
+ return;
+ }
+ *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
+ rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
+ rdp->qlen += rsp->orphan_qlen;
+ rsp->orphan_cbs_list = NULL;
+ rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
+ rsp->orphan_qlen = 0;
+ spin_unlock_irqrestore(&rsp->onofflock, flags);
+}
+
+/*
* Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
* and move all callbacks from the outgoing CPU to the current one.
*/
static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
{
- int i;
unsigned long flags;
- long lastcomp;
unsigned long mask;
+ int need_report = 0;
struct rcu_data *rdp = rsp->rda[cpu];
- struct rcu_data *rdp_me;
struct rcu_node *rnp;
/* Exclude any attempts to start a new grace period. */
spin_lock_irqsave(&rsp->onofflock, flags);
/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
- rnp = rdp->mynode;
+ rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
mask = rdp->grpmask; /* rnp->grplo is constant. */
do {
spin_lock(&rnp->lock); /* irqs already disabled. */
rnp->qsmaskinit &= ~mask;
if (rnp->qsmaskinit != 0) {
- spin_unlock(&rnp->lock); /* irqs already disabled. */
+ if (rnp != rdp->mynode)
+ spin_unlock(&rnp->lock); /* irqs remain disabled. */
break;
}
+ if (rnp == rdp->mynode)
+ need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
+ else
+ spin_unlock(&rnp->lock); /* irqs remain disabled. */
mask = rnp->grpmask;
- spin_unlock(&rnp->lock); /* irqs already disabled. */
rnp = rnp->parent;
} while (rnp != NULL);
- lastcomp = rsp->completed;
-
- spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
-
- /* Being offline is a quiescent state, so go record it. */
- cpu_quiet(cpu, rsp, rdp, lastcomp);
/*
- * Move callbacks from the outgoing CPU to the running CPU.
- * Note that the outgoing CPU is now quiscent, so it is now
- * (uncharacteristically) safe to access it rcu_data structure.
- * Note also that we must carefully retain the order of the
- * outgoing CPU's callbacks in order for rcu_barrier() to work
- * correctly. Finally, note that we start all the callbacks
- * afresh, even those that have passed through a grace period
- * and are therefore ready to invoke. The theory is that hotplug
- * events are rare, and that if they are frequent enough to
- * indefinitely delay callbacks, you have far worse things to
- * be worrying about.
+ * We still hold the leaf rcu_node structure lock here, and
+ * irqs are still disabled. The reason for this subterfuge is
+ * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
+ * held leads to deadlock.
*/
- rdp_me = rsp->rda[smp_processor_id()];
- if (rdp->nxtlist != NULL) {
- *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
- rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
- rdp->nxtlist = NULL;
- for (i = 0; i < RCU_NEXT_SIZE; i++)
- rdp->nxttail[i] = &rdp->nxtlist;
- rdp_me->qlen += rdp->qlen;
- rdp->qlen = 0;
- }
- local_irq_restore(flags);
+ spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
+ rnp = rdp->mynode;
+ if (need_report & RCU_OFL_TASKS_NORM_GP)
+ rcu_report_unblock_qs_rnp(rnp, flags);
+ else
+ spin_unlock_irqrestore(&rnp->lock, flags);
+ if (need_report & RCU_OFL_TASKS_EXP_GP)
+ rcu_report_exp_rnp(rsp, rnp);
+
+ rcu_adopt_orphan_cbs(rsp);
}
/*
@@ -876,12 +1000,21 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
*/
static void rcu_offline_cpu(int cpu)
{
- __rcu_offline_cpu(cpu, &rcu_state);
+ __rcu_offline_cpu(cpu, &rcu_sched_state);
__rcu_offline_cpu(cpu, &rcu_bh_state);
+ rcu_preempt_offline_cpu(cpu);
}
#else /* #ifdef CONFIG_HOTPLUG_CPU */
+static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
+{
+}
+
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+{
+}
+
static void rcu_offline_cpu(int cpu)
{
}
@@ -892,7 +1025,7 @@ static void rcu_offline_cpu(int cpu)
* Invoke any RCU callbacks that have made it to the end of their grace
* period. Thottle as specified by rdp->blimit.
*/
-static void rcu_do_batch(struct rcu_data *rdp)
+static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
{
unsigned long flags;
struct rcu_head *next, *list, **tail;
@@ -945,6 +1078,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
rdp->blimit = blimit;
+ /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
+ if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
+ rdp->qlen_last_fqs_check = 0;
+ rdp->n_force_qs_snap = rsp->n_force_qs;
+ } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
+ rdp->qlen_last_fqs_check = rdp->qlen;
+
local_irq_restore(flags);
/* Re-raise the RCU softirq if there are callbacks remaining. */
@@ -963,6 +1103,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
*/
void rcu_check_callbacks(int cpu, int user)
{
+ if (!rcu_pending(cpu))
+ return; /* if nothing for RCU to do. */
if (user ||
(idle_cpu(cpu) && rcu_scheduler_active &&
!in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -971,17 +1113,16 @@ void rcu_check_callbacks(int cpu, int user)
* Get here if this CPU took its interrupt from user
* mode or from the idle loop, and if this is not a
* nested interrupt. In this case, the CPU is in
- * a quiescent state, so count it.
+ * a quiescent state, so note it.
*
* No memory barrier is required here because both
- * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference
- * only CPU-local variables that other CPUs neither
- * access nor modify, at least not while the corresponding
- * CPU is online.
+ * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
+ * variables that other CPUs neither access nor modify,
+ * at least not while the corresponding CPU is online.
*/
- rcu_qsctr_inc(cpu);
- rcu_bh_qsctr_inc(cpu);
+ rcu_sched_qs(cpu);
+ rcu_bh_qs(cpu);
} else if (!in_softirq()) {
@@ -989,11 +1130,12 @@ void rcu_check_callbacks(int cpu, int user)
* Get here if this CPU did not take its interrupt from
* softirq, in other words, if it is not interrupting
* a rcu_bh read-side critical section. This is an _bh
- * critical section, so count it.
+ * critical section, so note it.
*/
- rcu_bh_qsctr_inc(cpu);
+ rcu_bh_qs(cpu);
}
+ rcu_preempt_check_callbacks(cpu);
raise_softirq(RCU_SOFTIRQ);
}
@@ -1012,33 +1154,32 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
int cpu;
unsigned long flags;
unsigned long mask;
- struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
- struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
+ struct rcu_node *rnp;
- for (; rnp_cur < rnp_end; rnp_cur++) {
+ rcu_for_each_leaf_node(rsp, rnp) {
mask = 0;
- spin_lock_irqsave(&rnp_cur->lock, flags);
- if (rsp->completed != lastcomp) {
- spin_unlock_irqrestore(&rnp_cur->lock, flags);
+ spin_lock_irqsave(&rnp->lock, flags);
+ if (rnp->completed != lastcomp) {
+ spin_unlock_irqrestore(&rnp->lock, flags);
return 1;
}
- if (rnp_cur->qsmask == 0) {
- spin_unlock_irqrestore(&rnp_cur->lock, flags);
+ if (rnp->qsmask == 0) {
+ spin_unlock_irqrestore(&rnp->lock, flags);
continu