aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/RCU/checklist.txt26
-rw-r--r--Documentation/RCU/lockdep.txt5
-rw-r--r--Documentation/RCU/rcubarrier.txt15
-rw-r--r--Documentation/RCU/stallwarn.txt33
-rw-r--r--Documentation/RCU/whatisRCU.txt4
-rw-r--r--Documentation/kernel-parameters.txt35
-rw-r--r--Documentation/kernel-per-CPU-kthreads.txt202
-rw-r--r--include/linux/list_bl.h5
-rw-r--r--include/linux/rculist_bl.h2
-rw-r--r--include/linux/rcupdate.h1
-rw-r--r--include/trace/events/rcu.h55
-rw-r--r--init/Kconfig73
-rw-r--r--kernel/rcutree.c260
-rw-r--r--kernel/rcutree.h41
-rw-r--r--kernel/rcutree_plugin.h601
-rw-r--r--kernel/rcutree_trace.c2
16 files changed, 842 insertions, 518 deletions
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 31ef8fe07f8..79e789b8b8e 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -217,9 +217,14 @@ over a rather long period of time, but improvements are always welcome!
whether the increased speed is worth it.
8. Although synchronize_rcu() is slower than is call_rcu(), it
- usually results in simpler code. So, unless update performance
- is critically important or the updaters cannot block,
- synchronize_rcu() should be used in preference to call_rcu().
+ usually results in simpler code. So, unless update performance is
+ critically important, the updaters cannot block, or the latency of
+ synchronize_rcu() is visible from userspace, synchronize_rcu()
+ should be used in preference to call_rcu(). Furthermore,
+ kfree_rcu() usually results in even simpler code than does
+ synchronize_rcu() without synchronize_rcu()'s multi-millisecond
+ latency. So please take advantage of kfree_rcu()'s "fire and
+ forget" memory-freeing capabilities where it applies.
An especially important property of the synchronize_rcu()
primitive is that it automatically self-limits: if grace periods
@@ -268,7 +273,8 @@ over a rather long period of time, but improvements are always welcome!
e. Periodically invoke synchronize_rcu(), permitting a limited
number of updates per grace period.
- The same cautions apply to call_rcu_bh() and call_rcu_sched().
+ The same cautions apply to call_rcu_bh(), call_rcu_sched(),
+ call_srcu(), and kfree_rcu().
9. All RCU list-traversal primitives, which include
rcu_dereference(), list_for_each_entry_rcu(), and
@@ -296,9 +302,9 @@ over a rather long period of time, but improvements are always welcome!
all currently executing rcu_read_lock()-protected RCU read-side
critical sections complete. It does -not- necessarily guarantee
that all currently running interrupts, NMIs, preempt_disable()
- code, or idle loops will complete. Therefore, if you do not have
- rcu_read_lock()-protected read-side critical sections, do -not-
- use synchronize_rcu().
+ code, or idle loops will complete. Therefore, if your
+ read-side critical sections are protected by something other
+ than rcu_read_lock(), do -not- use synchronize_rcu().
Similarly, disabling preemption is not an acceptable substitute
for rcu_read_lock(). Code that attempts to use preemption
@@ -401,9 +407,9 @@ over a rather long period of time, but improvements are always welcome!
read-side critical sections. It is the responsibility of the
RCU update-side primitives to deal with this.
-17. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and
- the __rcu sparse checks to validate your RCU code. These
- can help find problems as follows:
+17. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
+ __rcu sparse checks (enabled by CONFIG_SPARSE_RCU_POINTER) to
+ validate your RCU code. These can help find problems as follows:
CONFIG_PROVE_RCU: check that accesses to RCU-protected data
structures are carried out under the proper RCU
diff --git a/Documentation/RCU/lockdep.txt b/Documentation/RCU/lockdep.txt
index a102d4b3724..cd83d2348fe 100644
--- a/Documentation/RCU/lockdep.txt
+++ b/Documentation/RCU/lockdep.txt
@@ -64,6 +64,11 @@ checking of rcu_dereference() primitives:
but retain the compiler constraints that prevent duplicating
or coalescsing. This is useful when when testing the
value of the pointer itself, for example, against NULL.
+ rcu_access_index(idx):
+ Return the value of the index and omit all barriers, but
+ retain the compiler constraints that prevent duplicating
+ or coalescsing. This is useful when when testing the
+ value of the index itself, for example, against -1.
The rcu_dereference_check() check expression can be any boolean
expression, but would normally include a lockdep expression. However,
diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt
index 38428c12513..2e319d1b9ef 100644
--- a/Documentation/RCU/rcubarrier.txt
+++ b/Documentation/RCU/rcubarrier.txt
@@ -79,7 +79,20 @@ complete. Pseudo-code using rcu_barrier() is as follows:
2. Execute rcu_barrier().
3. Allow the module to be unloaded.
-The rcutorture module makes use of rcu_barrier in its exit function
+There are also rcu_barrier_bh(), rcu_barrier_sched(), and srcu_barrier()
+functions for the other flavors of RCU, and you of course must match
+the flavor of rcu_barrier() with that of call_rcu(). If your module
+uses multiple flavors of call_rcu(), then it must also use multiple
+flavors of rcu_barrier() when unloading that module. For example, if
+it uses call_rcu_bh(), call_srcu() on srcu_struct_1, and call_srcu() on
+srcu_struct_2(), then the following three lines of code will be required
+when unloading:
+
+ 1 rcu_barrier_bh();
+ 2 srcu_barrier(&srcu_struct_1);
+ 3 srcu_barrier(&srcu_struct_2);
+
+The rcutorture module makes use of rcu_barrier() in its exit function
as follows:
1 static void
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 1927151b386..e38b8df3d72 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -92,14 +92,14 @@ If the CONFIG_RCU_CPU_STALL_INFO kernel configuration parameter is set,
more information is printed with the stall-warning message, for example:
INFO: rcu_preempt detected stall on CPU
- 0: (63959 ticks this GP) idle=241/3fffffffffffffff/0
+ 0: (63959 ticks this GP) idle=241/3fffffffffffffff/0 softirq=82/543
(t=65000 jiffies)
In kernels with CONFIG_RCU_FAST_NO_HZ, even more information is
printed:
INFO: rcu_preempt detected stall on CPU
- 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 drain=0 . timer not pending
+ 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 nonlazy_posted: 25 .D
(t=65000 jiffies)
The "(64628 ticks this GP)" indicates that this CPU has taken more
@@ -116,13 +116,28 @@ number between the two "/"s is the value of the nesting, which will
be a small positive number if in the idle loop and a very large positive
number (as shown above) otherwise.
-For CONFIG_RCU_FAST_NO_HZ kernels, the "drain=0" indicates that the CPU is
-not in the process of trying to force itself into dyntick-idle state, the
-"." indicates that the CPU has not given up forcing RCU into dyntick-idle
-mode (it would be "H" otherwise), and the "timer not pending" indicates
-that the CPU has not recently forced RCU into dyntick-idle mode (it
-would otherwise indicate the number of microseconds remaining in this
-forced state).
+The "softirq=" portion of the message tracks the number of RCU softirq
+handlers that the stalled CPU has executed. The number before the "/"
+is the number that had executed since boot at the time that this CPU
+last noted the beginning of a grace period, which might be the current
+(stalled) grace period, or it might be some earlier grace period (for
+example, if the CPU might have been in dyntick-idle mode for an extended
+time period. The number after the "/" is the number that have executed
+since boot until the current time. If this latter number stays constant
+across repeated stall-warning messages, it is possible that RCU's softirq
+handlers are no longer able to execute on this CPU. This can happen if
+the stalled CPU is spinning with interrupts are disabled, or, in -rt
+kernels, if a high-priority process is starving RCU's softirq handler.
+
+For CONFIG_RCU_FAST_NO_HZ kernels, the "last_accelerate:" prints the
+low-order 16 bits (in hex) of the jiffies counter when this CPU last
+invoked rcu_try_advance_all_cbs() from rcu_needs_cpu() or last invoked
+rcu_accelerate_cbs() from rcu_prepare_for_idle(). The "nonlazy_posted:"
+prints the number of non-lazy callbacks posted since the last call to
+rcu_needs_cpu(). Finally, an "L" indicates that there are currently
+no non-lazy callbacks ("." is printed otherwise, as shown above) and
+"D" indicates that dyntick-idle processing is enabled ("." is printed
+otherwise, for example, if disabled via the "nohz=" kernel boot parameter).
Multiple Warnings From One Stall
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 0cc7820967f..10df0b82f45 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -265,9 +265,9 @@ rcu_dereference()
rcu_read_lock();
p = rcu_dereference(head.next);
rcu_read_unlock();
- x = p->address;
+ x = p->address; /* BUG!!! */
rcu_read_lock();
- y = p->data;
+ y = p->data; /* BUG!!! */
rcu_read_unlock();
Holding a reference from one RCU read-side critical section
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 8ccbf27aead..52ecc9b8467 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2484,9 +2484,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
In kernels built with CONFIG_RCU_NOCB_CPU=y, set
the specified list of CPUs to be no-callback CPUs.
Invocation of these CPUs' RCU callbacks will
- be offloaded to "rcuoN" kthreads created for
- that purpose. This reduces OS jitter on the
+ be offloaded to "rcuox/N" kthreads created for
+ that purpose, where "x" is "b" for RCU-bh, "p"
+ for RCU-preempt, and "s" for RCU-sched, and "N"
+ is the CPU number. This reduces OS jitter on the
offloaded CPUs, which can be useful for HPC and
+
real-time workloads. It can also improve energy
efficiency for asymmetric multiprocessors.
@@ -2510,6 +2513,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
leaf rcu_node structure. Useful for very large
systems.
+ rcutree.jiffies_till_first_fqs= [KNL,BOOT]
+ Set delay from grace-period initialization to
+ first attempt to force quiescent states.
+ Units are jiffies, minimum value is zero,
+ and maximum value is HZ.
+
+ rcutree.jiffies_till_next_fqs= [KNL,BOOT]
+ Set delay between subsequent attempts to force
+ quiescent states. Units are jiffies, minimum
+ value is one, and maximum value is HZ.
+
rcutree.qhimark= [KNL,BOOT]
Set threshold of queued
RCU callbacks over which batch limiting is disabled.
@@ -2524,16 +2538,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
rcutree.rcu_cpu_stall_timeout= [KNL,BOOT]
Set timeout for RCU CPU stall warning messages.
- rcutree.jiffies_till_first_fqs= [KNL,BOOT]
- Set delay from grace-period initialization to
- first attempt to force quiescent states.
- Units are jiffies, minimum value is zero,
- and maximum value is HZ.
+ rcutree.rcu_idle_gp_delay= [KNL,BOOT]
+ Set wakeup interval for idle CPUs that have
+ RCU callbacks (RCU_FAST_NO_HZ=y).
- rcutree.jiffies_till_next_fqs= [KNL,BOOT]
- Set delay between subsequent attempts to force
- quiescent states. Units are jiffies, minimum
- value is one, and maximum value is HZ.
+ rcutree.rcu_idle_lazy_gp_delay= [KNL,BOOT]
+ Set wakeup interval for idle CPUs that have
+ only "lazy" RCU callbacks (RCU_FAST_NO_HZ=y).
+ Lazy RCU callbacks are those which RCU can
+ prove do nothing more than free memory.
rcutorture.fqs_duration= [KNL,BOOT]
Set duration of force_quiescent_state bursts.
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
new file mode 100644
index 00000000000..cbf7ae412da
--- /dev/null
+++ b/Documentation/kernel-per-CPU-kthreads.txt
@@ -0,0 +1,202 @@
+REDUCING OS JITTER DUE TO PER-CPU KTHREADS
+
+This document lists per-CPU kthreads in the Linux kernel and presents
+options to control their OS jitter. Note that non-per-CPU kthreads are
+not listed here. To reduce OS jitter from non-per-CPU kthreads, bind
+them to a "housekeeping" CPU dedicated to such work.
+
+
+REFERENCES
+
+o Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs.
+
+o Documentation/cgroups: Using cgroups to bind tasks to sets of CPUs.
+
+o man taskset: Using the taskset command to bind tasks to sets
+ of CPUs.
+
+o man sched_setaffinity: Using the sched_setaffinity() system
+ call to bind tasks to sets of CPUs.
+
+o /sys/devices/system/cpu/cpuN/online: Control CPU N's hotplug state,
+ writing "0" to offline and "1" to online.
+
+o In order to locate kernel-generated OS jitter on CPU N:
+
+ cd /sys/kernel/debug/tracing
+ echo 1 > max_graph_depth # Increase the "1" for more detail
+ echo function_graph > current_tracer
+ # run workload
+ cat per_cpu/cpuN/trace
+
+
+KTHREADS
+
+Name: ehca_comp/%u
+Purpose: Periodically process Infiniband-related work.
+To reduce its OS jitter, do any of the following:
+1. Don't use eHCA Infiniband hardware, instead choosing hardware
+ that does not require per-CPU kthreads. This will prevent these
+ kthreads from being created in the first place. (This will
+ work for most people, as this hardware, though important, is
+ relatively old and is produced in relatively low unit volumes.)
+2. Do all eHCA-Infiniband-related work on other CPUs, including
+ interrupts.
+3. Rework the eHCA driver so that its per-CPU kthreads are
+ provisioned only on selected CPUs.
+
+
+Name: irq/%d-%s
+Purpose: Handle threaded interrupts.
+To reduce its OS jitter, do the following:
+1. Use irq affinity to force the irq threads to execute on
+ some other CPU.
+
+Name: kcmtpd_ctr_%d
+Purpose: Handle Bluetooth work.
+To reduce its OS jitter, do one of the following:
+1. Don't use Bluetooth, in which case these kthreads won't be
+ created in the first place.
+2. Use irq affinity to force Bluetooth-related interrupts to
+ occur on some other CPU and furthermore initiate all
+ Bluetooth activity on some other CPU.
+
+Name: ksoftirqd/%u
+Purpose: Execute softirq handlers when threaded or when under heavy load.
+To reduce its OS jitter, each softirq vector must be handled
+separately as follows:
+TIMER_SOFTIRQ: Do all of the following:
+1. To the extent possible, keep the CPU out of the kernel when it
+ is non-idle, for example, by avoiding system calls and by forcing
+ both kernel threads and interrupts to execute elsewhere.
+2. Build with CONFIG_HOTPLUG_CPU=y. After boot completes, force
+ the CPU offline, then bring it back online. This forces
+ recurring timers to migrate elsewhere. If you are concerned
+ with multiple CPUs, force them all offline before bringing the
+ first one back online. Once you have onlined the CPUs in question,
+ do not offline any other CPUs, because doing so could force the
+ timer back onto one of the CPUs in question.
+NET_TX_SOFTIRQ and NET_RX_SOFTIRQ: Do all of the following:
+1. Force networking interrupts onto other CPUs.
+2. Initiate any network I/O on other CPUs.
+3. Once your application has started, prevent CPU-hotplug operations
+ from being initiated from tasks that might run on the CPU to
+ be de-jittered. (It is OK to force this CPU offline and then
+ bring it back online before you start your application.)
+BLOCK_SOFTIRQ: Do all of the following:
+1. Force block-device interrupts onto some other CPU.
+2. Initiate any block I/O on other CPUs.
+3. Once your application has started, prevent CPU-hotplug operations
+ from being initiated from tasks that might run on the CPU to
+ be de-jittered. (It is OK to force this CPU offline and then
+ bring it back online before you start your application.)
+BLOCK_IOPOLL_SOFTIRQ: Do all of the following:
+1. Force block-device interrupts onto some other CPU.
+2. Initiate any block I/O and block-I/O polling on other CPUs.
+3. Once your application has started, prevent CPU-hotplug operations
+ from being initiated from tasks that might run on the CPU to
+ be de-jittered. (It is OK to force this CPU offline and then
+ bring it back online before you start your application.)
+TASKLET_SOFTIRQ: Do one or more of the following:
+1. Avoid use of drivers that use tasklets. (Such drivers will contain
+ calls to things like tasklet_schedule().)
+2. Convert all drivers that you must use from tasklets to workqueues.
+3. Force interrupts for drivers using tasklets onto other CPUs,
+ and also do I/O involving these drivers on other CPUs.
+SCHED_SOFTIRQ: Do all of the following:
+1. Avoid sending scheduler IPIs to the CPU to be de-jittered,
+ for example, ensure that at most one runnable kthread is present
+ on that CPU. If a thread that expects to run on the de-jittered
+ CPU awakens, the scheduler will send an IPI that can result in
+ a subsequent SCHED_SOFTIRQ.
+2. Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y,
+ CONFIG_NO_HZ_FULL=y, and, in addition, ensure that the CPU
+ to be de-jittered is marked as an adaptive-ticks CPU using the
+ "nohz_full=" boot parameter. This reduces the number of
+ scheduler-clock interrupts that the de-jittered CPU receives,
+ minimizing its chances of being selected to do the load balancing
+ work that runs in SCHED_SOFTIRQ context.
+3. To the extent possible, keep the CPU out of the kernel when it
+ is non-idle, for example, by avoiding system calls and by
+ forcing both kernel threads and interrupts to execute elsewhere.
+ This further reduces the number of scheduler-clock interrupts
+ received by the de-jittered CPU.
+HRTIMER_SOFTIRQ: Do all of the following:
+1. To the extent possible, keep the CPU out of the kernel when it
+ is non-idle. For example, avoid system calls and force both
+ kernel threads and interrupts to execute elsewhere.
+2. Build with CONFIG_HOTPLUG_CPU=y. Once boot completes, force the
+ CPU offline, then bring it back online. This forces recurring
+ timers to migrate elsewhere. If you are concerned with multiple
+ CPUs, force them all offline before bringing the first one
+ back online. Once you have onlined the CPUs in question, do not
+ offline any other CPUs, because doing so could force the timer
+ back onto one of the CPUs in question.
+RCU_SOFTIRQ: Do at least one of the following:
+1. Offload callbacks and keep the CPU in either dyntick-idle or
+ adaptive-ticks state by doing all of the following:
+ a. Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y,
+ CONFIG_NO_HZ_FULL=y, and, in addition ensure that the CPU
+ to be de-jittered is marked as an adaptive-ticks CPU using
+ the "nohz_full=" boot parameter. Bind the rcuo kthreads
+ to housekeeping CPUs, which can tolerate OS jitter.
+ b. To the extent possible, keep the CPU out of the kernel
+ when it is non-idle, for example, by avoiding system
+ calls and by forcing both kernel threads and interrupts
+ to execute elsewhere.
+2. Enable RCU to do its processing remotely via dyntick-idle by
+ doing all of the following:
+ a. Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y.
+ b. Ensure that the CPU goes idle frequently, allowing other
+ CPUs to detect that it has passed through an RCU quiescent
+ state. If the kernel is built with CONFIG_NO_HZ_FULL=y,
+ userspace execution also allows other CPUs to detect that
+ the CPU in question has passed through a quiescent state.
+ c. To the extent possible, keep the CPU out of the kernel
+ when it is non-idle, for example, by avoiding system
+ calls and by forcing both kernel threads and interrupts
+ to execute elsewhere.
+
+Name: rcuc/%u
+Purpose: Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels.
+To reduce its OS jitter, do at least one of the following:
+1. Build the kernel with CONFIG_PREEMPT=n. This prevents these
+ kthreads from being created in the first place, and also obviates
+ the need for RCU priority boosting. This approach is feasible
+ for workloads that do not require high degrees of responsiveness.
+2. Build the kernel with CONFIG_RCU_BOOST=n. This prevents these
+ kthreads from being created in the first place. This approach
+ is feasible only if your workload never requires RCU priority
+ boosting, for example, if you ensure frequent idle time on all
+ CPUs that might execute within the kernel.
+3. Build with CONFIG_RCU_NOCB_CPU=y and CONFIG_RCU_NOCB_CPU_ALL=y,
+ which offloads all RCU callbacks to kthreads that can be moved
+ off of CPUs susceptible to OS jitter. This approach prevents the
+ rcuc/%u kthreads from having any work to do, so that they are
+ never awakened.
+4. Ensure that the CPU never enters the kernel, and, in particular,
+ avoid initiating any CPU hotplug operations on this CPU. This is
+ another way of preventing any callbacks from being queued on the
+ CPU, again preventing the rcuc/%u kthreads from having any work
+ to do.
+
+Name: rcuob/%d, rcuop/%d, and rcuos/%d
+Purpose: Offload RCU callbacks from the corresponding CPU.
+To reduce its OS jitter, do at least one of the following:
+1. Use affinity, cgroups, or other mechanism to force these kthreads
+ to execute on some other CPU.
+2. Build with CONFIG_RCU_NOCB_CPUS=n, which will prevent these
+ kthreads from being created in the first place. However, please
+ note that this will not eliminate OS jitter, but will instead
+ shift it to RCU_SOFTIRQ.
+
+Name: watchdog/%u
+Purpose: Detect software lockups on each CPU.
+To reduce its OS jitter, do at least one of the following:
+1. Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these
+ kthreads from being created in the first place.
+2. Echo a zero to /proc/sys/kernel/watchdog to disable the
+ watchdog timer.
+3. Echo a large number of /proc/sys/kernel/watchdog_thresh in
+ order to reduce the frequency of OS jitter due to the watchdog
+ timer down to a level that is acceptable for your workload.
diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
index 31f9d75adc5..2eb88556c5c 100644
--- a/include/linux/list_bl.h
+++ b/include/linux/list_bl.h
@@ -125,6 +125,11 @@ static inline void hlist_bl_unlock(struct hlist_bl_head *b)
__bit_spin_unlock(0, (unsigned long *)b);
}
+static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
+{
+ return bit_spin_is_locked(0, (unsigned long *)b);
+}
+
/**
* hlist_bl_for_each_entry - iterate over list of given type
* @tpos: the type * to use as a loop cursor.
diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h
index cf1244fbf3b..4f216c59e7d 100644
--- a/include/linux/rculist_bl.h
+++ b/include/linux/rculist_bl.h
@@ -20,7 +20,7 @@ static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h,
static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
{
return (struct hlist_bl_node *)
- ((unsigned long)rcu_dereference(h->first) & ~LIST_BL_LOCKMASK);
+ ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK);
}
/**
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index b758ce17b30..9ed2c9a4de4 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -80,6 +80,7 @@ extern void do_trace_rcu_torture_read(char *rcutorturename,
#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
+#define ulong2long(a) (*(long *)(&(a)))
/* Exported common interfaces */
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 1918e832da4..59ebcc89f14 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -72,6 +72,58 @@ TRACE_EVENT(rcu_grace_period,
);
/*
+ * Tracepoint for future grace-period events, including those for no-callbacks
+ * CPUs. The caller should pull the data from the rcu_node structure,
+ * other than rcuname, which comes from the rcu_state structure, and event,
+ * which is one of the following:
+ *
+ * "Startleaf": Request a nocb grace period based on leaf-node data.
+ * "Startedleaf": Leaf-node start proved sufficient.
+ * "Startedleafroot": Leaf-node start proved sufficient after checking root.
+ * "Startedroot": Requested a nocb grace period based on root-node data.
+ * "StartWait": Start waiting for the requested grace period.
+ * "ResumeWait": Resume waiting after signal.
+ * "EndWait": Complete wait.
+ * "Cleanup": Clean up rcu_node structure after previous GP.
+ * "CleanupMore": Clean up, and another no-CB GP is needed.
+ */
+TRACE_EVENT(rcu_future_grace_period,
+
+ TP_PROTO(char *rcuname, unsigned long gpnum, unsigned long completed,
+ unsigned long c, u8 level, int grplo, int grphi,
+ char *gpevent),
+
+ TP_ARGS(rcuname, gpnum, completed, c, level, grplo, grphi, gpevent),
+
+ TP_STRUCT__entry(
+ __field(char *, rcuname)
+ __field(unsigned long, gpnum)
+ __field(unsigned long, completed)
+ __field(unsigned long, c)
+ __field(u8, level)
+ __field(int, grplo)
+ __field(int, grphi)
+ __field(char *, gpevent)
+ ),
+
+ TP_fast_assign(
+ __entry->rcuname = rcuname;
+ __entry->gpnum = gpnum;
+ __entry->completed = completed;
+ __entry->c = c;
+ __entry->level = level;
+ __entry->grplo = grplo;
+ __entry->grphi = grphi;
+ __entry->gpevent = gpevent;
+ ),
+
+ TP_printk("%s %lu %lu %lu %u %d %d %s",
+ __entry->rcuname, __entry->gpnum, __entry->completed,
+ __entry->c, __entry->level, __entry->grplo, __entry->grphi,
+ __entry->gpevent)
+);
+
+/*
* Tracepoint for grace-period-initialization events. These are
* distinguished by the type of RCU, the new grace-period number, the
* rcu_node structure level, the starting and ending CPU covered by the
@@ -601,6 +653,9 @@ TRACE_EVENT(rcu_barrier,
#define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0)
#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \
qsmask) do { } while (0)
+#define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \
+ level, grplo, grphi, event) \
+ do { } while (0)
#define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0)
#define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0)
#define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \
diff --git a/init/Kconfig b/init/Kconfig
index 5341d7232c3..71bb9e73011 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -578,13 +578,16 @@ config RCU_FAST_NO_HZ
depends on NO_HZ && SMP
default n
help
- This option causes RCU to attempt to accelerate grace periods in
- order to allow CPUs to enter dynticks-idle state more quickly.
- On the other hand, this option increases the overhead of the
- dynticks-idle checking, thus degrading scheduling latency.
+ This option permits CPUs to enter dynticks-idle state even if
+ they have RCU callbacks queued, and prevents RCU from waking
+ these CPUs up more than roughly once every four jiffies (by
+ default, you can adjust this using the rcutree.rcu_idle_gp_delay
+ parameter), thus improving energy efficiency. On the other
+ hand, this option increases the duration of RCU grace periods,
+ for example, slowing down synchronize_rcu().
- Say Y if energy efficiency is critically important, and you don't
- care about real-time response.
+ Say Y if energy efficiency is critically important, and you
+ don't care about increased grace-period durations.
Say N if you are unsure.
@@ -651,7 +654,7 @@ config RCU_BOOST_DELAY
Accept the default if unsure.
config RCU_NOCB_CPU
- bool "Offload RCU callback processing from boot-selected CPUs"
+ bool "Offload RCU callback processing from boot-selected CPUs (EXPERIMENTAL"
depends on TREE_RCU || TREE_PREEMPT_RCU
default n
help
@@ -662,16 +665,56 @@ config RCU_NOCB_CPU
This option offloads callback invocation from the set of
CPUs specified at boot time by the rcu_nocbs parameter.
- For each such CPU, a kthread ("rcuoN") will be created to
- invoke callbacks, where the "N" is the CPU being offloaded.
- Nothing prevents this kthread from running on the specified
- CPUs, but (1) the kthreads may be preempted between each
- callback, and (2) affinity or cgroups can be used to force
- the kthreads to run on whatever set of CPUs is desired.
-
- Say Y here if you want reduced OS jitter on selected CPUs.
+ For each such CPU, a kthread ("rcuox/N") will be created to
+ invoke callbacks, where the "N" is the CPU being offloaded,
+ and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and
+ "s" for RCU-sched. Nothing prevents this kthread from running
+ on the specified CPUs, but (1) the kthreads may be preempted
+ between each callback, and (2) affinity or cgroups can be used
+ to force the kthreads to run on whatever set of CPUs is desired.
+
+ Say Y here if you want to help to debug reduced OS jitter.
Say N here if you are unsure.
+choice
+ prompt "Build-forced no-CBs CPUs"
+ default RCU_NOCB_CPU_NONE
+ help
+ This option allows no-CBs CPUs to be specified at build time.
+ Additional no-CBs CPUs may be specified by the rcu_nocbs=
+ boot parameter.
+
+config RCU_NOCB_CPU_NONE
+ bool "No build_forced no-CBs CPUs"
+ depends on RCU_NOCB_CPU
+ help
+ This option does not force any of the CPUs to be no-CBs CPUs.
+ Only CPUs designated by the rcu_nocbs= boot parameter will be
+ no-CBs CPUs.
+
+config RCU_NOCB_CPU_ZERO
+ bool "CPU 0 is a build_forced no-CBs CPU"
+ depends on RCU_NOCB_CPU
+ help
+ This option forces CPU 0 to be a no-CBs CPU. Additional CPUs
+ may be designated as no-CBs CPUs using the rcu_nocbs= boot
+ parameter will be no-CBs CPUs.
+
+ Select this if CPU 0 needs to be a no-CBs CPU for real-time
+ or energy-efficiency reasons.
+
+config RCU_NOCB_CPU_ALL
+ bool "All CPUs are build_forced no-CBs CPUs"
+ depends on RCU_NOCB_CPU
+ help
+ This option forces all CPUs to be no-CBs CPUs. The rcu_nocbs=
+ boot parameter will be ignored.
+
+ Select this if all CPUs need to be no-CBs CPUs for real-time
+ or energy-efficiency reasons.
+
+endchoice
+
endmenu # "RCU Subsystem"
config IKCONFIG
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 5b8ad827fd8..2d5f94c1c7f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -64,7 +64,7 @@
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
-#define RCU_STATE_INITIALIZER(sname, cr) { \
+#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \
.level = { &sname##_state.node[0] }, \
.call = cr, \
.fqs_state = RCU_GP_IDLE, \
@@ -76,13 +76,14 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
.onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
.name = #sname, \
+ .abbr = sabbr, \
}
struct rcu_state rcu_sched_state =
- RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
+ RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
-struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
static struct rcu_state *rcu_state;
@@ -223,6 +224,8 @@ static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
module_param(jiffies_till_first_fqs, ulong, 0644);
module_param(jiffies_till_next_fqs, ulong, 0644);
+static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp);
static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
static void force_quiescent_state(struct rcu_state *rsp);
static int rcu_pending(int cpu);
@@ -310,6 +313,8 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
if (rcu_gp_in_progress(rsp))
return 0; /* No, a grace period is already in progress. */
+ if (rcu_nocb_needs_gp(rsp))
+ return 1; /* Yes, a no-CBs CPU needs one. */
if (!rdp->nxttail[RCU_NEXT_TAIL])
return 0; /* No, this is a no-CBs (or offline) CPU. */
if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
@@ -1035,10 +1040,11 @@ static void init_callback_list(struct rcu_data *rdp)
{
int i;
+ if (init_nocb_callback_list(rdp))
+ return;
rdp->nxtlist = NULL;
for (i = 0; i < RCU_NEXT_SIZE; i++)
rdp->nxttail[i] = &rdp->nxtlist;
- init_nocb_callback_list(rdp);
}
/*
@@ -1071,6 +1077,120 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
}
/*
+ * Trace-event helper function for rcu_start_future_gp() and
+ * rcu_nocb_wait_gp().
+ */
+static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+ unsigned long c, char *s)
+{
+ trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
+ rnp->completed, c, rnp->level,
+ rnp->grplo, rnp->grphi, s);
+}
+
+/*
+ * Start some future grace period, as needed to handle newly arrived
+ * callbacks. The required future grace periods are recorded in each
+ * rcu_node structure's ->need_future_gp field.
+ *
+ * The caller must hold the specified rcu_node structure's ->lock.
+ */
+static unsigned long __maybe_unused
+rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
+{
+ unsigned long c;
+ int i;
+ struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+
+ /*
+ * Pick up grace-period number for new callbacks. If this
+ * grace period is already marked as needed, return to the caller.
+ */
+ c = rcu_cbs_completed(rdp->rsp, rnp);
+ trace_rcu_future_gp(rnp, rdp, c, "Startleaf");
+ if (rnp->need_future_gp[c & 0x1]) {
+ trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf");
+ return c;
+ }
+
+ /*
+ * If either this rcu_node structure or the root rcu_node structure
+ * believe that a grace period is in progress, then we must wait
+ * for the one following, which is in "c". Because our request
+ * will be noticed at the end of the current grace period, we don't
+ * need to explicitly start one.
+ */
+ if (rnp->gpnum != rnp->completed ||
+ ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
+ rnp->need_future_gp[c & 0x1]++;
+ trace_rcu_future_gp(rnp, rdp, c, "Startedleaf");
+ return c;
+ }
+
+ /*
+ * There might be no grace period in progress. If we don't already
+ * hold it, acquire the root rcu_node structure's lock in order to
+ * start one (if needed).
+ */
+ if (rnp != rnp_root)
+ raw_spin_lock(&rnp_root->lock);
+
+ /*
+ * Get a new grace-period number. If there really is no grace
+ * period in progress, it will be smaller than the one we obtained
+ * earlier. Adjust callbacks as needed. Note that even no-CBs
+ * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
+ */
+ c = rcu_cbs_completed(rdp->rsp, rnp_root);
+ for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
+ if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
+ rdp->nxtcompleted[i] = c;
+
+ /*
+ * If the needed for the required grace period is already
+ * recorded, trace and leave.
+ */
+ if (rnp_root->need_future_gp[c & 0x1]) {
+ trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot");
+ goto unlock_out;
+ }
+
+ /* Record the need for the future grace period. */
+ rnp_root->need_future_gp[c & 0x1]++;
+
+ /* If a grace period is not already in progress, start one. */
+ if (rnp_root->gpnum != rnp_root->completed) {
+ trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot");
+ } else {
+ trace_rcu_future_gp(rnp, rdp, c, "Startedroot");
+ rcu_start_gp_advanced(r