aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2010-12-23 12:57:04 +0100
committerIngo Molnar <mingo@elte.hu>2010-12-23 12:57:04 +0100
commit394f4528c523d88daabd50f883a8d6b164075555 (patch)
treeb45a5b87a1ba9be8afe71f1db1537ff19e2b05d8
parent90a8a73c06cc32b609a880d48449d7083327e11a (diff)
parent3c2dcf2aed5ea22ecf65a9a871c4963faec421b3 (diff)
Merge branch 'rcu/next' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-2.6-rcu into core/rcu
-rw-r--r--Documentation/RCU/trace.txt144
-rw-r--r--include/linux/init_task.h9
-rw-r--r--include/linux/rculist.h5
-rw-r--r--include/linux/rcupdate.h4
-rw-r--r--include/linux/rcutiny.h13
-rw-r--r--include/linux/rcutree.h2
-rw-r--r--include/linux/sched.h11
-rw-r--r--init/Kconfig55
-rw-r--r--kernel/rcutiny.c105
-rw-r--r--kernel/rcutiny_plugin.h433
-rw-r--r--kernel/rcutorture.c270
-rw-r--r--kernel/rcutree.c156
-rw-r--r--kernel/rcutree.h61
-rw-r--r--kernel/rcutree_plugin.h135
-rw-r--r--kernel/rcutree_trace.c12
-rw-r--r--kernel/sched.c69
-rw-r--r--kernel/srcu.c8
17 files changed, 1207 insertions, 285 deletions
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index a851118775d..6a8c73f55b8 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -1,18 +1,22 @@
CONFIG_RCU_TRACE debugfs Files and Formats
-The rcutree implementation of RCU provides debugfs trace output that
-summarizes counters and state. This information is useful for debugging
-RCU itself, and can sometimes also help to debug abuses of RCU.
-The following sections describe the debugfs files and formats.
+The rcutree and rcutiny implementations of RCU provide debugfs trace
+output that summarizes counters and state. This information is useful for
+debugging RCU itself, and can sometimes also help to debug abuses of RCU.
+The following sections describe the debugfs files and formats, first
+for rcutree and next for rcutiny.
-Hierarchical RCU debugfs Files and Formats
+CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats
-This implementation of RCU provides three debugfs files under the
+These implementations of RCU provides five debugfs files under the
top-level directory RCU: rcu/rcudata (which displays fields in struct
-rcu_data), rcu/rcugp (which displays grace-period counters), and
-rcu/rcuhier (which displays the struct rcu_node hierarchy).
+rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of
+rcu/rcudata), rcu/rcugp (which displays grace-period counters),
+rcu/rcuhier (which displays the struct rcu_node hierarchy), and
+rcu/rcu_pending (which displays counts of the reasons that the
+rcu_pending() function decided that there was core RCU work to do).
The output of "cat rcu/rcudata" looks as follows:
@@ -130,7 +134,8 @@ o "ci" is the number of RCU callbacks that have been invoked for
been registered in absence of CPU-hotplug activity.
o "co" is the number of RCU callbacks that have been orphaned due to
- this CPU going offline.
+ this CPU going offline. These orphaned callbacks have been moved
+ to an arbitrarily chosen online CPU.
o "ca" is the number of RCU callbacks that have been adopted due to
other CPUs going offline. Note that ci+co-ca+ql is the number of
@@ -168,12 +173,12 @@ o "gpnum" is the number of grace periods that have started. It is
The output of "cat rcu/rcuhier" looks as follows, with very long lines:
-c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0
+c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
1/1 .>. 0:127 ^0
3/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3
3/3f .>. 0:5 ^0 2/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3
rcu_bh:
-c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0
+c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
0/1 .>. 0:127 ^0
0/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3
0/3f .>. 0:5 ^0 0/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3
@@ -212,11 +217,6 @@ o "fqlh" is the number of calls to force_quiescent_state() that
exited immediately (without even being counted in nfqs above)
due to contention on ->fqslock.
-o "oqlen" is the number of callbacks on the "orphan" callback
- list. RCU callbacks are placed on this list by CPUs going
- offline, and are "adopted" either by the CPU helping the outgoing
- CPU or by the next rcu_barrier*() call, whichever comes first.
-
o Each element of the form "1/1 0:127 ^0" represents one struct
rcu_node. Each line represents one level of the hierarchy, from
root to leaves. It is best to think of the rcu_data structures
@@ -326,3 +326,115 @@ o "nn" is the number of times that this CPU needed nothing. Alert
readers will note that the rcu "nn" number for a given CPU very
closely matches the rcu_bh "np" number for that same CPU. This
is due to short-circuit evaluation in rcu_pending().
+
+
+CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats
+
+These implementations of RCU provides a single debugfs file under the
+top-level directory RCU, namely rcu/rcudata, which displays fields in
+rcu_bh_ctrlblk, rcu_sched_ctrlblk and, for CONFIG_TINY_PREEMPT_RCU,
+rcu_preempt_ctrlblk.
+
+The output of "cat rcu/rcudata" is as follows:
+
+rcu_preempt: qlen=24 gp=1097669 g197/p197/c197 tasks=...
+ ttb=. btg=no ntb=184 neb=0 nnb=183 j=01f7 bt=0274
+ normal balk: nt=1097669 gt=0 bt=371 b=0 ny=25073378 nos=0
+ exp balk: bt=0 nos=0
+rcu_sched: qlen: 0
+rcu_bh: qlen: 0
+
+This is split into rcu_preempt, rcu_sched, and rcu_bh sections, with the
+rcu_preempt section appearing only in CONFIG_TINY_PREEMPT_RCU builds.
+The last three lines of the rcu_preempt section appear only in
+CONFIG_RCU_BOOST kernel builds. The fields are as follows:
+
+o "qlen" is the number of RCU callbacks currently waiting either
+ for an RCU grace period or waiting to be invoked. This is the
+ only field present for rcu_sched and rcu_bh, due to the
+ short-circuiting of grace period in those two cases.
+
+o "gp" is the number of grace periods that have completed.
+
+o "g197/p197/c197" displays the grace-period state, with the
+ "g" number being the number of grace periods that have started
+ (mod 256), the "p" number being the number of grace periods
+ that the CPU has responded to (also mod 256), and the "c"
+ number being the number of grace periods that have completed
+ (once again mode 256).
+
+ Why have both "gp" and "g"? Because the data flowing into
+ "gp" is only present in a CONFIG_RCU_TRACE kernel.
+
+o "tasks" is a set of bits. The first bit is "T" if there are
+ currently tasks that have recently blocked within an RCU
+ read-side critical section, the second bit is "N" if any of the
+ aforementioned tasks are blocking the current RCU grace period,
+ and the third bit is "E" if any of the aforementioned tasks are
+ blocking the current expedited grace period. Each bit is "."
+ if the corresponding condition does not hold.
+
+o "ttb" is a single bit. It is "B" if any of the blocked tasks
+ need to be priority boosted and "." otherwise.
+
+o "btg" indicates whether boosting has been carried out during
+ the current grace period, with "exp" indicating that boosting
+ is in progress for an expedited grace period, "no" indicating
+ that boosting has not yet started for a normal grace period,
+ "begun" indicating that boosting has bebug for a normal grace
+ period, and "done" indicating that boosting has completed for
+ a normal grace period.
+
+o "ntb" is the total number of tasks subjected to RCU priority boosting
+ periods since boot.
+
+o "neb" is the number of expedited grace periods that have had
+ to resort to RCU priority boosting since boot.
+
+o "nnb" is the number of normal grace periods that have had
+ to resort to RCU priority boosting since boot.
+
+o "j" is the low-order 12 bits of the jiffies counter in hexadecimal.
+
+o "bt" is the low-order 12 bits of the value that the jiffies counter
+ will have at the next time that boosting is scheduled to begin.
+
+o In the line beginning with "normal balk", the fields are as follows:
+
+ o "nt" is the number of times that the system balked from
+ boosting because there were no blocked tasks to boost.
+ Note that the system will balk from boosting even if the
+ grace period is overdue when the currently running task
+ is looping within an RCU read-side critical section.
+ There is no point in boosting in this case, because
+ boosting a running task won't make it run any faster.
+
+ o "gt" is the number of times that the system balked
+ from boosting because, although there were blocked tasks,
+ none of them were preventing the current grace period
+ from completing.
+
+ o "bt" is the number of times that the system balked
+ from boosting because boosting was already in progress.
+
+ o "b" is the number of times that the system balked from
+ boosting because boosting had already completed for
+ the grace period in question.
+
+ o "ny" is the number of times that the system balked from
+ boosting because it was not yet time to start boosting
+ the grace period in question.
+
+ o "nos" is the number of times that the system balked from
+ boosting for inexplicable ("not otherwise specified")
+ reasons. This can actually happen due to races involving
+ increments of the jiffies counter.
+
+o In the line beginning with "exp balk", the fields are as follows:
+
+ o "bt" is the number of times that the system balked from
+ boosting because there were no blocked tasks to boost.
+
+ o "nos" is the number of times that the system balked from
+ boosting for inexplicable ("not otherwise specified")
+ reasons.
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1f8c06ce0fa..6b281fae114 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -83,6 +83,12 @@ extern struct group_info init_groups;
*/
# define CAP_INIT_BSET CAP_FULL_SET
+#ifdef CONFIG_RCU_BOOST
+#define INIT_TASK_RCU_BOOST() \
+ .rcu_boost_mutex = NULL,
+#else
+#define INIT_TASK_RCU_BOOST()
+#endif
#ifdef CONFIG_TREE_PREEMPT_RCU
#define INIT_TASK_RCU_TREE_PREEMPT() \
.rcu_blocked_node = NULL,
@@ -94,7 +100,8 @@ extern struct group_info init_groups;
.rcu_read_lock_nesting = 0, \
.rcu_read_unlock_special = 0, \
.rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \
- INIT_TASK_RCU_TREE_PREEMPT()
+ INIT_TASK_RCU_TREE_PREEMPT() \
+ INIT_TASK_RCU_BOOST()
#else
#define INIT_TASK_RCU_PREEMPT(tsk)
#endif
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index f31ef61f1c6..2dea94fc440 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -241,11 +241,6 @@ static inline void list_splice_init_rcu(struct list_head *list,
#define list_first_entry_rcu(ptr, type, member) \
list_entry_rcu((ptr)->next, type, member)
-#define __list_for_each_rcu(pos, head) \
- for (pos = rcu_dereference_raw(list_next_rcu(head)); \
- pos != (head); \
- pos = rcu_dereference_raw(list_next_rcu((pos)))
-
/**
* list_for_each_entry_rcu - iterate over rcu list of given type
* @pos: the type * to use as a loop cursor.
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 03cda7bed98..af561485628 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -47,6 +47,8 @@
extern int rcutorture_runnable; /* for sysctl */
#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
+#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
+#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
@@ -66,7 +68,6 @@ extern void call_rcu_sched(struct rcu_head *head,
extern void synchronize_sched(void);
extern void rcu_barrier_bh(void);
extern void rcu_barrier_sched(void);
-extern void synchronize_sched_expedited(void);
extern int sched_expedited_torture_stats(char *page);
static inline void __rcu_read_lock_bh(void)
@@ -118,7 +119,6 @@ static inline int rcu_preempt_depth(void)
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/* Internal to kernel */
-extern void rcu_init(void);
extern void rcu_sched_qs(int cpu);
extern void rcu_bh_qs(int cpu);
extern void rcu_check_callbacks(int cpu, int user);
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 13877cb93a6..30ebd7c8d87 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -27,7 +27,9 @@
#include <linux/cache.h>
-#define rcu_init_sched() do { } while (0)
+static inline void rcu_init(void)
+{
+}
#ifdef CONFIG_TINY_RCU
@@ -58,6 +60,11 @@ static inline void synchronize_rcu_bh_expedited(void)
synchronize_sched();
}
+static inline void synchronize_sched_expedited(void)
+{
+ synchronize_sched();
+}
+
#ifdef CONFIG_TINY_RCU
static inline void rcu_preempt_note_context_switch(void)
@@ -125,16 +132,12 @@ static inline void rcu_cpu_stall_reset(void)
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
extern int rcu_scheduler_active __read_mostly;
extern void rcu_scheduler_starting(void);
-
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
static inline void rcu_scheduler_starting(void)
{
}
-
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
#endif /* __LINUX_RCUTINY_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 95518e62879..3a933482734 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,6 +30,7 @@
#ifndef __LINUX_RCUTREE_H
#define __LINUX_RCUTREE_H
+extern void rcu_init(void);
extern void rcu_note_context_switch(int cpu);
extern int rcu_needs_cpu(int cpu);
extern void rcu_cpu_stall_reset(void);
@@ -47,6 +48,7 @@ static inline void exit_rcu(void)
#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
extern void synchronize_rcu_bh(void);
+extern void synchronize_sched_expedited(void);
extern void synchronize_rcu_expedited(void);
static inline void synchronize_rcu_bh_expedited(void)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 223874538b3..d8005503cc6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1229,6 +1229,9 @@ struct task_struct {
#ifdef CONFIG_TREE_PREEMPT_RCU
struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+ struct rt_mutex *rcu_boost_mutex;
+#endif /* #ifdef CONFIG_RCU_BOOST */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
@@ -1759,7 +1762,8 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
#ifdef CONFIG_PREEMPT_RCU
#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
-#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
+#define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */
+#define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */
static inline void rcu_copy_process(struct task_struct *p)
{
@@ -1767,7 +1771,10 @@ static inline void rcu_copy_process(struct task_struct *p)
p->rcu_read_unlock_special = 0;
#ifdef CONFIG_TREE_PREEMPT_RCU
p->rcu_blocked_node = NULL;
-#endif
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+ p->rcu_boost_mutex = NULL;
+#endif /* #ifdef CONFIG_RCU_BOOST */
INIT_LIST_HEAD(&p->rcu_node_entry);
}
diff --git a/init/Kconfig b/init/Kconfig
index c9728992a77..526ec1c7456 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -393,7 +393,6 @@ config PREEMPT_RCU
config RCU_TRACE
bool "Enable tracing for RCU"
- depends on TREE_RCU || TREE_PREEMPT_RCU
help
This option provides tracing in RCU which presents stats
in debugfs for debugging RCU implementation.
@@ -459,6 +458,60 @@ config TREE_RCU_TRACE
TREE_PREEMPT_RCU implementations, permitting Makefile to
trivially select kernel/rcutree_trace.c.
+config RCU_BOOST
+ bool "Enable RCU priority boosting"
+ depends on RT_MUTEXES && TINY_PREEMPT_RCU
+ default n
+ help
+ This option boosts the priority of preempted RCU readers that
+ block the current preemptible RCU grace period for too long.
+ This option also prevents heavy loads from blocking RCU
+ callback invocation for all flavors of RCU.
+
+ Say Y here if you are working with real-time apps or heavy loads
+ Say N here if you are unsure.
+
+config RCU_BOOST_PRIO
+ int "Real-time priority to boost RCU readers to"
+ range 1 99
+ depends on RCU_BOOST
+ default 1
+ help
+ This option specifies the real-time priority to which preempted
+ RCU readers are to be boosted. If you are working with CPU-bound
+ real-time applications, you should specify a priority higher then
+ the highest-priority CPU-bound application.
+
+ Specify the real-time priority, or take the default if unsure.
+
+config RCU_BOOST_DELAY
+ int "Milliseconds to delay boosting after RCU grace-period start"
+ range 0 3000
+ depends on RCU_BOOST
+ default 500
+ help
+ This option specifies the time to wait after the beginning of
+ a given grace period before priority-boosting preempted RCU
+ readers blocking that grace period. Note that any RCU reader
+ blocking an expedited RCU grace period is boosted immediately.
+
+ Accept the default if unsure.
+
+config SRCU_SYNCHRONIZE_DELAY
+ int "Microseconds to delay before waiting for readers"
+ range 0 20
+ default 10
+ help
+ This option controls how long SRCU delays before entering its
+ loop waiting on SRCU readers. The purpose of this loop is
+ to avoid the unconditional context-switch penalty that would
+ otherwise be incurred if there was an active SRCU reader,
+ in a manner similar to adaptive locking schemes. This should
+ be set to be a bit longer than the common-case SRCU read-side
+ critical-section overhead.
+
+ Accept the default if unsure.
+
endmenu # "RCU Subsystem"
config IKCONFIG
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index d806735342a..03449372474 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -36,31 +36,16 @@
#include <linux/time.h>
#include <linux/cpu.h>
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
- struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
- struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
- struct rcu_head **curtail; /* ->next pointer of last CB. */
-};
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_sched_ctrlblk = {
- .donetail = &rcu_sched_ctrlblk.rcucblist,
- .curtail = &rcu_sched_ctrlblk.rcucblist,
-};
-
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
- .donetail = &rcu_bh_ctrlblk.rcucblist,
- .curtail = &rcu_bh_ctrlblk.rcucblist,
-};
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
+static struct task_struct *rcu_kthread_task;
+static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
+static unsigned long have_rcu_kthread_work;
+static void invoke_rcu_kthread(void);
/* Forward declarations for rcutiny_plugin.h. */
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+struct rcu_ctrlblk;
+static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+static int rcu_kthread(void *arg);
static void __call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu),
struct rcu_ctrlblk *rcp);
@@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu)
{
if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
rcu_qsctr_help(&rcu_bh_ctrlblk))
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_kthread();
}
/*
@@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu)
void rcu_bh_qs(int cpu)
{
if (rcu_qsctr_help(&rcu_bh_ctrlblk))
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_kthread();
}
/*
@@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user)
}
/*
- * Helper function for rcu_process_callbacks() that operates on the
- * specified rcu_ctrlkblk structure.
+ * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure
+ * whose grace period has elapsed.
*/
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
+static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
{
struct rcu_head *next, *list;
unsigned long flags;
+ RCU_TRACE(int cb_count = 0);
/* If no RCU callbacks ready to invoke, just return. */
if (&rcp->rcucblist == rcp->donetail)
@@ -180,19 +166,58 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
+ local_bh_disable();
list->func(list);
+ local_bh_enable();
list = next;
+ RCU_TRACE(cb_count++);
}
+ RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
}
/*
- * Invoke any callbacks whose grace period has completed.
+ * This kthread invokes RCU callbacks whose grace periods have
+ * elapsed. It is awakened as needed, and takes the place of the
+ * RCU_SOFTIRQ that was used previously for this purpose.
+ * This is a kthread, but it is never stopped, at least not until
+ * the system goes down.
*/
-static void rcu_process_callbacks(struct softirq_action *unused)
+static int rcu_kthread(void *arg)
{
- __rcu_process_callbacks(&rcu_sched_ctrlblk);
- __rcu_process_callbacks(&rcu_bh_ctrlblk);
- rcu_preempt_process_callbacks();
+ unsigned long work;
+ unsigned long morework;
+ unsigned long flags;
+
+ for (;;) {
+ wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
+ morework = rcu_boost();
+ local_irq_save(flags);
+ work = have_rcu_kthread_work;
+ have_rcu_kthread_work = morework;
+ local_irq_restore(flags);
+ if (work) {
+ rcu_process_callbacks(&rcu_sched_ctrlblk);
+ rcu_process_callbacks(&rcu_bh_ctrlblk);
+ rcu_preempt_process_callbacks();
+ }
+ schedule_timeout_interruptible(1); /* Leave CPU for others. */
+ }
+
+ return 0; /* Not reached, but needed to shut gcc up. */
+}
+
+/*
+ * Wake up rcu_kthread() to process callbacks now eligible for invocation
+ * or to boost readers.
+ */
+static void invoke_rcu_kthread(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ have_rcu_kthread_work = 1;
+ wake_up(&rcu_kthread_wq);
+ local_irq_restore(flags);
}
/*
@@ -230,6 +255,7 @@ static void __call_rcu(struct rcu_head *head,
local_irq_save(flags);
*rcp->curtail = head;
rcp->curtail = &head->next;
+ RCU_TRACE(rcp->qlen++);
local_irq_restore(flags);
}
@@ -282,7 +308,16 @@ void rcu_barrier_sched(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_sched);
-void __init rcu_init(void)
+/*
+ * Spawn the kthread that invokes RCU callbacks.
+ */
+static int __init rcu_spawn_kthreads(void)
{
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+ struct sched_param sp;
+
+ rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
+ sp.sched_priority = RCU_BOOST_PRIO;
+ sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
+ return 0;
}
+early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 6ceca4f745f..015abaea962 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -22,6 +22,40 @@
* Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
*/
+#include <linux/kthread.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#ifdef CONFIG_RCU_TRACE
+#define RCU_TRACE(stmt) stmt
+#else /* #ifdef CONFIG_RCU_TRACE */
+#define RCU_TRACE(stmt)
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+ struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
+ struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
+ struct rcu_head **curtail; /* ->next pointer of last CB. */
+ RCU_TRACE(long qlen); /* Number of pending CBs. */
+};
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_sched_ctrlblk = {
+ .donetail = &rcu_sched_ctrlblk.rcucblist,
+ .curtail = &rcu_sched_ctrlblk.rcucblist,
+};
+
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+ .donetail = &rcu_bh_ctrlblk.rcucblist,
+ .curtail = &rcu_bh_ctrlblk.rcucblist,
+};
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+int rcu_scheduler_active __read_mostly;
+EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
#ifdef CONFIG_TINY_PREEMPT_RCU
#include <linux/delay.h>
@@ -46,17 +80,45 @@ struct rcu_preempt_ctrlblk {
struct list_head *gp_tasks;
/* Pointer to the first task blocking the */
/* current grace period, or NULL if there */
- /* is not such task. */
+ /* is no such task. */
struct list_head *exp_tasks;
/* Pointer to first task blocking the */
/* current expedited grace period, or NULL */
/* if there is no such task. If there */
/* is no current expedited grace period, */
/* then there cannot be any such task. */
+#ifdef CONFIG_RCU_BOOST
+ struct list_head *boost_tasks;
+ /* Pointer to first task that needs to be */
+ /* priority-boosted, or NULL if no priority */
+ /* boosting is needed. If there is no */
+ /* current or expedited grace period, there */
+ /* can be no such task. */
+#endif /* #ifdef CONFIG_RCU_BOOST */
u8 gpnum; /* Current grace period. */
u8 gpcpu; /* Last grace period blocked by the CPU. */
u8 completed; /* Last grace period completed. */
/* If all three are equal, RCU is idle. */
+#ifdef CONFIG_RCU_BOOST
+ s8 boosted_this_gp; /* Has boosting already happened? */
+ unsigned long boost_time; /* When to start boosting (jiffies) */
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#ifdef CONFIG_RCU_TRACE
+ unsigned long n_grace_periods;
+#ifdef CONFIG_RCU_BOOST
+ unsigned long n_tasks_boosted;
+ unsigned long n_exp_boosts;
+ unsigned long n_normal_boosts;
+ unsigned long n_normal_balk_blkd_tasks;
+ unsigned long n_normal_balk_gp_tasks;
+ unsigned long n_normal_balk_boost_tasks;
+ unsigned long n_normal_balk_boosted;
+ unsigned long n_normal_balk_notyet;
+ unsigned long n_normal_balk_nos;
+ unsigned long n_exp_balk_blkd_tasks;
+ unsigned long n_exp_balk_nos;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#endif /* #ifdef CONFIG_RCU_TRACE */
};
static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
@@ -122,6 +184,210 @@ static int rcu_preempt_gp_in_progress(void)
}
/*
+ * Advance a ->blkd_tasks-list pointer to the next entry, instead
+ * returning NULL if at the end of the list.
+ */
+static struct list_head *rcu_next_node_entry(struct task_struct *t)
+{
+ struct list_head *np;
+
+ np = t->rcu_node_entry.next;
+ if (np == &rcu_preempt_ctrlblk.blkd_tasks)
+ np = NULL;
+ return np;
+}
+
+#ifdef CONFIG_RCU_TRACE
+
+#ifdef CONFIG_RCU_BOOST
+static void rcu_initiate_boost_trace(void);
+static void rcu_initiate_exp_boost_trace(void);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Dump additional statistice for TINY_PREEMPT_RCU.
+ */
+static void show_tiny_preempt_stats(struct seq_file *m)
+{
+ seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
+ rcu_preempt_ctrlblk.rcb.qlen,
+ rcu_preempt_ctrlblk.n_grace_periods,
+ rcu_preempt_ctrlblk.gpnum,
+ rcu_preempt_ctrlblk.gpcpu,
+ rcu_preempt_ctrlblk.completed,
+ "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
+ "N."[!rcu_preempt_ctrlblk.gp_tasks],
+ "E."[!rcu_preempt_ctrlblk.exp_tasks]);
+#ifdef CONFIG_RCU_BOOST
+ seq_printf(m, " ttb=%c btg=",
+ "B."[!rcu_preempt_ctrlblk.boost_tasks]);
+ switch (rcu_preempt_ctrlblk.boosted_this_gp) {
+ case -1:
+ seq_puts(m, "exp");
+ break;
+ case 0:
+ seq_puts(m, "no");
+ break;
+ case 1:
+ seq_puts(m, "begun");
+ break;
+ case 2:
+ seq_puts(m, "done");
+ break;
+ default:
+ seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
+ }
+ seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
+ rcu_preempt_ctrlblk.n_tasks_boosted,
+ rcu_preempt_ctrlblk.n_exp_boosts,
+ rcu_preempt_ctrlblk.n_normal_boosts,
+ (int)(jiffies & 0xffff),
+ (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
+ seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
+ "normal balk",
+ rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
+ rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
+ rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
+ rcu_preempt_ctrlblk.n_normal_balk_boosted,
+ rcu_preempt_ctrlblk.n_normal_balk_notyet,
+ rcu_preempt_ctrlblk.n_normal_balk_nos);
+ seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
+ rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
+ rcu_preempt_ctrlblk.n_exp_balk_nos);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+#ifdef CONFIG_RCU_BOOST
+
+#include "rtmutex_common.h"
+
+/*
+ * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
+ * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
+ */
+static int rcu_boost(void)
+{
+ unsigned long flags;
+ struct rt_mutex mtx;
+ struct list_head *np;
+ struct task_struct *t;
+
+ if (rcu_preempt_ctrlblk.boost_tasks == NULL)
+ return 0; /* Nothing to boost. */
+ raw_local_irq_save(flags);
+ rcu_preempt_ctrlblk.boosted_this_gp++;
+ t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
+ rcu_node_entry);
+ np = rcu_next_node_entry(t);
+ rt_mutex_init_proxy_locked(&mtx, t);
+ t->rcu_boost_mutex = &mtx;
+ t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
+ raw_local_irq_restore(flags);
+ rt_mutex_lock(&mtx);
+ RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
+ rcu_preempt_ctrlblk.boosted_this_gp++;
+ rt_mutex_unlock(&mtx);
+ return rcu_preempt_ctrlblk.boost_tasks != NULL;
+}
+
+/*
+ * Check to see if it is now time to start boosting RCU readers blocking
+ * the current grace period, and, if so, tell the rcu_kthread_task to
+ * start boosting them. If there is an expedited boost in progress,
+ * we wait for it to complete.
+ *
+ * If there are no blocked readers blocking the current grace period,
+ * return 0 to let the caller know, otherwise return 1. Note that this
+ * return value is independent of whether or not boosting was done.
+ */
+static int rcu_initiate_boost(void)
+{
+ if (!rcu_preempt_blocked_readers_cgp()) {
+ RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
+ return 0;
+ }
+ if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
+ rcu_preempt_ctrlblk.boost_tasks == NULL &&
+ rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
+ ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
+ rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
+ invoke_rcu_kthread();
+ RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
+ } else
+ RCU_TRACE(rcu_initiate_boost_trace());
+ return 1;
+}
+
+/*
+ * Initiate boosting for an expedited grace period.
+ */
+static void rcu_initiate_expedited_boost(void)
+{
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
+ rcu_preempt_ctrlblk.boost_tasks =
+ rcu_preempt_ctrlblk.blkd_tasks.next;
+ rcu_preempt_ctrlblk.boosted_this_gp = -1;
+ invoke_rcu_kthread();
+ RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
+ } else
+ RCU_TRACE(rcu_initiate_exp_boost_trace());
+ raw_local_irq_restore(flags);
+}
+
+#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);
+
+/*
+ * Do priority-boost accounting for the start of a new grace period.
+ */
+static void rcu_preempt_boost_start_gp(void)
+{
+ rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
+ if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
+ rcu_preempt_ctrlblk.boosted_this_gp = 0;
+}
+
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+/*