Merge branch 'rcu/next' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu into core/rcu

Pull the v3.5 RCU tree from Paul E. McKenney: 1) A set of improvements and fixes to the RCU_FAST_NO_HZ feature (with more on the way for 3.6). Posted to LKML: https://lkml.org/lkml/2012/4/23/324 (commits 1-3 and 5), https://lkml.org/lkml/2012/4/16/611 (commit 4), https://lkml.org/lkml/2012/4/30/390 (commit 6), and https://lkml.org/lkml/2012/5/4/410 (commit 7, combined with the other commits for the convenience of the tester). 2) Changes to make rcu_barrier() avoid disrupting execution of CPUs that have no RCU callbacks. Posted to LKML: https://lkml.org/lkml/2012/4/23/322. 3) A couple of commits that improve the efficiency of the interaction between preemptible RCU and the scheduler, these two being all that survived an abortive attempt to allow preemptible RCU's __rcu_read_lock() to be inlined. The full set was posted to LKML at https://lkml.org/lkml/2012/4/14/143, and the first and third patches of that set remain. 4) Lai Jiangshan's algorithmic implementation of SRCU, which includes call_srcu() and srcu_barrier(). A major feature of this new implementation is that synchronize_srcu() no longer disturbs the execution of other CPUs. This work is based on earlier implementations by Peter Zijlstra and Paul E. McKenney. Posted to LKML: https://lkml.org/lkml/2012/2/22/82. 5) A number of miscellaneous bug fixes and improvements which were posted to LKML at: https://lkml.org/lkml/2012/4/23/353 with subsequent updates posted to LKML. Signed-off-by: Ingo Molnar <mingo@kernel.org>
author: Ingo Molnar <mingo@kernel.org> 2012-05-14 08:41:20 +0200
committer: Ingo Molnar <mingo@kernel.org> 2012-05-14 08:41:46 +0200
commit: 2d84e023cb5ec00403ff5d447533c6fd58fcc7ff (patch)
tree: cb10d9a568ebb4be8593821a6f205efedf2f4ddd /kernel
parent: 9ff00d58a915b6747ba2e843ab2d04c712b4dc32 (diff)
parent: dc36be4419311fd57becdf54bfeef6bd04a6741d (diff)
10 files changed, 1073 insertions, 298 deletions
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a86f1741cc2..95cba41ce1e 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -51,6 +51,34 @@
 
 #include "rcu.h"
 
+#ifdef CONFIG_PREEMPT_RCU
+
+/*
+ * Check for a task exiting while in a preemptible-RCU read-side
+ * critical section, clean up if so.  No need to issue warnings,
+ * as debug_check_no_locks_held() already does this if lockdep
+ * is enabled.
+ */
+void exit_rcu(void)
+{
+	struct task_struct *t = current;
+
+	if (likely(list_empty(&current->rcu_node_entry)))
+		return;
+	t->rcu_read_lock_nesting = 1;
+	barrier();
+	t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
+	__rcu_read_unlock();
+}
+
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+void exit_rcu(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key rcu_lock_key;
 struct lockdep_map rcu_lock_map =
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 22ecea0dfb6..fc31a2d6510 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void)
 	return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
 }
 
-/*
- * Check for a task exiting while in a preemptible -RCU read-side
- * critical section, clean up if so.  No need to issue warnings,
- * as debug_check_no_locks_held() already does this if lockdep
- * is enabled.
- */
-void exit_rcu(void)
-{
-	struct task_struct *t = current;
-
-	if (t->rcu_read_lock_nesting == 0)
-		return;
-	t->rcu_read_lock_nesting = 1;
-	__rcu_read_unlock();
-}
-
 #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
 
 #ifdef CONFIG_RCU_TRACE
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index a89b381a8c6..e66b34ab755 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -64,6 +64,7 @@ static int irqreader = 1;	/* RCU readers from irq (timers). */
 static int fqs_duration;	/* Duration of bursts (us), 0 to disable. */
 static int fqs_holdoff;		/* Hold time within burst (us). */
 static int fqs_stutter = 3;	/* Wait time between bursts (s). */
+static int n_barrier_cbs;	/* Number of callbacks to test RCU barriers. */
 static int onoff_interval;	/* Wait time between CPU hotplugs, 0=disable. */
 static int onoff_holdoff;	/* Seconds after boot before CPU hotplugs. */
 static int shutdown_secs;	/* Shutdown time (s).  <=0 for no shutdown. */
@@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444);
 MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
 module_param(fqs_stutter, int, 0444);
 MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
+module_param(n_barrier_cbs, int, 0444);
+MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
 module_param(onoff_interval, int, 0444);
 MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
 module_param(onoff_holdoff, int, 0444);
@@ -139,6 +142,8 @@ static struct task_struct *shutdown_task;
 static struct task_struct *onoff_task;
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static struct task_struct *stall_task;
+static struct task_struct **barrier_cbs_tasks;
+static struct task_struct *barrier_task;
 
 #define RCU_TORTURE_PIPE_LEN 10
 
@@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail;
 static atomic_t n_rcu_torture_free;
 static atomic_t n_rcu_torture_mberror;
 static atomic_t n_rcu_torture_error;
+static long n_rcu_torture_barrier_error;
 static long n_rcu_torture_boost_ktrerror;
 static long n_rcu_torture_boost_rterror;
 static long n_rcu_torture_boost_failure;
@@ -173,6 +179,8 @@ static long n_offline_attempts;
 static long n_offline_successes;
 static long n_online_attempts;
 static long n_online_successes;
+static long n_barrier_attempts;
+static long n_barrier_successes;
 static struct list_head rcu_torture_removed;
 static cpumask_var_t shuffle_tmp_mask;
 
@@ -197,6 +205,10 @@ static unsigned long shutdown_time;	/* jiffies to system shutdown. */
 static unsigned long boost_starttime;	/* jiffies of next boost test start. */
 DEFINE_MUTEX(boost_mutex);		/* protect setting boost_starttime */
 					/*  and boost task create/destroy. */
+static atomic_t barrier_cbs_count;	/* Barrier callbacks registered. */
+static atomic_t barrier_cbs_invoked;	/* Barrier callbacks invoked. */
+static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
+static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
 
 /* Mediate rmmod and system shutdown.  Concurrent rmmod & shutdown illegal! */
 
@@ -327,6 +339,7 @@ struct rcu_torture_ops {
 	int (*completed)(void);
 	void (*deferred_free)(struct rcu_torture *p);
 	void (*sync)(void);
+	void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
 	void (*cb_barrier)(void);
 	void (*fqs)(void);
 	int (*stats)(char *page);
@@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = {
 	.completed	= rcu_torture_completed,
 	.deferred_free	= rcu_torture_deferred_free,
 	.sync		= synchronize_rcu,
+	.call		= call_rcu,
 	.cb_barrier	= rcu_barrier,
 	.fqs		= rcu_force_quiescent_state,
 	.stats		= NULL,
@@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
 	.completed	= rcu_torture_completed,
 	.deferred_free	= rcu_sync_torture_deferred_free,
 	.sync		= synchronize_rcu,
+	.call		= NULL,
 	.cb_barrier	= NULL,
 	.fqs		= rcu_force_quiescent_state,
 	.stats		= NULL,
@@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
 	.completed	= rcu_no_completed,
 	.deferred_free	= rcu_sync_torture_deferred_free,
 	.sync		= synchronize_rcu_expedited,
+	.call		= NULL,
 	.cb_barrier	= NULL,
 	.fqs		= rcu_force_quiescent_state,
 	.stats		= NULL,
@@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
 	.completed	= rcu_bh_torture_completed,
 	.deferred_free	= rcu_bh_torture_deferred_free,
 	.sync		= synchronize_rcu_bh,
+	.call		= call_rcu_bh,
 	.cb_barrier	= rcu_barrier_bh,
 	.fqs		= rcu_bh_force_quiescent_state,
 	.stats		= NULL,
@@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
 	.completed	= rcu_bh_torture_completed,
 	.deferred_free	= rcu_sync_torture_deferred_free,
 	.sync		= synchronize_rcu_bh,
+	.call		= NULL,
 	.cb_barrier	= NULL,
 	.fqs		= rcu_bh_force_quiescent_state,
 	.stats		= NULL,
@@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {
 	.completed	= rcu_bh_torture_completed,
 	.deferred_free	= rcu_sync_torture_deferred_free,
 	.sync		= synchronize_rcu_bh_expedited,
+	.call		= NULL,
 	.cb_barrier	= NULL,
 	.fqs		= rcu_bh_force_quiescent_state,
 	.stats		= NULL,
@@ -606,6 +625,11 @@ static int srcu_torture_completed(void)
 	return srcu_batches_completed(&srcu_ctl);
 }
 
+static void srcu_torture_deferred_free(struct rcu_torture *rp)
+{
+	call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
+}
+
 static void srcu_torture_synchronize(void)
 {
 	synchronize_srcu(&srcu_ctl);
@@ -620,7 +644,7 @@ static int srcu_torture_stats(char *page)
 	cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
 		       torture_type, TORTURE_FLAG, idx);
 	for_each_possible_cpu(cpu) {
-		cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu,
+		cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu,
 			       per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
 			       per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
 	}
@@ -635,13 +659,29 @@ static struct rcu_torture_ops srcu_ops = {
 	.read_delay	= srcu_read_delay,
 	.readunlock	= srcu_torture_read_unlock,
 	.completed	= srcu_torture_completed,
-	.deferred_free	= rcu_sync_torture_deferred_free,
+	.deferred_free	= srcu_torture_deferred_free,
 	.sync		= srcu_torture_synchronize,
+	.call		= NULL,
 	.cb_barrier	= NULL,
 	.stats		= srcu_torture_stats,
 	.name		= "srcu"
 };
 
+static struct rcu_torture_ops srcu_sync_ops = {
+	.init		= srcu_torture_init,
+	.cleanup	= srcu_torture_cleanup,
+	.readlock	= srcu_torture_read_lock,
+	.read_delay	= srcu_read_delay,
+	.readunlock	= srcu_torture_read_unlock,
+	.completed	= srcu_torture_completed,
+	.deferred_free	= rcu_sync_torture_deferred_free,
+	.sync		= srcu_torture_synchronize,
+	.call		= NULL,
+	.cb_barrier	= NULL,
+	.stats		= srcu_torture_stats,
+	.name		= "srcu_sync"
+};
+
 static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
 {
 	return srcu_read_lock_raw(&srcu_ctl);
@@ -659,13 +699,29 @@ static struct rcu_torture_ops srcu_raw_ops = {
 	.read_delay	= srcu_read_delay,
 	.readunlock	= srcu_torture_read_unlock_raw,
 	.completed	= srcu_torture_completed,
-	.deferred_free	= rcu_sync_torture_deferred_free,
+	.deferred_free	= srcu_torture_deferred_free,
 	.sync		= srcu_torture_synchronize,
+	.call		= NULL,
 	.cb_barrier	= NULL,
 	.stats		= srcu_torture_stats,
 	.name		= "srcu_raw"
 };
 
+static struct rcu_torture_ops srcu_raw_sync_ops = {
+	.init		= srcu_torture_init,
+	.cleanup	= srcu_torture_cleanup,
+	.readlock	= srcu_torture_read_lock_raw,
+	.read_delay	= srcu_read_delay,
+	.readunlock	= srcu_torture_read_unlock_raw,
+	.completed	= srcu_torture_completed,
+	.deferred_free	= rcu_sync_torture_deferred_free,
+	.sync		= srcu_torture_synchronize,
+	.call		= NULL,
+	.cb_barrier	= NULL,
+	.stats		= srcu_torture_stats,
+	.name		= "srcu_raw_sync"
+};
+
 static void srcu_torture_synchronize_expedited(void)
 {
 	synchronize_srcu_expedited(&srcu_ctl);
@@ -680,6 +736,7 @@ static struct rcu_torture_ops srcu_expedited_ops = {
 	.completed	= srcu_torture_completed,
 	.deferred_free	= rcu_sync_torture_deferred_free,
 	.sync		= srcu_torture_synchronize_expedited,
+	.call		= NULL,
 	.cb_barrier	= NULL,
 	.stats		= srcu_torture_stats,
 	.name		= "srcu_expedited"
@@ -1129,7 +1186,8 @@ rcu_torture_printk(char *page)
 		       "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
 		       "rtmbe: %d rtbke: %ld rtbre: %ld "
 		       "rtbf: %ld rtb: %ld nt: %ld "
-		       "onoff: %ld/%ld:%ld/%ld",
+		       "onoff: %ld/%ld:%ld/%ld "
+		       "barrier: %ld/%ld:%ld",
 		       rcu_torture_current,
 		       rcu_torture_current_version,
 		       list_empty(&rcu_torture_freelist),
@@ -1145,14 +1203,17 @@ rcu_torture_printk(char *page)
 		       n_online_successes,
 		       n_online_attempts,
 		       n_offline_successes,
-		       n_offline_attempts);
+		       n_offline_attempts,
+		       n_barrier_successes,
+		       n_barrier_attempts,
+		       n_rcu_torture_barrier_error);
+	cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
 	if (atomic_read(&n_rcu_torture_mberror) != 0 ||
+	    n_rcu_torture_barrier_error != 0 ||
 	    n_rcu_torture_boost_ktrerror != 0 ||
 	    n_rcu_torture_boost_rterror != 0 ||
-	    n_rcu_torture_boost_failure != 0)
-		cnt += sprintf(&page[cnt], " !!!");
-	cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
-	if (i > 1) {
+	    n_rcu_torture_boost_failure != 0 ||
+	    i > 1) {
 		cnt += sprintf(&page[cnt], "!!! ");
 		atomic_inc(&n_rcu_torture_error);
 		WARN_ON_ONCE(1);
@@ -1337,6 +1398,7 @@ static void rcutorture_booster_cleanup(int cpu)
 
 	/* This must be outside of the mutex, otherwise deadlock! */
 	kthread_stop(t);
+	boost_tasks[cpu] = NULL;
 }
 
 static int rcutorture_booster_init(int cpu)
@@ -1484,13 +1546,15 @@ static void rcu_torture_onoff_cleanup(void)
 		return;
 	VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
 	kthread_stop(onoff_task);
+	onoff_task = NULL;
 }
 
 #else /* #ifdef CONFIG_HOTPLUG_CPU */
 
-static void
+static int
 rcu_torture_onoff_init(void)
 {
+	return 0;
 }
 
 static void rcu_torture_onoff_cleanup(void)
@@ -1554,6 +1618,152 @@ static void rcu_torture_stall_cleanup(void)
 		return;
 	VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
 	kthread_stop(stall_task);
+	stall_task = NULL;
+}
+
+/* Callback function for RCU barrier testing. */
+void rcu_torture_barrier_cbf(struct rcu_head *rcu)
+{
+	atomic_inc(&barrier_cbs_invoked);
+}
+
+/* kthread function to register callbacks used to test RCU barriers. */
+static int rcu_torture_barrier_cbs(void *arg)
+{
+	long myid = (long)arg;
+	struct rcu_head rcu;
+
+	init_rcu_head_on_stack(&rcu);
+	VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started");
+	set_user_nice(current, 19);
+	do {
+		wait_event(barrier_cbs_wq[myid],
+			   atomic_read(&barrier_cbs_count) == n_barrier_cbs ||
+			   kthread_should_stop() ||
+			   fullstop != FULLSTOP_DONTSTOP);
+		if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
+			break;
+		cur_ops->call(&rcu, rcu_torture_barrier_cbf);
+		if (atomic_dec_and_test(&barrier_cbs_count))
+			wake_up(&barrier_wq);
+	} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+	VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping");
+	rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
+	while (!kthread_should_stop())
+		schedule_timeout_interruptible(1);
+	cur_ops->cb_barrier();
+	destroy_rcu_head_on_stack(&rcu);
+	return 0;
+}
+
+/* kthread function to drive and coordinate RCU barrier testing. */
+static int rcu_torture_barrier(void *arg)
+{
+	int i;
+
+	VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting");
+	do {
+		atomic_set(&barrier_cbs_invoked, 0);
+		atomic_set(&barrier_cbs_count, n_barrier_cbs);
+		/* wake_up() path contains the required barriers. */
+		for (i = 0; i < n_barrier_cbs; i++)
+			wake_up(&barrier_cbs_wq[i]);
+		wait_event(barrier_wq,
+			   atomic_read(&barrier_cbs_count) == 0 ||
+			   kthread_should_stop() ||
+			   fullstop != FULLSTOP_DONTSTOP);
+		if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
+			break;
+		n_barrier_attempts++;
+		cur_ops->cb_barrier();
+		if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
+			n_rcu_torture_barrier_error++;
+			WARN_ON_ONCE(1);
+		}
+		n_barrier_successes++;
+		schedule_timeout_interruptible(HZ / 10);
+	} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+	VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
+	rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
+	while (!kthread_should_stop())
+		schedule_timeout_interruptible(1);
+	return 0;
+}
+
+/* Initialize RCU barrier testing. */
+static int rcu_torture_barrier_init(void)
+{
+	int i;
+	int ret;
+
+	if (n_barrier_cbs == 0)
+		return 0;
+	if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
+		printk(KERN_ALERT "%s" TORTURE_FLAG
+		       " Call or barrier ops missing for %s,\n",
+		       torture_type, cur_ops->name);
+		printk(KERN_ALERT "%s" TORTURE_FLAG
+		       " RCU barrier testing omitted from run.\n",
+		       torture_type);
+		return 0;
+	}
+	atomic_set(&barrier_cbs_count, 0);
+	atomic_set(&barrier_cbs_invoked, 0);
+	barrier_cbs_tasks =
+		kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
+			GFP_KERNEL);
+	barrier_cbs_wq =
+		kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
+			GFP_KERNEL);
+	if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0)
+		return -ENOMEM;
+	for (i = 0; i < n_barrier_cbs; i++) {
+		init_waitqueue_head(&barrier_cbs_wq[i]);
+		barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs,
+						   (void *)(long)i,
+						   "rcu_torture_barrier_cbs");
+		if (IS_ERR(barrier_cbs_tasks[i])) {
+			ret = PTR_ERR(barrier_cbs_tasks[i]);
+			VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs");
+			barrier_cbs_tasks[i] = NULL;
+			return ret;
+		}
+	}
+	barrier_task = kthread_run(rcu_torture_barrier, NULL,
+				   "rcu_torture_barrier");
+	if (IS_ERR(barrier_task)) {
+		ret = PTR_ERR(barrier_task);
+		VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier");
+		barrier_task = NULL;
+	}
+	return 0;
+}
+
+/* Clean up after RCU barrier testing. */
+static void rcu_torture_barrier_cleanup(void)
+{
+	int i;
+
+	if (barrier_task != NULL) {
+		VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task");
+		kthread_stop(barrier_task);
+		barrier_task = NULL;
+	}
+	if (barrier_cbs_tasks != NULL) {
+		for (i = 0; i < n_barrier_cbs; i++) {
+			if (barrier_cbs_tasks[i] != NULL) {
+				VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task");
+				kthread_stop(barrier_cbs_tasks[i]);
+				barrier_cbs_tasks[i] = NULL;
+			}
+		}
+		kfree(barrier_cbs_tasks);
+		barrier_cbs_tasks = NULL;
+	}
+	if (barrier_cbs_wq != NULL) {
+		kfree(barrier_cbs_wq);
+		barrier_cbs_wq = NULL;
+	}
 }
 
 static int rcutorture_cpu_notify(struct notifier_block *self,
@@ -1598,6 +1808,7 @@ rcu_torture_cleanup(void)
 	fullstop = FULLSTOP_RMMOD;
 	mutex_unlock(&fullstop_mutex);
 	unregister_reboot_notifier(&rcutorture_shutdown_nb);
+	rcu_torture_barrier_cleanup();
 	rcu_torture_stall_cleanup();
 	if (stutter_task) {
 		VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
@@ -1665,6 +1876,7 @@ rcu_torture_cleanup(void)
 		VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
 		kthread_stop(shutdown_task);
 	}
+	shutdown_task = NULL;
 	rcu_torture_onoff_cleanup();
 
 	/* Wait for all RCU callbacks to fire.  */
@@ -1676,7 +1888,7 @@ rcu_torture_cleanup(void)
 
 	if (cur_ops->cleanup)
 		cur_ops->cleanup();
-	if (atomic_read(&n_rcu_torture_error))
+	if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
 		rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
 	else if (n_online_successes != n_online_attempts ||
 		 n_offline_successes != n_offline_attempts)
@@ -1692,10 +1904,12 @@ rcu_torture_init(void)
 	int i;
 	int cpu;
 	int firsterr = 0;
+	int retval;
 	static struct rcu_torture_ops *torture_ops[] =
 		{ &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
 		  &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
-		  &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops,
+		  &srcu_ops, &srcu_sync_ops, &srcu_raw_ops,
+		  &srcu_raw_sync_ops, &srcu_expedited_ops,
 		  &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
 
 	mutex_lock(&fullstop_mutex);
@@ -1749,6 +1963,7 @@ rcu_torture_init(void)
 	atomic_set(&n_rcu_torture_free, 0);
 	atomic_set(&n_rcu_torture_mberror, 0);
 	atomic_set(&n_rcu_torture_error, 0);
+	n_rcu_torture_barrier_error = 0;
 	n_rcu_torture_boost_ktrerror = 0;
 	n_rcu_torture_boost_rterror = 0;
 	n_rcu_torture_boost_failure = 0;
@@ -1872,7 +2087,6 @@ rcu_torture_init(void)
 		test_boost_duration = 2;
 	if ((test_boost == 1 && cur_ops->can_boost) ||
 	    test_boost == 2) {
-		int retval;
 
 		boost_starttime = jiffies + test_boost_interval * HZ;
 		register_cpu_notifier(&rcutorture_cpu_nb);
@@ -1897,9 +2111,22 @@ rcu_torture_init(void)
 			goto unwind;
 		}
 	}
-	rcu_torture_onoff_init();
+	i = rcu_torture_onoff_init();
+	if (i != 0) {
+		firsterr = i;
+		goto unwind;
+	}
 	register_reboot_notifier(&rcutorture_shutdown_nb);
-	rcu_torture_stall_init();
+	i = rcu_torture_stall_init();
+	if (i != 0) {
+		firsterr = i;
+		goto unwind;
+	}
+	retval = rcu_torture_barrier_init();
+	if (retval != 0) {
+		firsterr = retval;
+		goto unwind;
+	}
 	rcutorture_record_test_transition();
 	mutex_unlock(&fullstop_mutex);
 	return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d0c5baf1ab1..0da7b88d92d 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
 	.gpnum = -300, \
 	.completed = -300, \
 	.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
+	.orphan_nxttail = &structname##_state.orphan_nxtlist, \
+	.orphan_donetail = &structname##_state.orphan_donelist, \
 	.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
 	.n_force_qs = 0, \
 	.n_force_qs_ngp = 0, \
@@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
 unsigned long rcutorture_testseq;
 unsigned long rcutorture_vernum;
 
+/* State information for rcu_barrier() and friends. */
+
+static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
+static atomic_t rcu_barrier_cpu_count;
+static DEFINE_MUTEX(rcu_barrier_mutex);
+static struct completion rcu_barrier_completion;
+
 /*
  * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
  * permit this function to be invoked without holding the root rcu_node
@@ -192,7 +201,6 @@ void rcu_note_context_switch(int cpu)
 {
 	trace_rcu_utilization("Start context switch");
 	rcu_sched_qs(cpu);
-	rcu_preempt_note_context_switch(cpu);
 	trace_rcu_utilization("End context switch");
 }
 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -1311,95 +1319,133 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
 #ifdef CONFIG_HOTPLUG_CPU
 
 /*
- * Move a dying CPU's RCU callbacks to online CPU's callback list.
- * Also record a quiescent state for this CPU for the current grace period.
- * Synchronization and interrupt disabling are not required because
- * this function executes in stop_machine() context.  Therefore, cleanup
- * operations that might block must be done later from the CPU_DEAD
- * notifier.
- *
- * Note that the outgoing CPU's bit has already been cleared in the
- * cpu_online_mask.  This allows us to randomly pick a callback
- * destination from the bits set in that mask.
+ * Send the specified CPU's RCU callbacks to the orphanage.  The
+ * specified CPU must be offline, and the caller must hold the
+ * ->onofflock.
  */
-static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
+static void
+rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
+			  struct rcu_node *rnp, struct rcu_data *rdp)
 {
 	int i;
-	unsigned long mask;
-	int receive_cpu = cpumask_any(cpu_online_mask);
-	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
-	struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
-	RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */
 
-	/* First, adjust the counts. */
+	/*
+	 * Orphan the callbacks.  First adjust the counts.  This is safe
+	 * because ->onofflock excludes _rcu_barrier()'s adoption of
+	 * the callbacks, thus no memory barrier is required.
+	 */
 	if (rdp->nxtlist != NULL) {
-		receive_rdp->qlen_lazy += rdp->qlen_lazy;
-		receive_rdp->qlen += rdp->qlen;
+		rsp->qlen_lazy += rdp->qlen_lazy;
+		rsp->qlen += rdp->qlen;
+		rdp->n_cbs_orphaned += rdp->qlen;
 		rdp->qlen_lazy = 0;
 		rdp->qlen = 0;
 	}
 
 	/*
-	 * Next, move ready-to-invoke callbacks to be invoked on some
-	 * other CPU.  These will not be required to pass through another
-	 * grace period:  They are done, regardless of CPU.
+	 * Next, move those callbacks still needing a grace period to
+	 * the orphanage, where some other CPU will pick them up.
+	 * Some of the callbacks might have gone partway through a grace
+	 * period, but that is too bad.  They get to start over because we
+	 * cannot assume that grace periods are synchronized across CPUs.
+	 * We don't bother updating the ->nxttail[] array yet, instead
+	 * we just reset the whole thing later on.
 	 */
-	if (rdp->nxtlist != NULL &&
-	    rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) {
-		struct rcu_head *oldhead;
-		struct rcu_head **oldtail;
-		struct rcu_head **newtail;
-
-		oldhead = rdp->nxtlist;
-		oldtail = receive_rdp->nxttail[RCU_DONE_TAIL];
-		rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
-		*rdp->nxttail[RCU_DONE_TAIL] = *oldtail;
-		*receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead;
-		newtail = rdp->nxttail[RCU_DONE_TAIL];
-		for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) {
-			if (receive_rdp->nxttail[i] == oldtail)
-				receive_rdp->nxttail[i] = newtail;
-			if (rdp->nxttail[i] == newtail)
-				rdp->nxttail[i] = &rdp->nxtlist;
-		}
+	if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
+		*rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
+		rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
+		*rdp->nxttail[RCU_DONE_TAIL] = NULL;
 	}
 
 	/*
-	 * Finally, put the rest of the callbacks at the end of the list.
-	 * The ones that made it partway through get to start over:  We
-	 * cannot assume that grace periods are synchronized across CPUs.
-	 * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but
-	 * this does not seem compelling.  Not yet, anyway.)
+	 * Then move the ready-to-invoke callbacks to the orphanage,
+	 * where some other CPU will pick them up.  These will not be
+	 * required to pass though another grace period: They are done.
 	 */
 	if (rdp->nxtlist != NULL) {
-		*receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
-		receive_rdp->nxttail[RCU_NEXT_TAIL] =
-				rdp->nxttail[RCU_NEXT_TAIL];
-		receive_rdp->n_cbs_adopted += rdp->qlen;
-		rdp->n_cbs_orphaned += rdp->qlen;
-
-		rdp->nxtlist = NULL;
-		for (i = 0; i < RCU_NEXT_SIZE; i++)
-			rdp->nxttail[i] = &rdp->nxtlist;
+		*rsp->orphan_donetail = rdp->nxtlist;
+		rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
 	}
 
+	/* Finally, initialize the rcu_data structure's list to empty.  */
+	rdp->nxtlist = NULL;
+	for (i = 0; i < RCU_NEXT_SIZE; i++)
+		rdp->nxttail[i] = &rdp->nxtlist;
+}
+
+/*
+ * Adopt the RCU callbacks from the specified rcu_state structure's
+ * orphanage.  The caller must hold the ->onofflock.
+ */
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+{
+	int i;
+	struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
+
 	/*
-	 * Record a quiescent state for the dying CPU.  This is safe
-	 * only because we have already cleared out the callbacks.
-	 * (Otherwise, the RCU core might try to schedule the invocation
-	 * of callbacks on this now-offline CPU, which would be bad.)
+	 * If there is an rcu_barrier() operation in progress, then
+	 * only the task doing that operation is permitted to adopt
+	 * callbacks.  To do otherwise breaks rcu_barrier() and friends
+	 * by causing them to fail to wait for the callbacks in the
+	 * orphanage.
 	 */
-	mask = rdp->grpmask;	/* rnp->grplo is constant. */
+	if (rsp->rcu_barrier_in_progress &&
+	    rsp->rcu_barrier_in_progress != current)
+		return;
+
+	/* Do the accounting first. */
+	rdp->qlen_lazy += rsp->qlen_lazy;
+	rdp->qlen += rsp->qlen;
+	rdp->n_cbs_adopted += rsp->qlen;
+	rsp->qlen_lazy = 0;
+	rsp->qlen = 0;
+
+	/*
+	 * We do not need a memory barrier here because the only way we
+	 * can get here if there is an rcu_barrier() in flight is if
+	 * we are the task doing the rcu_barrier().
+	 */
+
+	/* First adopt the ready-to-invoke callbacks. */
+	if (rsp->orphan_donelist != NULL) {
+		*rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
+		*rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
+		for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
+			if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
+				rdp->nxttail[i] = rsp->orphan_donetail;
+		rsp->orphan_donelist = NULL;
+		rsp->orphan_donetail = &rsp->orphan_donelist;
+	}
+
+	/* And then adopt the callbacks that still need a grace period. */
+	if (rsp->orphan_nxtlist != NULL) {
+		*rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
+		rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
+		rsp->orphan_nxtlist = NULL;
+		rsp->orphan_nxttail = &rsp->orphan_nxtlist;
+	}
+}
+
+/*
+ * Trace the fact that this CPU is going offline.
+ */
+static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
+{
+	RCU_TRACE(unsigned long mask);
+	RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
+	RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
+
+	RCU_TRACE(mask = rdp->grpmask);
 	trace_rcu_grace_period(rsp->name,
 			       rnp->gpnum + 1 - !!(rnp->qsmask & mask),
 			       "cpuofl");
-	rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum);
-	/* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */
 }
 
 /*
  * The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context.  Do the remainder of the cleanup.
+ * this fact from process context.  Do the remainder of the cleanup,
+ * including orphaning the outgoing CPU's RCU callbacks, and also
+ * adopting them, if there is no _rcu_barrier() instance running.
  * There can only be one CPU hotplug operation at a time, so no other
  * CPU can be attempting to update rcu_cpu_kthread_task.
  */
@@ -1409,17 +1455,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 	unsigned long mask;
 	int need_report = 0;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rnp. */
+	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
 	/* Adjust any no-longer-needed kthreads. */
 	rcu_stop_cpu_kthread(cpu);
 	rcu_node_kthread_setaffinity(rnp, -1);
 
-	/* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */
+	/* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
 
 	/* Exclude any attempts to start a new grace period. */
 	raw_spin_lock_irqsave(&rsp->onofflock, flags);
 
+	/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
+	rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
+	rcu_adopt_orphan_cbs(rsp);
+
 	/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
 	mask = rdp->grpmask;	/* rnp->grplo is constant. */
 	do {
@@ -1456,6 +1506,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 
 #else /* #ifdef CONFIG_HOTPLUG_CPU */
 
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+{
+}
+
 static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 {
 }
@@ -1524,9 +1578,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 			    rcu_is_callbacks_kthread());
 
 	/* Update count, and requeue any remaining callbacks. */
-	rdp->qlen_lazy -= count_lazy;
-	rdp->qlen -= count;
-	rdp->n_cbs_invoked += count;
 	if (list != NULL) {
 		*tail = rdp->nxtlist;
 		rdp->nxtlist = list;
@@ -1536,6 +1587,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 			else
 				break;
 	}
+	smp_mb(); /* List handling before counting for rcu_barrier(). */
+	rdp->qlen_lazy -= count_lazy;
+	rdp->qlen -= count;
+	rdp->n_cbs_invoked += count;
 
 	/* Reinstate batch limit if we have worked down the excess. */
 	if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
@@ -1823,11 +1878,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 	rdp = this_cpu_ptr(rsp->rda);
 
 	/* Add the callback to our list. */
-	*rdp->nxttail[RCU_NEXT_TAIL] = head;
-	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
 	rdp->qlen++;
 	if (lazy)
 		rdp->qlen_lazy++;
+	else
+		rcu_idle_count_callbacks_posted();
+	smp_mb();  /* Count before adding callback for rcu_barrier(). */
+	*rdp->nxttail[RCU_NEXT_TAIL] = head;
+	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
 
 	if (__is_kfree_rcu_offset((unsigned long)func))
 		trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
@@ -1893,6 +1951,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 }
 EXPORT_SYMBOL_GPL(call_rcu_bh);
 
+/*
+ * Because a context switch is a grace period for RCU-sched and RCU-bh,
+ * any blocking grace-period wait automatically implies a grace period
+ * if there is only one CPU online at any point time during execution
+ * of either synchronize_sched() or synchronize_rcu_bh().  It is OK to
+ * occasionally incorrectly indicate that there are multiple CPUs online
+ * when there was in fact only one the whole time, as this just adds
+ * some overhead: RCU still operates correctly.
+ *
+ * Of course, sampling num_online_cpus() with preemption enabled can
+ * give erroneous results if there are concurrent CPU-hotplug operations.
+ * For example, given a demonic sequence of preemptions in num_online_cpus()
+ * and CPU-hotplug operations, there could be two or more CPUs online at
+ * all times, but num_online_cpus() might well return one (or even zero).
+ *
+ * However, all such demonic sequences require at least one CPU-offline
+ * operation.  Furthermore, rcu_blocking_is_gp() giving the wrong answer
+ * is only a problem if there is an RCU read-side critical section executing
+ * throughout.  But RCU-sched and RCU-bh read-side critical sections
+ * disable either preemption or bh, which prevents a CPU from going offline.
+ * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
+ * that there is only one CPU when in fact there was more than one throughout
+ * is when there were no RCU readers in the system.  If there are no
+ * RCU readers, the grace period by definition can be of zero length,
+ * regardless of the number of online CPUs.
+ */
+static inline int rcu_blocking_is_gp(void)
+{
+	might_sleep();  /* Check for RCU read-side critical section. */
+	return num_online_cpus() <= 1;
+}
+
 /**
  * synchronize_sched - wait until an rcu-sched grace period has elapsed.
  *
@@ -2166,11 +2256,10 @@ static int rcu_cpu_has_callbacks(int cpu)
 	       rcu_preempt_cpu_has_callbacks(cpu);
 }
 
-static DEFINE_PER_CPU(struct rcu_head
author	Ingo Molnar <mingo@kernel.org>	2012-05-14 08:41:20 +0200
committer	Ingo Molnar <mingo@kernel.org>	2012-05-14 08:41:46 +0200
commit	2d84e023cb5ec00403ff5d447533c6fd58fcc7ff (patch)
tree	cb10d9a568ebb4be8593821a6f205efedf2f4ddd /kernel
parent	9ff00d58a915b6747ba2e843ab2d04c712b4dc32 (diff)
parent	dc36be4419311fd57becdf54bfeef6bd04a6741d (diff)