diff options
-rw-r--r-- | Documentation/RCU/00-INDEX | 2 | ||||
-rw-r--r-- | Documentation/RCU/checklist.txt | 12 | ||||
-rw-r--r-- | Documentation/RCU/rcu_dereference.txt | 371 | ||||
-rw-r--r-- | Documentation/RCU/stallwarn.txt | 2 | ||||
-rw-r--r-- | Documentation/RCU/whatisRCU.txt | 55 | ||||
-rw-r--r-- | include/linux/rcupdate.h | 5 | ||||
-rw-r--r-- | kernel/rcu/tiny_plugin.h | 8 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 249 | ||||
-rw-r--r-- | kernel/rcu/tree.h | 3 | ||||
-rw-r--r-- | kernel/rcu/tree_plugin.h | 120 | ||||
-rw-r--r-- | kernel/softirq.c | 4 |
11 files changed, 626 insertions, 205 deletions
diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX index fa57139f50b..f773a264ae0 100644 --- a/Documentation/RCU/00-INDEX +++ b/Documentation/RCU/00-INDEX @@ -12,6 +12,8 @@ lockdep-splat.txt - RCU Lockdep splats explained. NMI-RCU.txt - Using RCU to Protect Dynamic NMI Handlers +rcu_dereference.txt + - Proper care and feeding of return values from rcu_dereference() rcubarrier.txt - RCU and Unloadable Modules rculist_nulls.txt diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index 9d10d1db16a..877947130eb 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -114,12 +114,16 @@ over a rather long period of time, but improvements are always welcome! http://www.openvms.compaq.com/wizard/wiz_2637.html The rcu_dereference() primitive is also an excellent - documentation aid, letting the person reading the code - know exactly which pointers are protected by RCU. + documentation aid, letting the person reading the + code know exactly which pointers are protected by RCU. Please note that compilers can also reorder code, and they are becoming increasingly aggressive about doing - just that. The rcu_dereference() primitive therefore - also prevents destructive compiler optimizations. + just that. The rcu_dereference() primitive therefore also + prevents destructive compiler optimizations. However, + with a bit of devious creativity, it is possible to + mishandle the return value from rcu_dereference(). + Please see rcu_dereference.txt in this directory for + more information. The rcu_dereference() primitive is used by the various "_rcu()" list-traversal primitives, such diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.txt new file mode 100644 index 00000000000..ceb05da5a5a --- /dev/null +++ b/Documentation/RCU/rcu_dereference.txt @@ -0,0 +1,371 @@ +PROPER CARE AND FEEDING OF RETURN VALUES FROM rcu_dereference() + +Most of the time, you can use values from rcu_dereference() or one of +the similar primitives without worries. Dereferencing (prefix "*"), +field selection ("->"), assignment ("="), address-of ("&"), addition and +subtraction of constants, and casts all work quite naturally and safely. + +It is nevertheless possible to get into trouble with other operations. +Follow these rules to keep your RCU code working properly: + +o You must use one of the rcu_dereference() family of primitives + to load an RCU-protected pointer, otherwise CONFIG_PROVE_RCU + will complain. Worse yet, your code can see random memory-corruption + bugs due to games that compilers and DEC Alpha can play. + Without one of the rcu_dereference() primitives, compilers + can reload the value, and won't your code have fun with two + different values for a single pointer! Without rcu_dereference(), + DEC Alpha can load a pointer, dereference that pointer, and + return data preceding initialization that preceded the store of + the pointer. + + In addition, the volatile cast in rcu_dereference() prevents the + compiler from deducing the resulting pointer value. Please see + the section entitled "EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH" + for an example where the compiler can in fact deduce the exact + value of the pointer, and thus cause misordering. + +o Do not use single-element RCU-protected arrays. The compiler + is within its right to assume that the value of an index into + such an array must necessarily evaluate to zero. The compiler + could then substitute the constant zero for the computation, so + that the array index no longer depended on the value returned + by rcu_dereference(). If the array index no longer depends + on rcu_dereference(), then both the compiler and the CPU + are within their rights to order the array access before the + rcu_dereference(), which can cause the array access to return + garbage. + +o Avoid cancellation when using the "+" and "-" infix arithmetic + operators. For example, for a given variable "x", avoid + "(x-x)". There are similar arithmetic pitfalls from other + arithmetic operatiors, such as "(x*0)", "(x/(x+1))" or "(x%1)". + The compiler is within its rights to substitute zero for all of + these expressions, so that subsequent accesses no longer depend + on the rcu_dereference(), again possibly resulting in bugs due + to misordering. + + Of course, if "p" is a pointer from rcu_dereference(), and "a" + and "b" are integers that happen to be equal, the expression + "p+a-b" is safe because its value still necessarily depends on + the rcu_dereference(), thus maintaining proper ordering. + +o Avoid all-zero operands to the bitwise "&" operator, and + similarly avoid all-ones operands to the bitwise "|" operator. + If the compiler is able to deduce the value of such operands, + it is within its rights to substitute the corresponding constant + for the bitwise operation. Once again, this causes subsequent + accesses to no longer depend on the rcu_dereference(), causing + bugs due to misordering. + + Please note that single-bit operands to bitwise "&" can also + be dangerous. At this point, the compiler knows that the + resulting value can only take on one of two possible values. + Therefore, a very small amount of additional information will + allow the compiler to deduce the exact value, which again can + result in misordering. + +o If you are using RCU to protect JITed functions, so that the + "()" function-invocation operator is applied to a value obtained + (directly or indirectly) from rcu_dereference(), you may need to + interact directly with the hardware to flush instruction caches. + This issue arises on some systems when a newly JITed function is + using the same memory that was used by an earlier JITed function. + +o Do not use the results from the boolean "&&" and "||" when + dereferencing. For example, the following (rather improbable) + code is buggy: + + int a[2]; + int index; + int force_zero_index = 1; + + ... + + r1 = rcu_dereference(i1) + r2 = a[r1 && force_zero_index]; /* BUGGY!!! */ + + The reason this is buggy is that "&&" and "||" are often compiled + using branches. While weak-memory machines such as ARM or PowerPC + do order stores after such branches, they can speculate loads, + which can result in misordering bugs. + +o Do not use the results from relational operators ("==", "!=", + ">", ">=", "<", or "<=") when dereferencing. For example, + the following (quite strange) code is buggy: + + int a[2]; + int index; + int flip_index = 0; + + ... + + r1 = rcu_dereference(i1) + r2 = a[r1 != flip_index]; /* BUGGY!!! */ + + As before, the reason this is buggy is that relational operators + are often compiled using branches. And as before, although + weak-memory machines such as ARM or PowerPC do order stores + after such branches, but can speculate loads, which can again + result in misordering bugs. + +o Be very careful about comparing pointers obtained from + rcu_dereference() against non-NULL values. As Linus Torvalds + explained, if the two pointers are equal, the compiler could + substitute the pointer you are comparing against for the pointer + obtained from rcu_dereference(). For example: + + p = rcu_dereference(gp); + if (p == &default_struct) + do_default(p->a); + + Because the compiler now knows that the value of "p" is exactly + the address of the variable "default_struct", it is free to + transform this code into the following: + + p = rcu_dereference(gp); + if (p == &default_struct) + do_default(default_struct.a); + + On ARM and Power hardware, the load from "default_struct.a" + can now be speculated, such that it might happen before the + rcu_dereference(). This could result in bugs due to misordering. + + However, comparisons are OK in the following cases: + + o The comparison was against the NULL pointer. If the + compiler knows that the pointer is NULL, you had better + not be dereferencing it anyway. If the comparison is + non-equal, the compiler is none the wiser. Therefore, + it is safe to compare pointers from rcu_dereference() + against NULL pointers. + + o The pointer is never dereferenced after being compared. + Since there are no subsequent dereferences, the compiler + cannot use anything it learned from the comparison + to reorder the non-existent subsequent dereferences. + This sort of comparison occurs frequently when scanning + RCU-protected circular linked lists. + + o The comparison is against a pointer that references memory + that was initialized "a long time ago." The reason + this is safe is that even if misordering occurs, the + misordering will not affect the accesses that follow + the comparison. So exactly how long ago is "a long + time ago"? Here are some possibilities: + + o Compile time. + + o Boot time. + + o Module-init time for module code. + + o Prior to kthread creation for kthread code. + + o During some prior acquisition of the lock that + we now hold. + + o Before mod_timer() time for a timer handler. + + There are many other possibilities involving the Linux + kernel's wide array of primitives that cause code to + be invoked at a later time. + + o The pointer being compared against also came from + rcu_dereference(). In this case, both pointers depend + on one rcu_dereference() or another, so you get proper + ordering either way. + + That said, this situation can make certain RCU usage + bugs more likely to happen. Which can be a good thing, + at least if they happen during testing. An example + of such an RCU usage bug is shown in the section titled + "EXAMPLE OF AMPLIFIED RCU-USAGE BUG". + + o All of the accesses following the comparison are stores, + so that a control dependency preserves the needed ordering. + That said, it is easy to get control dependencies wrong. + Please see the "CONTROL DEPENDENCIES" section of + Documentation/memory-barriers.txt for more details. + + o The pointers are not equal -and- the compiler does + not have enough information to deduce the value of the + pointer. Note that the volatile cast in rcu_dereference() + will normally prevent the compiler from knowing too much. + +o Disable any value-speculation optimizations that your compiler + might provide, especially if you are making use of feedback-based + optimizations that take data collected from prior runs. Such + value-speculation optimizations reorder operations by design. + + There is one exception to this rule: Value-speculation + optimizations that leverage the branch-prediction hardware are + safe on strongly ordered systems (such as x86), but not on weakly + ordered systems (such as ARM or Power). Choose your compiler + command-line options wisely! + + +EXAMPLE OF AMPLIFIED RCU-USAGE BUG + +Because updaters can run concurrently with RCU readers, RCU readers can +see stale and/or inconsistent values. If RCU readers need fresh or +consistent values, which they sometimes do, they need to take proper +precautions. To see this, consider the following code fragment: + + struct foo { + int a; + int b; + int c; + }; + struct foo *gp1; + struct foo *gp2; + + void updater(void) + { + struct foo *p; + + p = kmalloc(...); + if (p == NULL) + deal_with_it(); + p->a = 42; /* Each field in its own cache line. */ + p->b = 43; + p->c = 44; + rcu_assign_pointer(gp1, p); + p->b = 143; + p->c = 144; + rcu_assign_pointer(gp2, p); + } + + void reader(void) + { + struct foo *p; + struct foo *q; + int r1, r2; + + p = rcu_dereference(gp2); + if (p == NULL) + return; + r1 = p->b; /* Guaranteed to get 143. */ + q = rcu_dereference(gp1); /* Guaranteed non-NULL. */ + if (p == q) { + /* The compiler decides that q->c is same as p->c. */ + r2 = p->c; /* Could get 44 on weakly order system. */ + } + do_something_with(r1, r2); + } + +You might be surprised that the outcome (r1 == 143 && r2 == 44) is possible, +but you should not be. After all, the updater might have been invoked +a second time between the time reader() loaded into "r1" and the time +that it loaded into "r2". The fact that this same result can occur due +to some reordering from the compiler and CPUs is beside the point. + +But suppose that the reader needs a consistent view? + +Then one approach is to use locking, for example, as follows: + + struct foo { + int a; + int b; + int c; + spinlock_t lock; + }; + struct foo *gp1; + struct foo *gp2; + + void updater(void) + { + struct foo *p; + + p = kmalloc(...); + if (p == NULL) + deal_with_it(); + spin_lock(&p->lock); + p->a = 42; /* Each field in its own cache line. */ + p->b = 43; + p->c = 44; + spin_unlock(&p->lock); + rcu_assign_pointer(gp1, p); + spin_lock(&p->lock); + p->b = 143; + p->c = 144; + spin_unlock(&p->lock); + rcu_assign_pointer(gp2, p); + } + + void reader(void) + { + struct foo *p; + struct foo *q; + int r1, r2; + + p = rcu_dereference(gp2); + if (p == NULL) + return; + spin_lock(&p->lock); + r1 = p->b; /* Guaranteed to get 143. */ + q = rcu_dereference(gp1); /* Guaranteed non-NULL. */ + if (p == q) { + /* The compiler decides that q->c is same as p->c. */ + r2 = p->c; /* Locking guarantees r2 == 144. */ + } + spin_unlock(&p->lock); + do_something_with(r1, r2); + } + +As always, use the right tool for the job! + + +EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH + +If a pointer obtained from rcu_dereference() compares not-equal to some +other pointer, the compiler normally has no clue what the value of the +first pointer might be. This lack of knowledge prevents the compiler +from carrying out optimizations that otherwise might destroy the ordering +guarantees that RCU depends on. And the volatile cast in rcu_dereference() +should prevent the compiler from guessing the value. + +But without rcu_dereference(), the compiler knows more than you might +expect. Consider the following code fragment: + + struct foo { + int a; + int b; + }; + static struct foo variable1; + static struct foo variable2; + static struct foo *gp = &variable1; + + void updater(void) + { + initialize_foo(&variable2); + rcu_assign_pointer(gp, &variable2); + /* + * The above is the only store to gp in this translation unit, + * and the address of gp is not exported in any way. + */ + } + + int reader(void) + { + struct foo *p; + + p = gp; + barrier(); + if (p == &variable1) + return p->a; /* Must be variable1.a. */ + else + return p->b; /* Must be variable2.b. */ + } + +Because the compiler can see all stores to "gp", it knows that the only +possible values of "gp" are "variable1" on the one hand and "variable2" +on the other. The comparison in reader() therefore tells the compiler +the exact value of "p" even in the not-equals case. This allows the +compiler to make the return values independent of the load from "gp", +in turn destroying the ordering between this load and the loads of the +return values. This can result in "p->b" returning pre-initialization +garbage values. + +In short, rcu_dereference() is -not- optional when you are going to +dereference the resulting pointer. diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 6f3a0057548..68fe3ad2701 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -24,7 +24,7 @@ CONFIG_RCU_CPU_STALL_TIMEOUT timing of the next warning for the current stall. Stall-warning messages may be enabled and disabled completely via - /sys/module/rcutree/parameters/rcu_cpu_stall_suppress. + /sys/module/rcupdate/parameters/rcu_cpu_stall_suppress. CONFIG_RCU_CPU_STALL_VERBOSE diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 0f0fb7c432c..49b8551a3b6 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -326,11 +326,11 @@ used as follows: a. synchronize_rcu() rcu_read_lock() / rcu_read_unlock() call_rcu() rcu_dereference() -b. call_rcu_bh() rcu_read_lock_bh() / rcu_read_unlock_bh() - rcu_dereference_bh() +b. synchronize_rcu_bh() rcu_read_lock_bh() / rcu_read_unlock_bh() + call_rcu_bh() rcu_dereference_bh() c. synchronize_sched() rcu_read_lock_sched() / rcu_read_unlock_sched() - preempt_disable() / preempt_enable() + call_rcu_sched() preempt_disable() / preempt_enable() local_irq_save() / local_irq_restore() hardirq enter / hardirq exit NMI enter / NMI exit @@ -794,10 +794,22 @@ in docbook. Here is the list, by category. RCU list traversal: + list_entry_rcu + list_first_entry_rcu + list_next_rcu list_for_each_entry_rcu + list_for_each_entry_continue_rcu + hlist_first_rcu + hlist_next_rcu + hlist_pprev_rcu hlist_for_each_entry_rcu + hlist_for_each_entry_rcu_bh + hlist_for_each_entry_continue_rcu + hlist_for_each_entry_continue_rcu_bh + hlist_nulls_first_rcu hlist_nulls_for_each_entry_rcu - list_for_each_entry_continue_rcu + hlist_bl_first_rcu + hlist_bl_for_each_entry_rcu RCU pointer/list update: @@ -806,28 +818,38 @@ RCU pointer/list update: list_add_tail_rcu list_del_rcu list_replace_rcu - hlist_del_rcu hlist_add_after_rcu hlist_add_before_rcu hlist_add_head_rcu + hlist_del_rcu + hlist_del_init_rcu hlist_replace_rcu list_splice_init_rcu() + hlist_nulls_del_init_rcu + hlist_nulls_del_rcu + hlist_nulls_add_head_rcu + hlist_bl_add_head_rcu + hlist_bl_del_init_rcu + hlist_bl_del_rcu + hlist_bl_set_first_rcu RCU: Critical sections Grace period Barrier rcu_read_lock synchronize_net rcu_barrier rcu_read_unlock synchronize_rcu rcu_dereference synchronize_rcu_expedited - call_rcu - kfree_rcu - + rcu_read_lock_held call_rcu + rcu_dereference_check kfree_rcu + rcu_dereference_protected bh: Critical sections Grace period Barrier rcu_read_lock_bh call_rcu_bh rcu_barrier_bh rcu_read_unlock_bh synchronize_rcu_bh rcu_dereference_bh synchronize_rcu_bh_expedited - + rcu_dereference_bh_check + rcu_dereference_bh_protected + rcu_read_lock_bh_held sched: Critical sections Grace period Barrier @@ -835,7 +857,12 @@ sched: Critical sections Grace period Barrier rcu_read_unlock_sched call_rcu_sched [preempt_disable] synchronize_sched_expedited [and friends] + rcu_read_lock_sched_notrace + rcu_read_unlock_sched_notrace rcu_dereference_sched + rcu_dereference_sched_check + rcu_dereference_sched_protected + rcu_read_lock_sched_held SRCU: Critical sections Grace period Barrier @@ -843,6 +870,8 @@ SRCU: Critical sections Grace period Barrier srcu_read_lock synchronize_srcu srcu_barrier srcu_read_unlock call_srcu srcu_dereference synchronize_srcu_expedited + srcu_dereference_check + srcu_read_lock_held SRCU: Initialization/cleanup init_srcu_struct @@ -850,9 +879,13 @@ SRCU: Initialization/cleanup All: lockdep-checked RCU-protected pointer access - rcu_dereference_check - rcu_dereference_protected + rcu_access_index rcu_access_pointer + rcu_dereference_index_check + rcu_dereference_raw + rcu_lockdep_assert + rcu_sleep_check + RCU_NONIDLE See the comment headers in the source code (or the docbook generated from them) for more information. diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 97cc8d6679b..9ccd644c123 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -383,7 +383,7 @@ extern struct lockdep_map rcu_lock_map; extern struct lockdep_map rcu_bh_lock_map; extern struct lockdep_map rcu_sched_lock_map; extern struct lockdep_map rcu_callback_map; -extern int debug_lockdep_rcu_enabled(void); +int debug_lockdep_rcu_enabled(void); /** * rcu_read_lock_held() - might we be in RCU read-side critical section? @@ -1004,6 +1004,9 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * pointers, but you must use rcu_assign_pointer() to initialize the * external-to-structure pointer -after- you have completely initialized * the reader-accessible portions of the linked structure. + * + * Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no + * ordering guarantees for either the CPU or the compiler. */ #define RCU_INIT_POINTER(p, v) \ do { \ diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 43152852056..858c5656912 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -144,7 +144,7 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) return; rcp->ticks_this_gp++; j = jiffies; - js = rcp->jiffies_stall; + js = ACCESS_ONCE(rcp->jiffies_stall); if (*rcp->curtail && ULONG_CMP_GE(j, js)) { pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, @@ -152,17 +152,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) dump_stack(); } if (*rcp->curtail && ULONG_CMP_GE(j, js)) - rcp->jiffies_stall = jiffies + + ACCESS_ONCE(rcp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; else if (ULONG_CMP_GE(j, js)) - rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); + ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); } static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) { rcp->ticks_this_gp = 0; rcp->gp_start = jiffies; - rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); + ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); } static void check_cpu_stalls(void) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 93e64381aa2..3bbe48939e4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -243,7 +243,7 @@ static ulong jiffies_till_next_fqs = ULONG_MAX; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); -static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, +static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp, bool *isidle, @@ -271,6 +271,15 @@ long rcu_batches_completed_bh(void) EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); /* + * Force a quiescent state. + */ +void rcu_force_quiescent_state(void) +{ + force_quiescent_state(rcu_state); +} +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); + +/* * Force a quiescent state for RCU BH. */ void rcu_bh_force_quiescent_state(void) @@ -372,6 +381,28 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) } /* + * Return the root node of the specified rcu_state structure. + */ +static struct rcu_node *rcu_get_root(struct rcu_state *rsp) +{ + return &rsp->node[0]; +} + +/* + * Is there any need for future grace periods? + * Interrupts must be disabled. If the caller does not hold the root + * rnp_node structure's ->lock, the results are advisory only. + */ +static int rcu_future_needs_gp(struct rcu_state *rsp) +{ + struct rcu_node *rnp = rcu_get_root(rsp); + int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1; + int *fp = &rnp->need_future_gp[idx]; + + return ACCESS_ONCE(*fp); +} + +/* * Does the current CPU require a not-yet-started grace period? * The caller must have disabled interrupts to prevent races with * normal callback registry. @@ -383,7 +414,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) if (rcu_gp_in_progress(rsp)) return 0; /* No, a grace period is already in progress. */ - if (rcu_nocb_needs_gp(rsp)) + if (rcu_future_needs_gp(rsp)) return 1; /* Yes, a no-CBs CPU needs one. */ if (!rdp->nxttail[RCU_NEXT_TAIL]) return 0; /* No, this is a no-CBs (or offline) CPU. */ @@ -398,14 +429,6 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) } /* - * Return the root node of the specified rcu_state structure. - */ -static struct rcu_node *rcu_get_root(struct rcu_state *rsp) -{ - return &rsp->node[0]; -} - -/* * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state * * If the new value of the ->dynticks_nesting counter now is zero, @@ -806,7 +829,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, { rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); rcu_sysidle_check_cpu(rdp, isidle, maxj); - return (rdp->dynticks_snap & 0x1) == 0; + if ((rdp->dynticks_snap & 0x1) == 0) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); + return 1; + } else { + return 0; + } } /* @@ -899,7 +927,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) rsp->gp_start = j; smp_wmb(); /* Record start time before stall time. */ j1 = rcu_jiffies_till_stall_check(); - rsp->jiffies_stall = j + j1; + ACCESS_ONCE(rsp->jiffies_stall) = j + j1; rsp->jiffies_resched = j + j1 / 2; } @@ -938,12 +966,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp) /* Only let one CPU complain about others per time interval. */ raw_spin_lock_irqsave(&rnp->lock, flags); - delta = jiffies - rsp->jiffies_stall; + delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -980,9 +1008,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp) print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; - pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", + pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), - rsp->gpnum, rsp->completed, totqlen); + (long)rsp->gpnum, (long)rsp->completed, totqlen); if (ndetected == 0) pr_err("INFO: Stall ended before state dump start\n"); else if (!trigger_all_cpu_backtrace()) @@ -995,12 +1023,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp) force_quiescent_state(rsp); /* Kick them all. */ } -/* - * This function really isn't for public consumption, but RCU is special in - * that context switches can allow the state machine to make progress. - */ -extern void resched_cpu(int cpu); - static void print_cpu_stall(struct rcu_state *rsp) { int cpu; @@ -1019,14 +1041,15 @@ static void print_cpu_stall(struct rcu_state *rsp) print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; - pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", - jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); + pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", + jiffies - rsp->gp_start, + (long)rsp->gpnum, (long)rsp->completed, totqlen); if (!trigger_all_cpu_backtrace()) dump_stack(); raw_spin_lock_irqsave(&rnp->lock, flags); - if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) - rsp->jiffies_stall = jiffies + + if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) + ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1110,7 +1133,7 @@ void rcu_cpu_stall_reset(void) struct rcu_state *rsp; for_each_rcu_flavor(rsp) - rsp->jiffies_stall = jiffies + ULONG_MAX / 2; + ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2; } /* @@ -1171,15 +1194,18 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* * Start some future grace period, as needed to handle newly arrived * callbacks. The required future grace periods are recorded in each - * rcu_node structure's ->need_future_gp field. + * rcu_node structure's ->need_future_gp field. Returns true if there + * is reason to awaken the grace-period kthread. * * The caller must hold the specified rcu_node structure's ->lock. */ -static unsigned long __maybe_unused -rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) +static bool __maybe_unused +rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long *c_out) { unsigned long c; int i; + bool ret = false; struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); /* @@ -1190,7 +1216,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); if (rnp->need_future_gp[c & 0x1]) { trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); - return c; + goto out; } /* @@ -1204,7 +1230,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { rnp->need_future_gp[c & 0x1]++; trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); - return c; + goto out; } /* @@ -1245,12 +1271,15 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); } else { trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); - rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); + ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); } unlock_out: if (rnp != rnp_root) raw_spin_unlock(&rnp_root->lock); - return c; +out: + if (c_out != NULL) + *c_out = c; + return ret; } /* @@ -1274,25 +1303,43 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) } /* + * Awaken the grace-period kthread for the specified flavor of RCU. + * Don't do a self-awaken, and don't bother awakening when there is + * nothing for the grace-period kthread to do (as in several CPUs + * raced to awaken, and we lost), and finally don't try to awaken + * a kthread that has not yet been created. + */ +static void rcu_gp_kthread_wake(struct rcu_state *rsp) +{ + if (current == rsp->gp_kthread || + !ACCESS_ONCE(rsp->gp_flags) || + !rsp->gp_kthread) + return; + wake_up(&rsp->gp_wq); +} + +/* * If there is room, assign a ->completed number to any callbacks on * this CPU that have not already been assigned. Also accelerate any * callbacks that were previously assigned a ->completed number that has * since proven to be too conservative, which can happen if callbacks get * assigned a ->completed number while RCU is idle, but with reference to * a non-root rcu_node structure. This function is idempotent, so it does - * not hurt to call it repeatedly. + * not hurt to call it repeatedly. Returns an flag saying that we should + * awaken the RCU grace-period kthread. * * The caller must hold rnp->lock with interrupts disabled. */ -static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, +static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { unsigned long c; int i; + bool ret; /* If the CPU has no callbacks, nothing to do. */ if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) - return; + return false; /* * Starting from the sublist containing the callbacks most @@ -1321,7 +1368,7 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * be grouped into. */ if (++i >= RCU_NEXT_TAIL) - return; + return false; /* * Assign all subsequent callbacks' ->completed number to the next @@ -1333,13 +1380,14 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, rdp->nxtcompleted[i] = c; } /* Record any needed additional grace periods. */ - rcu_start_future_gp(rnp, rdp); + ret = rcu_start_future_gp(rnp, rdp, NULL); /* Trace depending on how much we were able to accelerate. */ if (!*rdp->nxttail[RCU_WAIT_TAIL]) trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); else trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); + return ret; } /* @@ -1348,17 +1396,18 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL * sublist. This function is idempotent, so it does not hurt to * invoke it repeatedly. As long as it is not invoked -too- often... + * Returns true if the RCU grace-period kthread needs to be awakened. * * The caller must hold rnp->lock with interrupts disabled. */ -static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, +static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { int i, j; /* If the CPU has no callbacks, nothing to do. */ if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) - return; + return false; /* * Find all callbacks whose ->completed numbers indicate that they @@ -1382,26 +1431,30 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, } /* Classify any remaining callbacks. */ - rcu_accelerate_cbs(rsp, rnp, rdp); + return rcu_accelerate_cbs(rsp, rnp, rdp); } /* * Update CPU-local rcu_data state to record the beginnings and ends of * grace periods. The caller must hold the ->lock of the leaf rcu_node * structure corresponding to the current CPU, and must have irqs disabled. + * Returns true if the grace-period kthread needs to be awakened. */ -static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) { + bool ret; + /* Han |