/*
 * Sleepable Read-Copy Update mechanism for mutual exclusion.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2006
 *
 * Author: Paul McKenney <paulmck@us.ibm.com>
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *		Documentation/RCU/ *.txt
 *
 */
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/delay.h>
#include <linux/srcu.h>
36 | ||
632ee200 PM |
37 | static int init_srcu_struct_fields(struct srcu_struct *sp) |
38 | { | |
39 | sp->completed = 0; | |
40 | mutex_init(&sp->mutex); | |
41 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); | |
42 | return sp->per_cpu_ref ? 0 : -ENOMEM; | |
43 | } | |
44 | ||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
46 | ||
47 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, | |
48 | struct lock_class_key *key) | |
49 | { | |
632ee200 PM |
50 | /* Don't re-initialize a lock while it is held. */ |
51 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); | |
52 | lockdep_init_map(&sp->dep_map, name, key, 0); | |
632ee200 PM |
53 | return init_srcu_struct_fields(sp); |
54 | } | |
55 | EXPORT_SYMBOL_GPL(__init_srcu_struct); | |
56 | ||
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
58 | ||
621934ee PM |
/**
 * init_srcu_struct - initialize a sleep-RCU structure
 * @sp: structure to initialize.
 *
 * Must invoke this on a given srcu_struct before passing that srcu_struct
 * to any other function.  Each srcu_struct represents a separate domain
 * of SRCU protection.  Returns 0 on success, -ENOMEM on allocation
 * failure.
 */
int init_srcu_struct(struct srcu_struct *sp)
{
	return init_srcu_struct_fields(sp);
}
EXPORT_SYMBOL_GPL(init_srcu_struct);
621934ee | 72 | |
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
74 | ||
b52ce066 LJ |
75 | /* |
76 | * Returns approximate total of the readers' ->seq[] values for the | |
77 | * rank of per-CPU counters specified by idx. | |
78 | */ | |
79 | static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) | |
80 | { | |
81 | int cpu; | |
82 | unsigned long sum = 0; | |
83 | unsigned long t; | |
84 | ||
85 | for_each_possible_cpu(cpu) { | |
86 | t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); | |
87 | sum += t; | |
88 | } | |
89 | return sum; | |
90 | } | |
91 | ||
621934ee | 92 | /* |
cef50120 | 93 | * Returns approximate number of readers active on the specified rank |
b52ce066 | 94 | * of the per-CPU ->c[] counters. |
621934ee | 95 | */ |
cef50120 PM |
96 | static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) |
97 | { | |
98 | int cpu; | |
99 | unsigned long sum = 0; | |
100 | unsigned long t; | |
621934ee | 101 | |
cef50120 PM |
102 | for_each_possible_cpu(cpu) { |
103 | t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); | |
104 | sum += t; | |
cef50120 | 105 | } |
b52ce066 | 106 | return sum; |
cef50120 PM |
107 | } |
108 | ||
109 | /* | |
b52ce066 LJ |
110 | * Return true if the number of pre-existing readers is determined to |
111 | * be stably zero. An example unstable zero can occur if the call | |
112 | * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, | |
113 | * but due to task migration, sees the corresponding __srcu_read_unlock() | |
114 | * decrement. This can happen because srcu_readers_active_idx() takes | |
115 | * time to sum the array, and might in fact be interrupted or preempted | |
116 | * partway through the summation. | |
cef50120 PM |
117 | */ |
118 | static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) | |
621934ee | 119 | { |
b52ce066 LJ |
120 | unsigned long seq; |
121 | ||
122 | seq = srcu_readers_seq_idx(sp, idx); | |
123 | ||
124 | /* | |
125 | * The following smp_mb() A pairs with the smp_mb() B located in | |
126 | * __srcu_read_lock(). This pairing ensures that if an | |
127 | * __srcu_read_lock() increments its counter after the summation | |
128 | * in srcu_readers_active_idx(), then the corresponding SRCU read-side | |
129 | * critical section will see any changes made prior to the start | |
130 | * of the current SRCU grace period. | |
131 | * | |
132 | * Also, if the above call to srcu_readers_seq_idx() saw the | |
133 | * increment of ->seq[], then the call to srcu_readers_active_idx() | |
134 | * must see the increment of ->c[]. | |
135 | */ | |
136 | smp_mb(); /* A */ | |
621934ee | 137 | |
cef50120 PM |
138 | /* |
139 | * Note that srcu_readers_active_idx() can incorrectly return | |
140 | * zero even though there is a pre-existing reader throughout. | |
141 | * To see this, suppose that task A is in a very long SRCU | |
142 | * read-side critical section that started on CPU 0, and that | |
b52ce066 | 143 | * no other reader exists, so that the sum of the counters |
cef50120 PM |
144 | * is equal to one. Then suppose that task B starts executing |
145 | * srcu_readers_active_idx(), summing up to CPU 1, and then that | |
146 | * task C starts reading on CPU 0, so that its increment is not | |
147 | * summed, but finishes reading on CPU 2, so that its decrement | |
148 | * -is- summed. Then when task B completes its sum, it will | |
149 | * incorrectly get zero, despite the fact that task A has been | |
150 | * in its SRCU read-side critical section the whole time. | |
151 | * | |
152 | * We therefore do a validation step should srcu_readers_active_idx() | |
153 | * return zero. | |
154 | */ | |
155 | if (srcu_readers_active_idx(sp, idx) != 0) | |
156 | return false; | |
157 | ||
158 | /* | |
b52ce066 LJ |
159 | * The remainder of this function is the validation step. |
160 | * The following smp_mb() D pairs with the smp_mb() C in | |
161 | * __srcu_read_unlock(). If the __srcu_read_unlock() was seen | |
162 | * by srcu_readers_active_idx() above, then any destructive | |
163 | * operation performed after the grace period will happen after | |
164 | * the corresponding SRCU read-side critical section. | |
cef50120 | 165 | * |
b52ce066 LJ |
166 | * Note that there can be at most NR_CPUS worth of readers using |
167 | * the old index, which is not enough to overflow even a 32-bit | |
168 | * integer. (Yes, this does mean that systems having more than | |
169 | * a billion or so CPUs need to be 64-bit systems.) Therefore, | |
170 | * the sum of the ->seq[] counters cannot possibly overflow. | |
171 | * Therefore, the only way that the return values of the two | |
172 | * calls to srcu_readers_seq_idx() can be equal is if there were | |
173 | * no increments of the corresponding rank of ->seq[] counts | |
174 | * in the interim. But the missed-increment scenario laid out | |
175 | * above includes an increment of the ->seq[] counter by | |
176 | * the corresponding __srcu_read_lock(). Therefore, if this | |
177 | * scenario occurs, the return values from the two calls to | |
178 | * srcu_readers_seq_idx() will differ, and thus the validation | |
179 | * step below suffices. | |
cef50120 | 180 | */ |
b52ce066 LJ |
181 | smp_mb(); /* D */ |
182 | ||
183 | return srcu_readers_seq_idx(sp, idx) == seq; | |
621934ee PM |
184 | } |
185 | ||
186 | /** | |
187 | * srcu_readers_active - returns approximate number of readers. | |
188 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). | |
189 | * | |
190 | * Note that this is not an atomic primitive, and can therefore suffer | |
191 | * severe errors when invoked on an active srcu_struct. That said, it | |
192 | * can be useful as an error check at cleanup time. | |
193 | */ | |
bb695170 | 194 | static int srcu_readers_active(struct srcu_struct *sp) |
621934ee | 195 | { |
dc879175 LJ |
196 | int cpu; |
197 | unsigned long sum = 0; | |
198 | ||
199 | for_each_possible_cpu(cpu) { | |
200 | sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); | |
201 | sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); | |
202 | } | |
203 | return sum; | |
621934ee PM |
204 | } |
205 | ||
206 | /** | |
207 | * cleanup_srcu_struct - deconstruct a sleep-RCU structure | |
208 | * @sp: structure to clean up. | |
209 | * | |
210 | * Must invoke this after you are finished using a given srcu_struct that | |
211 | * was initialized via init_srcu_struct(), else you leak memory. | |
212 | */ | |
213 | void cleanup_srcu_struct(struct srcu_struct *sp) | |
214 | { | |
215 | int sum; | |
216 | ||
217 | sum = srcu_readers_active(sp); | |
218 | WARN_ON(sum); /* Leakage unless caller handles error. */ | |
219 | if (sum != 0) | |
220 | return; | |
221 | free_percpu(sp->per_cpu_ref); | |
222 | sp->per_cpu_ref = NULL; | |
223 | } | |
0cd397d3 | 224 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); |
621934ee | 225 | |
632ee200 | 226 | /* |
621934ee PM |
227 | * Counts the new reader in the appropriate per-CPU element of the |
228 | * srcu_struct. Must be called from process context. | |
229 | * Returns an index that must be passed to the matching srcu_read_unlock(). | |
230 | */ | |
632ee200 | 231 | int __srcu_read_lock(struct srcu_struct *sp) |
621934ee PM |
232 | { |
233 | int idx; | |
234 | ||
235 | preempt_disable(); | |
cef50120 PM |
236 | idx = rcu_dereference_index_check(sp->completed, |
237 | rcu_read_lock_sched_held()) & 0x1; | |
b52ce066 | 238 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; |
cef50120 | 239 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ |
b52ce066 | 240 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; |
621934ee PM |
241 | preempt_enable(); |
242 | return idx; | |
243 | } | |
632ee200 | 244 | EXPORT_SYMBOL_GPL(__srcu_read_lock); |
621934ee | 245 | |
632ee200 | 246 | /* |
621934ee PM |
247 | * Removes the count for the old reader from the appropriate per-CPU |
248 | * element of the srcu_struct. Note that this may well be a different | |
249 | * CPU than that which was incremented by the corresponding srcu_read_lock(). | |
250 | * Must be called from process context. | |
251 | */ | |
632ee200 | 252 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) |
621934ee PM |
253 | { |
254 | preempt_disable(); | |
cef50120 | 255 | smp_mb(); /* C */ /* Avoid leaking the critical section. */ |
440253c1 | 256 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; |
621934ee PM |
257 | preempt_enable(); |
258 | } | |
632ee200 | 259 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); |
621934ee | 260 | |
c072a388 PM |
/*
 * We use an adaptive strategy for synchronize_srcu() and especially for
 * synchronize_srcu_expedited().  We spin for a fixed number of
 * SYNCHRONIZE_SRCU_READER_DELAY-microsecond periods (bounded by the
 * trycount values below) to allow SRCU readers to exit their read-side
 * critical sections.  If there are still readers after that, we
 * repeatedly block for 1-millisecond time periods.  This approach has
 * done well in testing, so there is no need for a config parameter.
 */
#define SYNCHRONIZE_SRCU_READER_DELAY 5
#define SYNCHRONIZE_SRCU_TRYCOUNT 2
#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12
cef50120 | 272 | |
18108ebf LJ |
273 | /* |
274 | * Wait until all pre-existing readers complete. Such readers | |
275 | * will have used the index specified by "idx". | |
276 | */ | |
d9792edd | 277 | static void wait_idx(struct srcu_struct *sp, int idx, int trycount) |
cef50120 | 278 | { |
cef50120 PM |
279 | /* |
280 | * SRCU read-side critical sections are normally short, so wait | |
281 | * a small amount of time before possibly blocking. | |
282 | */ | |
283 | if (!srcu_readers_active_idx_check(sp, idx)) { | |
284 | udelay(SYNCHRONIZE_SRCU_READER_DELAY); | |
285 | while (!srcu_readers_active_idx_check(sp, idx)) { | |
d9792edd LJ |
286 | if (trycount > 0) { |
287 | trycount--; | |
cef50120 | 288 | udelay(SYNCHRONIZE_SRCU_READER_DELAY); |
d9792edd | 289 | } else |
cef50120 PM |
290 | schedule_timeout_interruptible(1); |
291 | } | |
292 | } | |
cef50120 | 293 | } |
c072a388 | 294 | |
18108ebf | 295 | static void srcu_flip(struct srcu_struct *sp) |
944ce9af | 296 | { |
18108ebf | 297 | sp->completed++; |
944ce9af LJ |
298 | } |
299 | ||
0cd397d3 PM |
300 | /* |
301 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | |
621934ee | 302 | */ |
d9792edd | 303 | static void __synchronize_srcu(struct srcu_struct *sp, int trycount) |
621934ee | 304 | { |
18108ebf LJ |
305 | int busy_idx; |
306 | ||
fe15d706 PM |
307 | rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && |
308 | !lock_is_held(&rcu_bh_lock_map) && | |
309 | !lock_is_held(&rcu_lock_map) && | |
310 | !lock_is_held(&rcu_sched_lock_map), | |
311 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | |
312 | ||
621934ee | 313 | mutex_lock(&sp->mutex); |
18108ebf | 314 | busy_idx = sp->completed & 0X1UL; |
621934ee | 315 | |
621934ee | 316 | /* |
18108ebf LJ |
317 | * If we recently flipped the index, there will be some readers |
318 | * using idx=0 and others using idx=1. Therefore, two calls to | |
319 | * wait_idx()s suffice to ensure that all pre-existing readers | |
320 | * have completed: | |
321 | * | |
322 | * __synchronize_srcu() { | |
d9792edd LJ |
323 | * wait_idx(sp, 0, trycount); |
324 | * wait_idx(sp, 1, trycount); | |
18108ebf LJ |
325 | * } |
326 | * | |
327 | * Starvation is prevented by the fact that we flip the index. | |
328 | * While we wait on one index to clear out, almost all new readers | |
329 | * will be using the other index. The number of new readers using the | |
330 | * index we are waiting on is sharply bounded by roughly the number | |
331 | * of CPUs. | |
332 | * | |
333 | * How can new readers possibly using the old pre-flip value of | |
334 | * the index? Consider the following sequence of events: | |
335 | * | |
944ce9af LJ |
336 | * Suppose that during the previous grace period, a reader |
337 | * picked up the old value of the index, but did not increment | |
338 | * its counter until after the previous instance of | |
339 | * __synchronize_srcu() did the counter summation and recheck. | |
340 | * That previous grace period was OK because the reader did | |
341 | * not start until after the grace period started, so the grace | |
342 | * period was not obligated to wait for that reader. | |
343 | * | |
18108ebf LJ |
344 | * However, this sequence of events is quite improbable, so |
345 | * this call to wait_idx(), which waits on really old readers | |
346 | * describe in this comment above, will almost never need to wait. | |
621934ee | 347 | */ |
d9792edd | 348 | wait_idx(sp, 1 - busy_idx, trycount); |
944ce9af | 349 | |
18108ebf LJ |
350 | /* Flip the index to avoid reader-induced starvation. */ |
351 | srcu_flip(sp); | |
352 | ||
353 | /* Wait for recent pre-existing readers. */ | |
d9792edd | 354 | wait_idx(sp, busy_idx, trycount); |
944ce9af | 355 | |
621934ee PM |
356 | mutex_unlock(&sp->mutex); |
357 | } | |
358 | ||
0cd397d3 PM |
359 | /** |
360 | * synchronize_srcu - wait for prior SRCU read-side critical-section completion | |
361 | * @sp: srcu_struct with which to synchronize. | |
362 | * | |
363 | * Flip the completed counter, and wait for the old count to drain to zero. | |
364 | * As with classic RCU, the updater must use some separate means of | |
365 | * synchronizing concurrent updates. Can block; must be called from | |
366 | * process context. | |
367 | * | |
368 | * Note that it is illegal to call synchronize_srcu() from the corresponding | |
369 | * SRCU read-side critical section; doing so will result in deadlock. | |
370 | * However, it is perfectly legal to call synchronize_srcu() on one | |
371 | * srcu_struct from some other srcu_struct's read-side critical section. | |
372 | */ | |
373 | void synchronize_srcu(struct srcu_struct *sp) | |
374 | { | |
d9792edd | 375 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); |
0cd397d3 PM |
376 | } |
377 | EXPORT_SYMBOL_GPL(synchronize_srcu); | |
378 | ||
379 | /** | |
236fefaf | 380 | * synchronize_srcu_expedited - Brute-force SRCU grace period |
0cd397d3 PM |
381 | * @sp: srcu_struct with which to synchronize. |
382 | * | |
cef50120 PM |
383 | * Wait for an SRCU grace period to elapse, but be more aggressive about |
384 | * spinning rather than blocking when waiting. | |
0cd397d3 | 385 | * |
236fefaf | 386 | * Note that it is illegal to call this function while holding any lock |
cef50120 | 387 | * that is acquired by a CPU-hotplug notifier. It is also illegal to call |
236fefaf PM |
388 | * synchronize_srcu_expedited() from the corresponding SRCU read-side |
389 | * critical section; doing so will result in deadlock. However, it is | |
390 | * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct | |
391 | * from some other srcu_struct's read-side critical section, as long as | |
392 | * the resulting graph of srcu_structs is acyclic. | |
0cd397d3 PM |
393 | */ |
394 | void synchronize_srcu_expedited(struct srcu_struct *sp) | |
395 | { | |
d9792edd | 396 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); |
0cd397d3 PM |
397 | } |
398 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); | |
399 | ||
621934ee PM |
400 | /** |
401 | * srcu_batches_completed - return batches completed. | |
402 | * @sp: srcu_struct on which to report batch completion. | |
403 | * | |
404 | * Report the number of batches, correlated with, but not necessarily | |
405 | * precisely the same as, the number of grace periods that have elapsed. | |
406 | */ | |
407 | ||
408 | long srcu_batches_completed(struct srcu_struct *sp) | |
409 | { | |
410 | return sp->completed; | |
411 | } | |
621934ee | 412 | EXPORT_SYMBOL_GPL(srcu_batches_completed); |