// SPDX-License-Identifier: ISC
/*
 * Copyright (c) 2017-19  David Lamparter, for NetDEF, Inc.
 */

/* implementation notes:  this is an epoch-based RCU implementation.  rcu_seq
 * (global variable) counts the current epoch.  Threads hold a specific epoch
 * in rcu_read_lock().  This is the oldest epoch a thread might be accessing
 * data from.
 *
 * The rcu_seq global is only pushed forward on rcu_read_lock() and
 * rcu_read_unlock() calls.  This makes things a tad more efficient since
 * those are the only places it matters:
 * - on rcu_read_lock, we don't want to hold an old epoch pointlessly
 * - on rcu_read_unlock, we want to make sure we're not stuck on an old epoch
 *   when heading into a long idle period where no thread holds RCU
 *
 * rcu_thread structures themselves are RCU-free'd.
 *
 * rcu_head structures are the most iffy;  normally for an ATOMLIST we would
 * need to make sure we use rcu_free or pthread_rwlock to deallocate old items
 * to prevent ABA or use-after-free problems.  However, our ATOMLIST code
 * guarantees that, as long as the list remains non-empty at all times, we
 * only need the "last" pointer to do an "add_tail()", i.e. we can't run into
 * ABA/UAF issues - but we do need to keep at least 1 item on the list.
 *
 * (Search the atomlist code for all uses of "last")
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <pthread.h>
#ifdef HAVE_PTHREAD_NP_H
#include <pthread_np.h>
#endif
#include <string.h>
#include <unistd.h>
#include <signal.h>

#include "frrcu.h"
#include "seqlock.h"
#include "atomlist.h"

DEFINE_MTYPE_STATIC(LIB, RCU_THREAD, "RCU thread");
DEFINE_MTYPE_STATIC(LIB, RCU_NEXT, "RCU sequence barrier");

DECLARE_ATOMLIST(rcu_heads, struct rcu_head, head);

PREDECL_ATOMLIST(rcu_threads);
struct rcu_thread {
	struct rcu_threads_item head;

	struct rcu_head rcu_head;

	struct seqlock rcu;

	/* only accessed by thread itself, not atomic */
	unsigned depth;
};
DECLARE_ATOMLIST(rcu_threads, struct rcu_thread, head);

static const struct rcu_action rcua_next = { .type = RCUA_NEXT };
static const struct rcu_action rcua_end = { .type = RCUA_END };
static const struct rcu_action rcua_close = { .type = RCUA_CLOSE };

struct rcu_next {
	struct rcu_head head_free;
	struct rcu_head head_next;
};

#define rcu_free_internal(mtype, ptr, field)                                   \
	do {                                                                   \
		typeof(ptr) _ptr = (ptr);                                      \
		struct rcu_head *_rcu_head = &_ptr->field;                     \
		static const struct rcu_action _rcu_action = {                 \
			.type = RCUA_FREE,                                     \
			.u.free = {                                            \
				.mt = mtype,                                   \
				.offset = offsetof(typeof(*_ptr), field),      \
			},                                                     \
		};                                                             \
		_rcu_head->action = &_rcu_action;                              \
		rcu_heads_add_tail(&rcu_heads, _rcu_head);                     \
	} while (0)
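
/* Note on the macro above: the rcu_action is "static const", so each
 * expansion site gets exactly one action instance embedding the MTYPE and
 * the compile-time offset of the rcu_head field; the queued rcu_head itself
 * only needs to carry a pointer to it.  rcu_do() later reverses the offset
 * to recover the enclosing object.
 */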

/* primary global RCU position */
static struct seqlock rcu_seq;
/* this is set to rcu_seq whenever something is added on the RCU queue.
 * rcu_read_lock() and rcu_read_unlock() will then bump rcu_seq up one step.
 */
static _Atomic seqlock_val_t rcu_dirty;

static struct rcu_threads_head rcu_threads;
static struct rcu_heads_head rcu_heads;

/* main thread & RCU sweeper have pre-setup rcu_thread structures.  The
 * reasons are different:
 *
 * - rcu_thread_main is there because the main thread isn't started like
 *   other threads, it's implicitly created when the program is started.  So
 *   rcu_thread_main matches up implicitly.
 *
 * - rcu_thread_rcu isn't actually put on the rcu_threads list (makes no
 *   sense really), it only exists so we can call RCU-using functions from
 *   the RCU thread without special handling in rcu_read_lock/unlock.
 */
static struct rcu_thread rcu_thread_main;
static struct rcu_thread rcu_thread_rcu;

static pthread_t rcu_pthread;
static pthread_key_t rcu_thread_key;
static bool rcu_active;

static void rcu_start(void);
static void rcu_bump(void);

/*
 * preinitialization for main thread
 */
static void rcu_thread_end(void *rcu_thread);

static void rcu_preinit(void) __attribute__((constructor));
static void rcu_preinit(void)
{
	struct rcu_thread *rt;

	rt = &rcu_thread_main;
	rt->depth = 1;
	seqlock_init(&rt->rcu);
	seqlock_acquire_val(&rt->rcu, SEQLOCK_STARTVAL);

	pthread_key_create(&rcu_thread_key, rcu_thread_end);
	pthread_setspecific(rcu_thread_key, rt);

	rcu_threads_add_tail(&rcu_threads, rt);

	/* RCU sweeper's rcu_thread is a dummy, NOT added to rcu_threads */
	rt = &rcu_thread_rcu;
	rt->depth = 1;

	seqlock_init(&rcu_seq);
	seqlock_acquire_val(&rcu_seq, SEQLOCK_STARTVAL);
}

static struct rcu_thread *rcu_self(void)
{
	return (struct rcu_thread *)pthread_getspecific(rcu_thread_key);
}

/*
 * thread management (for the non-main thread)
 */
struct rcu_thread *rcu_thread_prepare(void)
{
	struct rcu_thread *rt, *cur;

	rcu_assert_read_locked();

	if (!rcu_active)
		rcu_start();

	cur = rcu_self();
	assert(cur->depth);

	/* new thread always starts with rcu_read_lock held at depth 1, and
	 * holding the same epoch as the parent (this makes it possible to
	 * use RCU for things passed into the thread through its arg)
	 */
	rt = XCALLOC(MTYPE_RCU_THREAD, sizeof(*rt));
	rt->depth = 1;

	seqlock_init(&rt->rcu);
	seqlock_acquire(&rt->rcu, &cur->rcu);

	rcu_threads_add_tail(&rcu_threads, rt);

	return rt;
}
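
/* Spawn-side sketch (illustrative - "worker" and "arg" are hypothetical
 * caller names; this mirrors how a pthread wrapper is expected to drive
 * the API):
 *
 *	rcu_read_lock();
 *	rt = rcu_thread_prepare();	// child inherits our current epoch
 *	...pthread_create(..., worker, arg)...	// hand rt over inside arg
 *	rcu_read_unlock();
 *
 * and in the new thread, before any RCU-protected access:
 *
 *	rcu_thread_start(rt);	// bind rt to this pthread
 *	...consume RCU-protected startup data from arg...
 *	rcu_read_unlock();	// drop the depth-1 lock from prepare
 *
 * If the thread never starts, the parent can undo the registration with
 * rcu_thread_unprepare(rt).
 */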

void rcu_thread_start(struct rcu_thread *rt)
{
	pthread_setspecific(rcu_thread_key, rt);
}

void rcu_thread_unprepare(struct rcu_thread *rt)
{
	if (rt == &rcu_thread_rcu)
		return;

	rt->depth = 1;
	seqlock_acquire(&rt->rcu, &rcu_seq);

	rcu_bump();
	if (rt != &rcu_thread_main)
		/* this free() happens after seqlock_release() below */
		rcu_free_internal(MTYPE_RCU_THREAD, rt, rcu_head);

	rcu_threads_del(&rcu_threads, rt);
	seqlock_release(&rt->rcu);
}

static void rcu_thread_end(void *rtvoid)
{
	struct rcu_thread *rt = rtvoid;
	rcu_thread_unprepare(rt);
}

/*
 * main RCU control aspects
 */

static void rcu_bump(void)
{
	struct rcu_next *rn;

	rn = XMALLOC(MTYPE_RCU_NEXT, sizeof(*rn));

	/* note: each RCUA_NEXT item corresponds to exactly one seqno bump.
	 * This means we don't need to communicate which seqno is which
	 * RCUA_NEXT, since we really don't care.
	 */

	/*
	 * Important race condition: while rcu_heads_add_tail is executing,
	 * there is an intermediate point where the rcu_heads "last" pointer
	 * already points to rn->head_next, but rn->head_next isn't added to
	 * the list yet.  That means any other "add_tail" calls append to this
	 * item, which isn't fully on the list yet.  Freeze this thread at
	 * that point and look at another thread doing a rcu_bump.  It adds
	 * these two items and then does a seqlock_bump.  But the rcu_heads
	 * list is still "interrupted" and there's no RCUA_NEXT on the list
	 * yet (from either the frozen thread or the second thread).  So
	 * rcu_main() might actually hit the end of the list at the
	 * "interrupt".
	 *
	 * This situation is prevented by requiring that rcu_read_lock is held
	 * for any calls to rcu_bump, since if we're holding the current RCU
	 * epoch, that means rcu_main can't be chewing on rcu_heads and hit
	 * that interruption point.  Only once the thread has continued to
	 * rcu_read_unlock() - and has therefore completed the add_tail -
	 * does the RCU sweeper gobble up the epoch, and it can then be sure
	 * to find at least the RCUA_NEXT and RCUA_FREE items on rcu_heads.
	 */
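
	/* Timeline sketch of the race above (A, B = threads in rcu_bump;
	 * "rn_A"/"rn_B" name their respective rcu_next allocations):
	 *
	 *   A: add_tail: swaps "last" to &rn_A->head_next, is frozen before
	 *      writing the previous tail's next pointer
	 *   B: add_tail: appends &rn_B->head_next behind the still-unlinked
	 *      rn_A->head_next
	 *   B: seqlock_bump(&rcu_seq)
	 *   sweeper: could now pop rcu_heads and hit the gap before reaching
	 *      any RCUA_NEXT - prevented because A still holds its epoch.
	 */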
	rn->head_next.action = &rcua_next;
	rcu_heads_add_tail(&rcu_heads, &rn->head_next);

	/* free rn that we allocated above.
	 *
	 * This is INTENTIONALLY not built into the RCUA_NEXT action.  This
	 * ensures that after the action above is popped off the queue, there
	 * is still at least 1 item on the RCU queue.  This means we never
	 * delete the last item, which is extremely important since it keeps
	 * the atomlist ->last pointer alive and well.
	 *
	 * If we were to "run dry" on the RCU queue, add_tail may run into the
	 * "last item is being deleted - start over" case, and then we may end
	 * up accessing old RCU queue items that are already free'd.
	 */
	rcu_free_internal(MTYPE_RCU_NEXT, rn, head_free);

	/* Only allow the RCU sweeper to run after these 2 items are queued.
	 *
	 * If another thread enqueues some RCU action in the intermediate
	 * window here, nothing bad happens - the queued action is associated
	 * with a larger seq# than strictly necessary.  Thus, it might get
	 * executed a bit later, but that's not a problem.
	 *
	 * If another thread acquires the read lock in this window, it holds
	 * the previous epoch, but its RCU queue actions will be in the next
	 * epoch.  This isn't a problem either, just a tad inefficient.
	 */
	seqlock_bump(&rcu_seq);
}

static void rcu_bump_maybe(void)
{
	seqlock_val_t dirty;

	dirty = atomic_load_explicit(&rcu_dirty, memory_order_relaxed);
	/* no problem if we race here and multiple threads bump rcu_seq;
	 * bumping too much causes no issues while not bumping enough will
	 * result in delayed cleanup
	 */
	if (dirty == seqlock_cur(&rcu_seq))
		rcu_bump();
}
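
/* Example of the rcu_dirty handshake: rcu_enqueue() stores the seqno that
 * was current when an item was queued.  If no lock/unlock has bumped
 * rcu_seq since then (dirty == seqlock_cur), the queued work is still
 * pinned to the open epoch, so we bump.  If rcu_seq has already moved past
 * it, a later RCUA_NEXT is already queued behind the item and the sweeper
 * will reach it without our help, so the bump is skipped.
 */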

void rcu_read_lock(void)
{
	struct rcu_thread *rt = rcu_self();

	assert(rt);
	if (rt->depth++ > 0)
		return;

	seqlock_acquire(&rt->rcu, &rcu_seq);
	/* need to hold RCU for bump ... */
	rcu_bump_maybe();
	/* ... but no point in holding the old epoch if we just bumped */
	seqlock_acquire(&rt->rcu, &rcu_seq);
}

void rcu_read_unlock(void)
{
	struct rcu_thread *rt = rcu_self();

	assert(rt && rt->depth);
	if (--rt->depth > 0)
		return;
	rcu_bump_maybe();
	seqlock_release(&rt->rcu);
}

void rcu_assert_read_locked(void)
{
	struct rcu_thread *rt = rcu_self();
	assert(rt && rt->depth && seqlock_held(&rt->rcu));
}

void rcu_assert_read_unlocked(void)
{
	struct rcu_thread *rt = rcu_self();
	assert(rt && !rt->depth && !seqlock_held(&rt->rcu));
}

/*
 * RCU resource-release thread
 */

static void *rcu_main(void *arg);

static void rcu_start(void)
{
	/* ensure we never handle signals on the RCU thread by blocking
	 * everything here (new thread inherits signal mask)
	 */
	sigset_t oldsigs, blocksigs;

	sigfillset(&blocksigs);
	pthread_sigmask(SIG_BLOCK, &blocksigs, &oldsigs);

	rcu_active = true;

	assert(!pthread_create(&rcu_pthread, NULL, rcu_main, NULL));

	pthread_sigmask(SIG_SETMASK, &oldsigs, NULL);

#ifdef HAVE_PTHREAD_SETNAME_NP
# ifdef GNU_LINUX
	pthread_setname_np(rcu_pthread, "RCU sweeper");
# elif defined(__NetBSD__)
	pthread_setname_np(rcu_pthread, "RCU sweeper", NULL);
# endif
#elif defined(HAVE_PTHREAD_SET_NAME_NP)
	pthread_set_name_np(rcu_pthread, "RCU sweeper");
#endif
}

static void rcu_do(struct rcu_head *rh)
{
	struct rcu_head_close *rhc;
	void *p;

	switch (rh->action->type) {
	case RCUA_FREE:
		p = (char *)rh - rh->action->u.free.offset;
		if (rh->action->u.free.mt)
			qfree(rh->action->u.free.mt, p);
		else
			free(p);
		break;
	case RCUA_CLOSE:
		rhc = container_of(rh, struct rcu_head_close,
				   rcu_head);
		close(rhc->fd);
		break;
	case RCUA_CALL:
		p = (char *)rh - rh->action->u.call.offset;
		rh->action->u.call.fptr(p);
		break;

	case RCUA_INVALID:
	case RCUA_NEXT:
	case RCUA_END:
	default:
		assert(0);
	}
}
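
/* Note: RCUA_FREE and RCUA_CALL recover the enclosing object by stepping
 * back "offset" bytes from the embedded rcu_head - container_of-style
 * arithmetic with the offset captured in the action at the call site
 * (rcu_free_internal() above, and presumably the rcu_free()/rcu_call()
 * wrappers in frrcu.h).  RCUA_CLOSE can use container_of directly since
 * struct rcu_head_close has a fixed field name.
 */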

static void rcu_watchdog(struct rcu_thread *rt)
{
#if 0
	/* future work: print a backtrace for the thread that's holding up
	 * RCU.  The only (good) way of doing that is to send a signal to the
	 * other thread, save away the backtrace in the signal handler, and
	 * block here until the signal is done processing.
	 *
	 * Just haven't implemented that yet.
	 */
	fprintf(stderr, "RCU watchdog %p\n", rt);
#endif
}

static void *rcu_main(void *arg)
{
	struct rcu_thread *rt;
	struct rcu_head *rh = NULL;
	bool end = false;
	struct timespec maxwait;

	seqlock_val_t rcuval = SEQLOCK_STARTVAL;

	pthread_setspecific(rcu_thread_key, &rcu_thread_rcu);

	while (!end) {
		seqlock_wait(&rcu_seq, rcuval);

		/* RCU watchdog timeout, TODO: configurable value */
		clock_gettime(CLOCK_MONOTONIC, &maxwait);
		maxwait.tv_nsec += 100 * 1000 * 1000;
		if (maxwait.tv_nsec >= 1000000000) {
			maxwait.tv_sec++;
			maxwait.tv_nsec -= 1000000000;
		}

		frr_each (rcu_threads, &rcu_threads, rt)
			if (!seqlock_timedwait(&rt->rcu, rcuval, &maxwait)) {
				rcu_watchdog(rt);
				seqlock_wait(&rt->rcu, rcuval);
			}

		while ((rh = rcu_heads_pop(&rcu_heads))) {
			if (rh->action->type == RCUA_NEXT)
				break;
			else if (rh->action->type == RCUA_END)
				end = true;
			else
				rcu_do(rh);
		}

		rcuval += SEQLOCK_INCR;
	}

	/* rcu_shutdown can only be called singlethreaded, and it does a
	 * pthread_join, so it should be impossible that anything ended up
	 * on the queue after RCUA_END
	 */
#if 1
	assert(!rcu_heads_first(&rcu_heads));
#else
	while ((rh = rcu_heads_pop(&rcu_heads)))
		if (rh->action->type >= RCUA_FREE)
			rcu_do(rh);
#endif
	return NULL;
}

void rcu_shutdown(void)
{
	static struct rcu_head rcu_head_end;
	struct rcu_thread *rt = rcu_self();
	void *retval;

	if (!rcu_active)
		return;

	rcu_assert_read_locked();
	assert(rcu_threads_count(&rcu_threads) == 1);

	rcu_enqueue(&rcu_head_end, &rcua_end);

	rt->depth = 0;
	seqlock_release(&rt->rcu);
	seqlock_release(&rcu_seq);
	rcu_active = false;

	/* rcu_active is cleared before the pthread_join in case we hang in
	 * pthread_join & get a SIGTERM or something - in that case, just
	 * ignore the maybe-still-running RCU thread
	 */
	if (pthread_join(rcu_pthread, &retval) == 0) {
		seqlock_acquire_val(&rcu_seq, SEQLOCK_STARTVAL);
		seqlock_acquire_val(&rt->rcu, SEQLOCK_STARTVAL);
		rt->depth = 1;
	}
}

/*
 * RCU'd free functions
 */

void rcu_enqueue(struct rcu_head *rh, const struct rcu_action *action)
{
	/* refer to rcu_bump() for why we need to hold RCU when adding items
	 * to rcu_heads
	 */
	rcu_assert_read_locked();

	rh->action = action;

	if (!rcu_active) {
		rcu_do(rh);
		return;
	}
	rcu_heads_add_tail(&rcu_heads, rh);
	atomic_store_explicit(&rcu_dirty, seqlock_cur(&rcu_seq),
			      memory_order_relaxed);
}

void rcu_close(struct rcu_head_close *rhc, int fd)
{
	rhc->fd = fd;
	rcu_enqueue(&rhc->rcu_head, &rcua_close);
}
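
/* Usage sketch for rcu_close() (illustrative - "struct conn", its fields,
 * and MTYPE_CONN are hypothetical).  Must run under rcu_read_lock(), like
 * any rcu_enqueue() path:
 *
 *	// conn already unlinked from the shared lookup structure
 *	rcu_close(&conn->rcu_close, conn->fd);	// close fd after readers
 *	rcu_free(MTYPE_CONN, conn, rcu_head);	// then free the container
 *
 * Readers that found conn in the current epoch keep a usable fd; the
 * sweeper runs both actions once that epoch has fully drained.
 */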