// SPDX-License-Identifier: BSD-3-Clause
/* Copyright 2018-2019, Intel Corporation */

/*
 * critnib.c -- implementation of critnib tree
 *
 * It offers identity lookup (like a hashmap) and <= lookup (like a search
 * tree). Unlike some hashing algorithms (cuckoo hash, perfect hashing) the
 * complexity isn't constant, but for data sizes we expect it's several
 * times as fast as cuckoo, and has no "stop the world" cases that would
 * cause latency (ie, better worst case behaviour).
 */
13 | ||
/*
 * STRUCTURE DESCRIPTION
 *
 * Critnib is a hybrid between a radix tree and DJ Bernstein's critbit:
 * it skips nodes for uninteresting radix nodes (ie, ones that would have
 * exactly one child), this requires adding to every node a field that
 * describes the slice (4-bit in our case) that this radix level is for.
 *
 * This implementation also stores each node's path (ie, bits that are
 * common to every key in that subtree) -- this doesn't help with lookups
 * at all (unused in == match, could be reconstructed at no cost in <=
 * after first dive) but simplifies inserts and removes. If we ever want
 * that piece of memory it's easy to trim it down.
 */
28 | ||
/*
 * CONCURRENCY ISSUES
 *
 * Reads are completely lock-free sync-free, but only almost wait-free:
 * if for some reason a read thread gets pathologically stalled, it will
 * notice the data being stale and restart the work. In usual cases,
 * the structure having been modified does _not_ cause a restart.
 *
 * Writes could be easily made lock-free as well (with only a cmpxchg
 * sync), but this leads to problems with removes. A possible solution
 * would be doing removes by overwriting by NULL w/o freeing -- yet this
 * would lead to the structure growing without bounds. Complex per-node
 * locks would increase concurrency but they slow down individual writes
 * enough that in practice a simple global write lock works faster.
 *
 * Removes are the only operation that can break reads. The structure
 * can do local RCU well -- the problem being knowing when it's safe to
 * free. Any synchronization with reads would kill their speed, thus
 * instead we have a remove count. The grace period is DELETED_LIFE,
 * after which any read will notice staleness and restart its work.
 */
#include <errno.h>
#include <stdbool.h>

#include "alloc.h"
#include "critnib.h"
#include "out.h"
#include "sys_util.h"
#include "valgrind_internal.h"
58 | ||
/*
 * A node that has been deleted is left untouched for this many delete
 * cycles. Reads have guaranteed correctness if they took no longer than
 * DELETED_LIFE concurrent deletes, otherwise they notice something is
 * wrong and restart. The memory of deleted nodes is never freed to
 * malloc nor their pointers lead anywhere wrong, thus a stale read will
 * (temporarily) get a wrong answer but won't crash.
 *
 * There's no need to count writes as they never interfere with reads.
 *
 * Allowing stale reads (of arbitrarily old writes or of deletes less than
 * DELETED_LIFE old) might sound counterintuitive, but it doesn't affect
 * semantics in any way: the thread could have been stalled just after
 * returning from our code. Thus, the guarantee is: the result of get() or
 * find_le() is a value that was current at any point between the call
 * start and end.
 */
#define DELETED_LIFE 16

#define SLICE 4			/* bits of the key consumed per tree level */
#define NIB ((1ULL << SLICE) - 1)	/* mask covering one slice */
#define SLNODES (1 << SLICE)	/* number of children per internal node */

/* shift amount type: holds bit offsets into a 64-bit key (0..63) */
typedef unsigned char sh_t;
83 | ||
struct critnib_node {
	/*
	 * path is the part of a tree that's already traversed (be it through
	 * explicit nodes or collapsed links) -- ie, any subtree below has all
	 * those bits set to this value.
	 *
	 * nib is a 4-bit slice that's an index into the node's children.
	 *
	 * shift is the length (in bits) of the part of the key below this node.
	 *
	 *            nib
	 * |XXXXXXXXXX|?|*****|
	 *    path    ^
	 *            +-----+
	 *             shift
	 */
	struct critnib_node *child[SLNODES];
	uint64_t path;
	sh_t shift;
};
104 | ||
/* a single key:value pair; stored under a tagged (low-bit-set) pointer */
struct critnib_leaf {
	uint64_t key;
	void *value;	/* user value; reused as the "next" link when pooled */
};
109 | ||
struct critnib {
	struct critnib_node *root;

	/* pool of freed nodes: singly linked list, next at child[0] */
	struct critnib_node *deleted_node;
	/* pool of freed leaves: singly linked list, next at value */
	struct critnib_leaf *deleted_leaf;

	/* nodes removed but not yet eligible for reuse */
	struct critnib_node *pending_del_nodes[DELETED_LIFE];
	struct critnib_leaf *pending_del_leaves[DELETED_LIFE];

	/* bumped on every remove; readers snapshot it to detect staleness */
	uint64_t remove_count;

	os_mutex_t mutex; /* writes/removes */
};
125 | ||
/*
 * internal: load -- atomic 64-bit load with acquire ordering
 *
 * Reads the 64-bit word at src into *dst. Used by lock-free readers for
 * child/root pointers and for the remove counter, pairing with store()'s
 * release to observe fully-initialized nodes.
 */
static void
load(void *src, void *dst)
{
	util_atomic_load_explicit64((uint64_t *)src, (uint64_t *)dst,
		memory_order_acquire);
}
135 | ||
/*
 * internal: store -- atomic 64-bit store with release ordering
 *
 * Note the asymmetry with load(): this stores the pointer VALUE src
 * itself (not *src) into *dst -- callers pass the new pointer directly,
 * eg store(&c->root, kn). The release ordering publishes all prior
 * initialization of the pointed-to node before the pointer becomes
 * visible to readers.
 */
static void
store(void *dst, void *src)
{
	util_atomic_store_explicit64((uint64_t *)dst, (uint64_t)src,
		memory_order_release);
}
145 | ||
/*
 * internal: is_leaf -- check tagged pointer for leafness
 *
 * Leaves are marked by setting the lowest pointer bit; node allocations
 * are aligned, so that bit is always clear for internal nodes.
 */
static inline bool
is_leaf(struct critnib_node *n)
{
	return ((uint64_t)n & 1ULL) != 0;
}
154 | ||
/*
 * internal: to_leaf -- untag a leaf pointer
 *
 * Clears the low tag bit set by insert, recovering the real address of
 * the struct critnib_leaf.
 */
static inline struct critnib_leaf *
to_leaf(struct critnib_node *n)
{
	uint64_t untagged = (uint64_t)n & ~1ULL;

	return (void *)untagged;
}
163 | ||
164 | /* | |
165 | * internal: path_mask -- return bit mask of a path above a subtree [shift] | |
166 | * bits tall | |
167 | */ | |
168 | static inline uint64_t | |
169 | path_mask(sh_t shift) | |
170 | { | |
171 | return ~NIB << shift; | |
172 | } | |
173 | ||
174 | /* | |
175 | * internal: slice_index -- return index of child at the given nib | |
176 | */ | |
177 | static inline unsigned | |
178 | slice_index(uint64_t key, sh_t shift) | |
179 | { | |
180 | return (unsigned)((key >> shift) & NIB); | |
181 | } | |
182 | ||
/*
 * critnib_new -- allocates a new critnib structure
 *
 * Returns NULL on allocation failure. The tree starts empty (Zalloc
 * zeroes root, the pools and the pending arrays); destroy it with
 * critnib_delete().
 */
struct critnib *
critnib_new(void)
{
	struct critnib *c = Zalloc(sizeof(struct critnib));
	if (!c)
		return NULL;

	util_mutex_init(&c->mutex);

	/*
	 * root and remove_count are intentionally read without locks;
	 * silence Helgrind/DRD race reports on those fields.
	 */
	VALGRIND_HG_DRD_DISABLE_CHECKING(&c->root, sizeof(c->root));
	VALGRIND_HG_DRD_DISABLE_CHECKING(&c->remove_count,
		sizeof(c->remove_count));

	return c;
}
201 | ||
202 | /* | |
203 | * internal: delete_node -- recursively free (to malloc) a subtree | |
204 | */ | |
205 | static void | |
206 | delete_node(struct critnib_node *__restrict n) | |
207 | { | |
208 | if (!is_leaf(n)) { | |
209 | for (int i = 0; i < SLNODES; i++) { | |
210 | if (n->child[i]) | |
211 | delete_node(n->child[i]); | |
212 | } | |
213 | ||
214 | Free(n); | |
215 | } else { | |
216 | Free(to_leaf(n)); | |
217 | } | |
218 | } | |
219 | ||
220 | /* | |
221 | * critnib_delete -- destroy and free a critnib struct | |
222 | */ | |
223 | void | |
224 | critnib_delete(struct critnib *c) | |
225 | { | |
226 | if (c->root) | |
227 | delete_node(c->root); | |
228 | ||
229 | util_mutex_destroy(&c->mutex); | |
230 | ||
231 | for (struct critnib_node *m = c->deleted_node; m; ) { | |
232 | struct critnib_node *mm = m->child[0]; | |
233 | Free(m); | |
234 | m = mm; | |
235 | } | |
236 | ||
237 | for (struct critnib_leaf *k = c->deleted_leaf; k; ) { | |
238 | struct critnib_leaf *kk = k->value; | |
239 | Free(k); | |
240 | k = kk; | |
241 | } | |
242 | ||
243 | for (int i = 0; i < DELETED_LIFE; i++) { | |
244 | Free(c->pending_del_nodes[i]); | |
245 | Free(c->pending_del_leaves[i]); | |
246 | } | |
247 | ||
248 | Free(c); | |
249 | } | |
250 | ||
251 | /* | |
252 | * internal: free_node -- free (to internal pool, not malloc) a node. | |
253 | * | |
254 | * We cannot free them to malloc as a stalled reader thread may still walk | |
255 | * through such nodes; it will notice the result being bogus but only after | |
256 | * completing the walk, thus we need to ensure any freed nodes still point | |
257 | * to within the critnib structure. | |
258 | */ | |
259 | static void | |
260 | free_node(struct critnib *__restrict c, struct critnib_node *__restrict n) | |
261 | { | |
262 | if (!n) | |
263 | return; | |
264 | ||
265 | ASSERT(!is_leaf(n)); | |
266 | n->child[0] = c->deleted_node; | |
267 | c->deleted_node = n; | |
268 | } | |
269 | ||
270 | /* | |
271 | * internal: alloc_node -- allocate a node from our pool or from malloc | |
272 | */ | |
273 | static struct critnib_node * | |
274 | alloc_node(struct critnib *__restrict c) | |
275 | { | |
276 | if (!c->deleted_node) { | |
277 | struct critnib_node *n = Malloc(sizeof(struct critnib_node)); | |
278 | if (n == NULL) | |
279 | ERR("!Malloc"); | |
280 | ||
281 | return n; | |
282 | } | |
283 | ||
284 | struct critnib_node *n = c->deleted_node; | |
285 | ||
286 | c->deleted_node = n->child[0]; | |
287 | VALGRIND_ANNOTATE_NEW_MEMORY(n, sizeof(*n)); | |
288 | ||
289 | return n; | |
290 | } | |
291 | ||
292 | /* | |
293 | * internal: free_leaf -- free (to internal pool, not malloc) a leaf. | |
294 | * | |
295 | * See free_node(). | |
296 | */ | |
297 | static void | |
298 | free_leaf(struct critnib *__restrict c, struct critnib_leaf *__restrict k) | |
299 | { | |
300 | if (!k) | |
301 | return; | |
302 | ||
303 | k->value = c->deleted_leaf; | |
304 | c->deleted_leaf = k; | |
305 | } | |
306 | ||
307 | /* | |
308 | * internal: alloc_leaf -- allocate a leaf from our pool or from malloc | |
309 | */ | |
310 | static struct critnib_leaf * | |
311 | alloc_leaf(struct critnib *__restrict c) | |
312 | { | |
313 | if (!c->deleted_leaf) { | |
314 | struct critnib_leaf *k = Malloc(sizeof(struct critnib_leaf)); | |
315 | if (k == NULL) | |
316 | ERR("!Malloc"); | |
317 | ||
318 | return k; | |
319 | } | |
320 | ||
321 | struct critnib_leaf *k = c->deleted_leaf; | |
322 | ||
323 | c->deleted_leaf = k->value; | |
324 | VALGRIND_ANNOTATE_NEW_MEMORY(k, sizeof(*k)); | |
325 | ||
326 | return k; | |
327 | } | |
328 | ||
/*
 * critnib_insert -- write a key:value pair to the critnib structure
 *
 * Returns:
 *  - 0 on success
 *  - EEXIST if such a key already exists
 *  - ENOMEM if we're out of memory
 *
 * Takes a global write lock but doesn't stall any readers: every pointer
 * that becomes reachable is published with a release store() only after
 * the structure behind it is fully initialized.
 */
int
critnib_insert(struct critnib *c, uint64_t key, void *value)
{
	util_mutex_lock(&c->mutex);

	struct critnib_leaf *k = alloc_leaf(c);
	if (!k) {
		util_mutex_unlock(&c->mutex);

		return ENOMEM;
	}

	VALGRIND_HG_DRD_DISABLE_CHECKING(k, sizeof(struct critnib_leaf));

	k->key = key;
	k->value = value;

	/* tag the pointer's low bit to mark it as a leaf */
	struct critnib_node *kn = (void *)((uint64_t)k | 1);

	struct critnib_node *n = c->root;
	if (!n) {
		/* empty tree: the new leaf becomes the root */
		c->root = kn;

		util_mutex_unlock(&c->mutex);

		return 0;
	}

	/* descend while the key still matches each node's path prefix */
	struct critnib_node **parent = &c->root;
	struct critnib_node *prev = c->root;

	while (n && !is_leaf(n) && (key & path_mask(n->shift)) == n->path) {
		prev = n;
		parent = &n->child[slice_index(key, n->shift)];
		n = *parent;
	}

	if (!n) {
		/* the matching child slot is empty: just plug the leaf in */
		n = prev;
		store(&n->child[slice_index(key, n->shift)], kn);

		util_mutex_unlock(&c->mutex);

		return 0;
	}

	uint64_t path = is_leaf(n) ? to_leaf(n)->key : n->path;
	/* Find where the path differs from our key. */
	uint64_t at = path ^ key;
	if (!at) {
		ASSERT(is_leaf(n));
		free_leaf(c, to_leaf(kn));
		/* fail instead of replacing */

		util_mutex_unlock(&c->mutex);

		return EEXIST;
	}

	/* and convert the highest differing bit to a nib-aligned shift */
	sh_t sh = util_mssb_index64(at) & (sh_t)~(SLICE - 1);

	/* make a new node to split the tree at the point of divergence */
	struct critnib_node *m = alloc_node(c);
	if (!m) {
		free_leaf(c, to_leaf(kn));

		util_mutex_unlock(&c->mutex);

		return ENOMEM;
	}
	VALGRIND_HG_DRD_DISABLE_CHECKING(m, sizeof(struct critnib_node));

	for (int i = 0; i < SLNODES; i++)
		m->child[i] = NULL;

	/* new leaf and the old subtree land in different slots by choice of sh */
	m->child[slice_index(key, sh)] = kn;
	m->child[slice_index(path, sh)] = n;
	m->shift = sh;
	m->path = key & path_mask(sh);
	/* publish the fully-initialized node with a release store */
	store(parent, m);

	util_mutex_unlock(&c->mutex);

	return 0;
}
424 | ||
/*
 * critnib_remove -- delete a key from the critnib structure, return its value
 *
 * Returns NULL if the key was not present. Removed nodes/leaves are
 * parked in the pending_del_* arrays for DELETED_LIFE further removes
 * before being recycled, so stalled readers can finish walking them.
 */
void *
critnib_remove(struct critnib *c, uint64_t key)
{
	struct critnib_leaf *k;
	void *value = NULL;

	util_mutex_lock(&c->mutex);

	struct critnib_node *n = c->root;
	if (!n)
		goto not_found;

	/*
	 * Advance the remove counter and recycle whatever aged out of the
	 * grace period in this slot of the circular pending arrays.
	 */
	uint64_t del = util_fetch_and_add64(&c->remove_count, 1) % DELETED_LIFE;
	free_node(c, c->pending_del_nodes[del]);
	free_leaf(c, c->pending_del_leaves[del]);
	c->pending_del_nodes[del] = NULL;
	c->pending_del_leaves[del] = NULL;

	if (is_leaf(n)) {
		/* single-leaf tree */
		k = to_leaf(n);
		if (k->key == key) {
			store(&c->root, NULL);
			goto del_leaf;
		}

		goto not_found;
	}
	/*
	 * n and k are a parent:child pair (after the first iteration); k is the
	 * leaf that holds the key we're deleting.
	 */
	struct critnib_node **k_parent = &c->root;
	struct critnib_node **n_parent = &c->root;
	struct critnib_node *kn = n;

	while (!is_leaf(kn)) {
		n_parent = k_parent;
		n = kn;
		k_parent = &kn->child[slice_index(key, kn->shift)];
		kn = *k_parent;

		if (!kn)
			goto not_found;
	}

	k = to_leaf(kn);
	if (k->key != key)
		goto not_found;

	store(&n->child[slice_index(key, n->shift)], NULL);

	/* Remove the node if there's only one remaining child. */
	int ochild = -1;
	for (int i = 0; i < SLNODES; i++) {
		if (n->child[i]) {
			if (ochild != -1)
				goto del_leaf;

			ochild = i;
		}
	}

	/* internal nodes always have >= 2 children, so one must remain */
	ASSERTne(ochild, -1);

	/* collapse: splice the lone child into the grandparent's slot */
	store(n_parent, n->child[ochild]);
	c->pending_del_nodes[del] = n;

del_leaf:
	value = k->value;
	c->pending_del_leaves[del] = k;

not_found:
	util_mutex_unlock(&c->mutex);
	return value;
}
503 | ||
/*
 * critnib_get -- query for a key ("==" match), returns value or NULL
 *
 * Doesn't need a lock but if many deletes happened while our thread was
 * somehow stalled the query is restarted (as freed nodes remain unused only
 * for a grace period).
 *
 * Counterintuitively, it's pointless to return the most current answer,
 * we need only one that was valid at any point after the call started.
 */
void *
critnib_get(struct critnib *c, uint64_t key)
{
	uint64_t wrs1, wrs2;	/* remove_count snapshots around the walk */
	void *res;

	do {
		struct critnib_node *n;

		load(&c->remove_count, &wrs1);
		load(&c->root, &n);

		/*
		 * critbit algorithm: dive into the tree, looking at nothing but
		 * each node's critical bit^H^H^Hnibble. This means we risk
		 * going wrong way if our path is missing, but that's ok...
		 */
		while (n && !is_leaf(n))
			load(&n->child[slice_index(key, n->shift)], &n);

		/* ... as we check it at the end. */
		struct critnib_leaf *k = to_leaf(n);
		res = (n && k->key == key) ? k->value : NULL;
		load(&c->remove_count, &wrs2);
		/* too many deletes during our walk? redo it */
	} while (wrs1 + DELETED_LIFE <= wrs2);

	return res;
}
542 | ||
/*
 * internal: find_successor -- return the value of the rightmost leaf in a
 * subtree, or NULL if the subtree holds no children
 *
 * NOTE(review): children are read directly here, not via load(); this
 * presumably relies on the remove_count retry loop in critnib_find_le()
 * to catch stale walks -- confirm.
 */
static void *
find_successor(struct critnib_node *__restrict n)
{
	while (1) {
		int nib;
		/* NIB (15) doubles as the highest child index */
		for (nib = NIB; nib >= 0; nib--)
			if (n->child[nib])
				break;

		if (nib < 0)
			return NULL;

		/* descend into the rightmost non-null child */
		n = n->child[nib];
		if (is_leaf(n))
			return to_leaf(n)->value;
	}
}
563 | ||
/*
 * internal: find_le -- recursively search <= in a subtree
 *
 * Returns the value of the largest key <= the query key within this
 * subtree, or NULL if no such key exists here.
 */
static void *
find_le(struct critnib_node *__restrict n, uint64_t key)
{
	if (!n)
		return NULL;

	if (is_leaf(n)) {
		struct critnib_leaf *k = to_leaf(n);

		return (k->key <= key) ? k->value : NULL;
	}

	/*
	 * is our key outside the subtree we're in?
	 *
	 * If we're inside, all bits above the nib will be identical; note
	 * that shift points at the nib's lower rather than upper edge, so it
	 * needs to be masked away as well.
	 */
	if ((key ^ n->path) >> (n->shift) & ~NIB) {
		/*
		 * subtree is too far to the left?
		 * -> its rightmost value is good
		 */
		if (n->path < key)
			return find_successor(n);

		/*
		 * subtree is too far to the right?
		 * -> it has nothing of interest to us
		 */
		return NULL;
	}

	unsigned nib = slice_index(key, n->shift);
	/* recursive call: follow the path */
	{
		struct critnib_node *m;
		load(&n->child[nib], &m);
		void *value = find_le(m, key);
		if (value)
			return value;
	}

	/*
	 * nothing in that subtree? We strayed from the path at this point,
	 * thus need to search every subtree to our left in this node. No
	 * need to dive into any but the first non-null, though.
	 */
	for (; nib > 0; nib--) {
		struct critnib_node *m;
		load(&n->child[nib - 1], &m);
		if (m) {
			n = m;
			if (is_leaf(n))
				return to_leaf(n)->value;

			/* everything below this child is < key */
			return find_successor(n);
		}
	}

	return NULL;
}
630 | ||
/*
 * critnib_find_le -- query for a key ("<=" match), returns value or NULL
 *
 * Same guarantees as critnib_get(): lock-free, restarted if too many
 * deletes (>= DELETED_LIFE) completed while the walk was in flight.
 */
void *
critnib_find_le(struct critnib *c, uint64_t key)
{
	uint64_t wrs1, wrs2;	/* remove_count snapshots around the walk */
	void *res;

	do {
		load(&c->remove_count, &wrs1);
		struct critnib_node *n; /* avoid a subtle TOCTOU */
		load(&c->root, &n);
		res = n ? find_le(n, key) : NULL;
		load(&c->remove_count, &wrs2);
	} while (wrs1 + DELETED_LIFE <= wrs2);

	return res;
}