// SPDX-License-Identifier: BSD-3-Clause
/* Copyright 2018-2019, Intel Corporation */

/*
 * critnib.c -- implementation of critnib tree
 *
 * It offers identity lookup (like a hashmap) and <= lookup (like a search
 * tree). Unlike some hashing algorithms (cuckoo hash, perfect hashing) the
 * complexity isn't constant, but for data sizes we expect it's several
 * times as fast as cuckoo, and has no "stop the world" cases that would
 * cause latency (ie, better worst case behaviour).
 */

/*
 * STRUCTURE DESCRIPTION
 *
 * Critnib is a hybrid between a radix tree and DJ Bernstein's critbit:
 * it skips uninteresting radix nodes (ie, ones that would have exactly
 * one child); this requires adding to every node a field that describes
 * the slice (4-bit in our case) that this radix level is for.
 *
 * This implementation also stores each node's path (ie, bits that are
 * common to every key in that subtree) -- this doesn't help with lookups
 * at all (unused in == match, could be reconstructed at no cost in <=
 * after the first dive) but simplifies inserts and removes. If we ever
 * want that piece of memory back, it's easy to trim it down.
 */

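/*
 * A worked example (illustrative only, not from the original sources):
 * with SLICE = 4, a 64-bit key is viewed as 16 nibbles.  Storing the two
 * keys 0x1230 and 0x1270 needs just one internal node, because the first
 * differing nibble sits at bits 4..7:
 *
 *             [path = 0x1200, shift = 4]
 *              /                      \
 *        child[3]                  child[7]
 *       leaf 0x1230               leaf 0x1270
 *
 * Every radix level above bit 7 is collapsed into the node's path.
 */
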
/*
 * CONCURRENCY ISSUES
 *
 * Reads are completely lock-free and sync-free, but only almost wait-free:
 * if for some reason a read thread gets pathologically stalled, it will
 * notice the data being stale and restart the work. In the usual case,
 * the structure having been modified does _not_ cause a restart.
 *
 * Writes could be easily made lock-free as well (with only a cmpxchg
 * sync), but this leads to problems with removes. A possible solution
 * would be to do removes by overwriting with NULL without freeing -- yet
 * this would lead to the structure growing without bounds. Complex
 * per-node locks would increase concurrency, but they slow down individual
 * writes enough that in practice a simple global write lock works faster.
 *
 * Removes are the only operation that can break reads. The structure
 * lends itself to local RCU well -- the problem is knowing when it's
 * safe to free. Any synchronization with reads would kill their speed,
 * thus instead we keep a remove count. The grace period is DELETED_LIFE,
 * after which any read will notice staleness and restart its work.
 */

#include <errno.h>
#include <stdbool.h>

#include "alloc.h"
#include "critnib.h"
#include "out.h"
#include "sys_util.h"
#include "valgrind_internal.h"

/*
 * A node that has been deleted is left untouched for this many delete
 * cycles. Reads have guaranteed correctness if they take no longer than
 * DELETED_LIFE concurrent deletes; otherwise they notice something is
 * wrong and restart. The memory of deleted nodes is never returned to
 * malloc, nor do their pointers lead anywhere wrong, thus a stale read
 * will (temporarily) get a wrong answer but won't crash.
 *
 * There's no need to count writes as they never interfere with reads.
 *
 * Allowing stale reads (of arbitrarily old writes or of deletes less than
 * DELETED_LIFE old) might sound counterintuitive, but it doesn't affect
 * semantics in any way: the thread could have been stalled just after
 * returning from our code. Thus, the guarantee is: the result of get() or
 * find_le() is a value that was current at some point between the call's
 * start and end.
 */
#define DELETED_LIFE 16

#define SLICE 4
#define NIB ((1ULL << SLICE) - 1)
#define SLNODES (1 << SLICE)

typedef unsigned char sh_t;

struct critnib_node {
	/*
	 * path is the part of a tree that's already traversed (be it through
	 * explicit nodes or collapsed links) -- ie, any subtree below has all
	 * those bits set to this value.
	 *
	 * nib is a 4-bit slice that's an index into the node's children.
	 *
	 * shift is the length (in bits) of the part of the key below this node.
	 *
	 *             nib
	 * |XXXXXXXXXX|?|*****|
	 *     path    ^
	 *              +-----+
	 *               shift
	 */
	struct critnib_node *child[SLNODES];
	uint64_t path;
	sh_t shift;
};

struct critnib_leaf {
	uint64_t key;
	void *value;
};

struct critnib {
	struct critnib_node *root;

	/* pool of freed nodes: singly linked list, next at child[0] */
	struct critnib_node *deleted_node;
	struct critnib_leaf *deleted_leaf;

	/* nodes removed but not yet eligible for reuse */
	struct critnib_node *pending_del_nodes[DELETED_LIFE];
	struct critnib_leaf *pending_del_leaves[DELETED_LIFE];

	uint64_t remove_count;

	os_mutex_t mutex; /* writes/removes */
};

/*
 * internal: load -- atomic 64-bit load of *src into *dst
 */
static void
load(void *src, void *dst)
{
	util_atomic_load_explicit64((uint64_t *)src, (uint64_t *)dst,
		memory_order_acquire);
}

/*
 * internal: store -- atomic 64-bit store of the pointer value src
 * itself (not *src!) into *dst
 */
static void
store(void *dst, void *src)
{
	util_atomic_store_explicit64((uint64_t *)dst, (uint64_t)src,
		memory_order_release);
}

/*
 * internal: is_leaf -- check tagged pointer for leafness
 */
static inline bool
is_leaf(struct critnib_node *n)
{
	return (uint64_t)n & 1;
}

/*
 * internal: to_leaf -- untag a leaf pointer
 */
static inline struct critnib_leaf *
to_leaf(struct critnib_node *n)
{
	return (void *)((uint64_t)n & ~1ULL);
}

/*
 * internal: path_mask -- return bit mask of a path above a subtree [shift]
 * bits tall
 */
static inline uint64_t
path_mask(sh_t shift)
{
	return ~NIB << shift;
}

/*
 * internal: slice_index -- return index of child at the given nib
 */
static inline unsigned
slice_index(uint64_t key, sh_t shift)
{
	return (unsigned)((key >> shift) & NIB);
}

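/*
 * Illustration (added for clarity; the values are easy to verify by
 * hand): with NIB == 0xf, path_mask(4) == 0xffffffffffffff00 -- it keeps
 * the bits above the nib at bits 4..7 and clears everything below.  For
 * key 0x1234:
 *	slice_index(0x1234, 0) == 0x4
 *	slice_index(0x1234, 4) == 0x3
 *	0x1234 & path_mask(4)  == 0x1200
 */
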
/*
 * critnib_new -- allocates a new critnib structure
 */
struct critnib *
critnib_new(void)
{
	struct critnib *c = Zalloc(sizeof(struct critnib));
	if (!c)
		return NULL;

	util_mutex_init(&c->mutex);

	VALGRIND_HG_DRD_DISABLE_CHECKING(&c->root, sizeof(c->root));
	VALGRIND_HG_DRD_DISABLE_CHECKING(&c->remove_count,
		sizeof(c->remove_count));

	return c;
}

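/*
 * Typical lifecycle (a usage sketch, not part of the original file;
 * "my_value" is a placeholder):
 *
 *	struct critnib *c = critnib_new();
 *	critnib_insert(c, 42, my_value);	// 0, EEXIST or ENOMEM
 *	void *v = critnib_get(c, 42);		// == lookup
 *	void *w = critnib_find_le(c, 41);	// <= lookup
 *	critnib_remove(c, 42);			// returns my_value
 *	critnib_delete(c);
 *
 * Readers may call critnib_get()/critnib_find_le() concurrently with
 * writers; writers serialize on the structure's internal mutex.
 */
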
/*
 * internal: delete_node -- recursively free (to malloc) a subtree
 */
static void
delete_node(struct critnib_node *__restrict n)
{
	if (!is_leaf(n)) {
		for (int i = 0; i < SLNODES; i++) {
			if (n->child[i])
				delete_node(n->child[i]);
		}

		Free(n);
	} else {
		Free(to_leaf(n));
	}
}

/*
 * critnib_delete -- destroy and free a critnib struct
 */
void
critnib_delete(struct critnib *c)
{
	if (c->root)
		delete_node(c->root);

	util_mutex_destroy(&c->mutex);

	for (struct critnib_node *m = c->deleted_node; m; ) {
		struct critnib_node *mm = m->child[0];
		Free(m);
		m = mm;
	}

	for (struct critnib_leaf *k = c->deleted_leaf; k; ) {
		struct critnib_leaf *kk = k->value;
		Free(k);
		k = kk;
	}

	for (int i = 0; i < DELETED_LIFE; i++) {
		Free(c->pending_del_nodes[i]);
		Free(c->pending_del_leaves[i]);
	}

	Free(c);
}

/*
 * internal: free_node -- free (to internal pool, not malloc) a node.
 *
 * We cannot free them to malloc as a stalled reader thread may still walk
 * through such nodes; it will notice the result being bogus but only after
 * completing the walk, thus we need to ensure any freed nodes still point
 * to within the critnib structure.
 */
static void
free_node(struct critnib *__restrict c, struct critnib_node *__restrict n)
{
	if (!n)
		return;

	ASSERT(!is_leaf(n));
	n->child[0] = c->deleted_node;
	c->deleted_node = n;
}

/*
 * internal: alloc_node -- allocate a node from our pool or from malloc
 */
static struct critnib_node *
alloc_node(struct critnib *__restrict c)
{
	if (!c->deleted_node) {
		struct critnib_node *n = Malloc(sizeof(struct critnib_node));
		if (n == NULL)
			ERR("!Malloc");

		return n;
	}

	struct critnib_node *n = c->deleted_node;

	c->deleted_node = n->child[0];
	VALGRIND_ANNOTATE_NEW_MEMORY(n, sizeof(*n));

	return n;
}

/*
 * internal: free_leaf -- free (to internal pool, not malloc) a leaf.
 *
 * See free_node().
 */
static void
free_leaf(struct critnib *__restrict c, struct critnib_leaf *__restrict k)
{
	if (!k)
		return;

	k->value = c->deleted_leaf;
	c->deleted_leaf = k;
}

/*
 * internal: alloc_leaf -- allocate a leaf from our pool or from malloc
 */
static struct critnib_leaf *
alloc_leaf(struct critnib *__restrict c)
{
	if (!c->deleted_leaf) {
		struct critnib_leaf *k = Malloc(sizeof(struct critnib_leaf));
		if (k == NULL)
			ERR("!Malloc");

		return k;
	}

	struct critnib_leaf *k = c->deleted_leaf;

	c->deleted_leaf = k->value;
	VALGRIND_ANNOTATE_NEW_MEMORY(k, sizeof(*k));

	return k;
}

/*
 * critnib_insert -- write a key:value pair to the critnib structure
 *
 * Returns:
 *  • 0 on success
 *  • EEXIST if such a key already exists
 *  • ENOMEM if we're out of memory
 *
 * Takes a global write lock but doesn't stall any readers.
 */
int
critnib_insert(struct critnib *c, uint64_t key, void *value)
{
	util_mutex_lock(&c->mutex);

	struct critnib_leaf *k = alloc_leaf(c);
	if (!k) {
		util_mutex_unlock(&c->mutex);

		return ENOMEM;
	}

	VALGRIND_HG_DRD_DISABLE_CHECKING(k, sizeof(struct critnib_leaf));

	k->key = key;
	k->value = value;

	struct critnib_node *kn = (void *)((uint64_t)k | 1);

	struct critnib_node *n = c->root;
	if (!n) {
		c->root = kn;

		util_mutex_unlock(&c->mutex);

		return 0;
	}

	struct critnib_node **parent = &c->root;
	struct critnib_node *prev = c->root;

	while (n && !is_leaf(n) && (key & path_mask(n->shift)) == n->path) {
		prev = n;
		parent = &n->child[slice_index(key, n->shift)];
		n = *parent;
	}

	if (!n) {
		n = prev;
		store(&n->child[slice_index(key, n->shift)], kn);

		util_mutex_unlock(&c->mutex);

		return 0;
	}

	uint64_t path = is_leaf(n) ? to_leaf(n)->key : n->path;
	/* Find where the path differs from our key. */
	uint64_t at = path ^ key;
	if (!at) {
		ASSERT(is_leaf(n));
		free_leaf(c, to_leaf(kn));
		/* fail instead of replacing */

		util_mutex_unlock(&c->mutex);

		return EEXIST;
	}

	/* and convert that to an index. */
	sh_t sh = util_mssb_index64(at) & (sh_t)~(SLICE - 1);

	struct critnib_node *m = alloc_node(c);
	if (!m) {
		free_leaf(c, to_leaf(kn));

		util_mutex_unlock(&c->mutex);

		return ENOMEM;
	}
	VALGRIND_HG_DRD_DISABLE_CHECKING(m, sizeof(struct critnib_node));

	for (int i = 0; i < SLNODES; i++)
		m->child[i] = NULL;

	m->child[slice_index(key, sh)] = kn;
	m->child[slice_index(path, sh)] = n;
	m->shift = sh;
	m->path = key & path_mask(sh);
	store(parent, m);

	util_mutex_unlock(&c->mutex);

	return 0;
}

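/*
 * The split step above, illustrated (hypothetical keys): inserting 0x1270
 * into a tree whose root is the single leaf 0x1230 gives at == 0x0040,
 * whose most significant set bit is bit 6, rounded down to sh == 4.  The
 * new internal node m gets path == 0x1200, shift == 4, child[7] == the
 * new leaf and child[3] == the old one.
 */
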
/*
 * critnib_remove -- delete a key from the critnib structure, return its value
 */
void *
critnib_remove(struct critnib *c, uint64_t key)
{
	struct critnib_leaf *k;
	void *value = NULL;

	util_mutex_lock(&c->mutex);

	struct critnib_node *n = c->root;
	if (!n)
		goto not_found;

	uint64_t del = util_fetch_and_add64(&c->remove_count, 1) % DELETED_LIFE;
	free_node(c, c->pending_del_nodes[del]);
	free_leaf(c, c->pending_del_leaves[del]);
	c->pending_del_nodes[del] = NULL;
	c->pending_del_leaves[del] = NULL;

	if (is_leaf(n)) {
		k = to_leaf(n);
		if (k->key == key) {
			store(&c->root, NULL);
			goto del_leaf;
		}

		goto not_found;
	}
	/*
	 * n and kn are a parent:child pair (after the first iteration);
	 * kn ends up as the leaf that holds the key we're deleting.
	 */
	struct critnib_node **k_parent = &c->root;
	struct critnib_node **n_parent = &c->root;
	struct critnib_node *kn = n;

	while (!is_leaf(kn)) {
		n_parent = k_parent;
		n = kn;
		k_parent = &kn->child[slice_index(key, kn->shift)];
		kn = *k_parent;

		if (!kn)
			goto not_found;
	}

	k = to_leaf(kn);
	if (k->key != key)
		goto not_found;

	store(&n->child[slice_index(key, n->shift)], NULL);

	/* Remove the node if there's only one remaining child. */
	int ochild = -1;
	for (int i = 0; i < SLNODES; i++) {
		if (n->child[i]) {
			if (ochild != -1)
				goto del_leaf;

			ochild = i;
		}
	}

	ASSERTne(ochild, -1);

	store(n_parent, n->child[ochild]);
	c->pending_del_nodes[del] = n;

del_leaf:
	value = k->value;
	c->pending_del_leaves[del] = k;

not_found:
	util_mutex_unlock(&c->mutex);
	return value;
}

/*
 * critnib_get -- query for a key ("==" match), returns value or NULL
 *
 * Doesn't need a lock, but if many deletes happened while our thread was
 * somehow stalled the query is restarted (as freed nodes remain unused only
 * for a grace period).
 *
 * Counterintuitively, it's pointless to return the most current answer --
 * we need only one that was valid at some point after the call started.
 */
void *
critnib_get(struct critnib *c, uint64_t key)
{
	uint64_t wrs1, wrs2;
	void *res;

	do {
		struct critnib_node *n;

		load(&c->remove_count, &wrs1);
		load(&c->root, &n);

		/*
		 * critbit algorithm: dive into the tree, looking at nothing
		 * but each node's critical bit^H^H^Hnibble. This means we
		 * risk going the wrong way if our path is missing, but
		 * that's ok...
		 */
		while (n && !is_leaf(n))
			load(&n->child[slice_index(key, n->shift)], &n);

		/* ... as we check it at the end. */
		struct critnib_leaf *k = to_leaf(n);
		res = (n && k->key == key) ? k->value : NULL;
		load(&c->remove_count, &wrs2);
	} while (wrs1 + DELETED_LIFE <= wrs2);

	return res;
}

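/*
 * Grace-period arithmetic, illustrated (values are hypothetical): a reader
 * that sees wrs1 == 100 at the start tolerates up to DELETED_LIFE - 1 == 15
 * concurrent removes; if it reads wrs2 >= 116 at the end, a node it walked
 * may already have been reused, so it redoes the whole dive.
 */
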
/*
 * internal: find_successor -- return the rightmost non-null node in a subtree
 */
static void *
find_successor(struct critnib_node *__restrict n)
{
	while (1) {
		int nib;
		for (nib = NIB; nib >= 0; nib--)
			if (n->child[nib])
				break;

		if (nib < 0)
			return NULL;

		n = n->child[nib];
		if (is_leaf(n))
			return to_leaf(n)->value;
	}
}

/*
 * internal: find_le -- recursively search <= in a subtree
 */
static void *
find_le(struct critnib_node *__restrict n, uint64_t key)
{
	if (!n)
		return NULL;

	if (is_leaf(n)) {
		struct critnib_leaf *k = to_leaf(n);

		return (k->key <= key) ? k->value : NULL;
	}

	/*
	 * is our key outside the subtree we're in?
	 *
	 * If we're inside, all bits above the nib will be identical; note
	 * that shift points at the nib's lower rather than upper edge, so it
	 * needs to be masked away as well.
	 */
	if ((key ^ n->path) >> (n->shift) & ~NIB) {
		/*
		 * subtree is too far to the left?
		 * -> its rightmost value is good
		 */
		if (n->path < key)
			return find_successor(n);

		/*
		 * subtree is too far to the right?
		 * -> it has nothing of interest to us
		 */
		return NULL;
	}

	unsigned nib = slice_index(key, n->shift);
	/* recursive call: follow the path */
	{
		struct critnib_node *m;
		load(&n->child[nib], &m);
		void *value = find_le(m, key);
		if (value)
			return value;
	}

	/*
	 * nothing in that subtree? We strayed from the path at this point,
	 * thus need to search every subtree to our left in this node. No
	 * need to dive into any but the first non-null, though.
	 */
	for (; nib > 0; nib--) {
		struct critnib_node *m;
		load(&n->child[nib - 1], &m);
		if (m) {
			n = m;
			if (is_leaf(n))
				return to_leaf(n)->value;

			return find_successor(n);
		}
	}

	return NULL;
}

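/*
 * find_le walk, illustrated (hypothetical keys): querying 0x1250 in a
 * node with children at nibs 3 (leaf 0x1230) and 7 (leaf 0x1270) dives
 * into child[5], finds nothing, then scans left and returns the value of
 * leaf 0x1230 -- the greatest key <= 0x1250.
 */
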
/*
 * critnib_find_le -- query for a key ("<=" match), returns value or NULL
 *
 * Same guarantees as critnib_get().
 */
void *
critnib_find_le(struct critnib *c, uint64_t key)
{
	uint64_t wrs1, wrs2;
	void *res;

	do {
		load(&c->remove_count, &wrs1);
		struct critnib_node *n; /* avoid a subtle TOCTOU */
		load(&c->root, &n);
		res = n ? find_le(n, key) : NULL;
		load(&c->remove_count, &wrs2);
	} while (wrs1 + DELETED_LIFE <= wrs2);

	return res;
}