// SPDX-License-Identifier: BSD-3-Clause
/* Copyright 2018-2019, Intel Corporation */

/*
 * critnib.c -- implementation of critnib tree
 *
 * It offers identity lookup (like a hashmap) and <= lookup (like a search
 * tree). Unlike some hashing algorithms (cuckoo hash, perfect hashing) the
 * complexity isn't constant, but for data sizes we expect it's several
 * times as fast as cuckoo, and has no "stop the world" cases that would
 * cause latency (ie, better worst case behaviour).
 */

/*
 * STRUCTURE DESCRIPTION
 *
 * Critnib is a hybrid between a radix tree and DJ Bernstein's critbit:
 * it skips uninteresting radix nodes (ie, ones that would have exactly
 * one child); this requires adding to every node a field that describes
 * the slice (4-bit in our case) that this radix level is for.
 *
 * This implementation also stores each node's path (ie, bits that are
 * common to every key in that subtree) -- this doesn't help with lookups
 * at all (unused in == match, could be reconstructed at no cost in <=
 * after the first dive) but simplifies inserts and removes. If we ever
 * want that piece of memory back, it's easy to trim it down.
 */

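/*
 * A worked example (illustrative only, not from the original sources):
 * with SLICE = 4, a 64-bit key is viewed as 16 nibbles.  Storing the two
 * keys 0x1230 and 0x1270 needs just one internal node, because the first
 * differing nibble sits at bits 4..7:
 *
 *             [path = 0x1200, shift = 4]
 *              /                      \
 *        child[3]                  child[7]
 *       leaf 0x1230               leaf 0x1270
 *
 * Every radix level above bit 7 is collapsed into the node's path.
 */
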
/*
 * CONCURRENCY ISSUES
 *
 * Reads are completely lock-free and sync-free, but only almost wait-free:
 * if for some reason a read thread gets pathologically stalled, it will
 * notice the data being stale and restart the work. In the usual case,
 * the structure having been modified does _not_ cause a restart.
 *
 * Writes could be easily made lock-free as well (with only a cmpxchg
 * sync), but this leads to problems with removes. A possible solution
 * would be to do removes by overwriting with NULL without freeing -- yet
 * this would lead to the structure growing without bounds. Complex
 * per-node locks would increase concurrency, but they slow down individual
 * writes enough that in practice a simple global write lock works faster.
 *
 * Removes are the only operation that can break reads. The structure
 * lends itself to local RCU well -- the problem is knowing when it's
 * safe to free. Any synchronization with reads would kill their speed,
 * thus instead we keep a remove count. The grace period is DELETED_LIFE,
 * after which any read will notice staleness and restart its work.
 */

#include <errno.h>
#include <stdbool.h>

#include "alloc.h"
#include "critnib.h"
#include "out.h"
#include "sys_util.h"
#include "valgrind_internal.h"

/*
 * A node that has been deleted is left untouched for this many delete
 * cycles. Reads have guaranteed correctness if they take no longer than
 * DELETED_LIFE concurrent deletes; otherwise they notice something is
 * wrong and restart. The memory of deleted nodes is never returned to
 * malloc, nor do their pointers lead anywhere wrong, thus a stale read
 * will (temporarily) get a wrong answer but won't crash.
 *
 * There's no need to count writes as they never interfere with reads.
 *
 * Allowing stale reads (of arbitrarily old writes or of deletes less than
 * DELETED_LIFE old) might sound counterintuitive, but it doesn't affect
 * semantics in any way: the thread could have been stalled just after
 * returning from our code. Thus, the guarantee is: the result of get() or
 * find_le() is a value that was current at some point between the call's
 * start and end.
 */
#define DELETED_LIFE 16

#define SLICE 4
#define NIB ((1ULL << SLICE) - 1)
#define SLNODES (1 << SLICE)

typedef unsigned char sh_t;

struct critnib_node {
	/*
	 * path is the part of a tree that's already traversed (be it through
	 * explicit nodes or collapsed links) -- ie, any subtree below has all
	 * those bits set to this value.
	 *
	 * nib is a 4-bit slice that's an index into the node's children.
	 *
	 * shift is the length (in bits) of the part of the key below this node.
	 *
	 *             nib
	 * |XXXXXXXXXX|?|*****|
	 *     path    ^
	 *              +-----+
	 *               shift
	 */
	struct critnib_node *child[SLNODES];
	uint64_t path;
	sh_t shift;
};

struct critnib_leaf {
	uint64_t key;
	void *value;
};

struct critnib {
	struct critnib_node *root;

	/* pool of freed nodes: singly linked list, next at child[0] */
	struct critnib_node *deleted_node;
	struct critnib_leaf *deleted_leaf;

	/* nodes removed but not yet eligible for reuse */
	struct critnib_node *pending_del_nodes[DELETED_LIFE];
	struct critnib_leaf *pending_del_leaves[DELETED_LIFE];

	uint64_t remove_count;

	os_mutex_t mutex; /* writes/removes */
};

/*
 * internal: load -- atomic 64-bit load of *src into *dst
 */
static void
load(void *src, void *dst)
{
	util_atomic_load_explicit64((uint64_t *)src, (uint64_t *)dst,
		memory_order_acquire);
}

/*
 * internal: store -- atomic 64-bit store of the pointer value src
 * itself (not *src!) into *dst
 */
static void
store(void *dst, void *src)
{
	util_atomic_store_explicit64((uint64_t *)dst, (uint64_t)src,
		memory_order_release);
}

/*
 * internal: is_leaf -- check tagged pointer for leafness
 */
static inline bool
is_leaf(struct critnib_node *n)
{
	return (uint64_t)n & 1;
}

/*
 * internal: to_leaf -- untag a leaf pointer
 */
static inline struct critnib_leaf *
to_leaf(struct critnib_node *n)
{
	return (void *)((uint64_t)n & ~1ULL);
}

/*
 * internal: path_mask -- return bit mask of a path above a subtree [shift]
 * bits tall
 */
static inline uint64_t
path_mask(sh_t shift)
{
	return ~NIB << shift;
}

/*
 * internal: slice_index -- return index of child at the given nib
 */
static inline unsigned
slice_index(uint64_t key, sh_t shift)
{
	return (unsigned)((key >> shift) & NIB);
}

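/*
 * Illustration (added for clarity; the values are easy to verify by
 * hand): with NIB == 0xf, path_mask(4) == 0xffffffffffffff00 -- it keeps
 * the bits above the nib at bits 4..7 and clears everything below.  For
 * key 0x1234:
 *	slice_index(0x1234, 0) == 0x4
 *	slice_index(0x1234, 4) == 0x3
 *	0x1234 & path_mask(4)  == 0x1200
 */
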
/*
 * critnib_new -- allocates a new critnib structure
 */
struct critnib *
critnib_new(void)
{
	struct critnib *c = Zalloc(sizeof(struct critnib));
	if (!c)
		return NULL;

	util_mutex_init(&c->mutex);

	VALGRIND_HG_DRD_DISABLE_CHECKING(&c->root, sizeof(c->root));
	VALGRIND_HG_DRD_DISABLE_CHECKING(&c->remove_count,
		sizeof(c->remove_count));

	return c;
}

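/*
 * Typical lifecycle (a usage sketch, not part of the original file;
 * "my_value" is a placeholder):
 *
 *	struct critnib *c = critnib_new();
 *	critnib_insert(c, 42, my_value);	// 0, EEXIST or ENOMEM
 *	void *v = critnib_get(c, 42);		// == lookup
 *	void *w = critnib_find_le(c, 41);	// <= lookup
 *	critnib_remove(c, 42);			// returns my_value
 *	critnib_delete(c);
 *
 * Readers may call critnib_get()/critnib_find_le() concurrently with
 * writers; writers serialize on the structure's internal mutex.
 */
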
/*
 * internal: delete_node -- recursively free (to malloc) a subtree
 */
static void
delete_node(struct critnib_node *__restrict n)
{
	if (!is_leaf(n)) {
		for (int i = 0; i < SLNODES; i++) {
			if (n->child[i])
				delete_node(n->child[i]);
		}

		Free(n);
	} else {
		Free(to_leaf(n));
	}
}

/*
 * critnib_delete -- destroy and free a critnib struct
 */
void
critnib_delete(struct critnib *c)
{
	if (c->root)
		delete_node(c->root);

	util_mutex_destroy(&c->mutex);

	for (struct critnib_node *m = c->deleted_node; m; ) {
		struct critnib_node *mm = m->child[0];
		Free(m);
		m = mm;
	}

	for (struct critnib_leaf *k = c->deleted_leaf; k; ) {
		struct critnib_leaf *kk = k->value;
		Free(k);
		k = kk;
	}

	for (int i = 0; i < DELETED_LIFE; i++) {
		Free(c->pending_del_nodes[i]);
		Free(c->pending_del_leaves[i]);
	}

	Free(c);
}

/*
 * internal: free_node -- free (to internal pool, not malloc) a node.
 *
 * We cannot free them to malloc as a stalled reader thread may still walk
 * through such nodes; it will notice the result being bogus but only after
 * completing the walk, thus we need to ensure any freed nodes still point
 * to within the critnib structure.
 */
static void
free_node(struct critnib *__restrict c, struct critnib_node *__restrict n)
{
	if (!n)
		return;

	ASSERT(!is_leaf(n));
	n->child[0] = c->deleted_node;
	c->deleted_node = n;
}

/*
 * internal: alloc_node -- allocate a node from our pool or from malloc
 */
static struct critnib_node *
alloc_node(struct critnib *__restrict c)
{
	if (!c->deleted_node) {
		struct critnib_node *n = Malloc(sizeof(struct critnib_node));
		if (n == NULL)
			ERR("!Malloc");

		return n;
	}

	struct critnib_node *n = c->deleted_node;

	c->deleted_node = n->child[0];
	VALGRIND_ANNOTATE_NEW_MEMORY(n, sizeof(*n));

	return n;
}

/*
 * internal: free_leaf -- free (to internal pool, not malloc) a leaf.
 *
 * See free_node().
 */
static void
free_leaf(struct critnib *__restrict c, struct critnib_leaf *__restrict k)
{
	if (!k)
		return;

	k->value = c->deleted_leaf;
	c->deleted_leaf = k;
}

/*
 * internal: alloc_leaf -- allocate a leaf from our pool or from malloc
 */
static struct critnib_leaf *
alloc_leaf(struct critnib *__restrict c)
{
	if (!c->deleted_leaf) {
		struct critnib_leaf *k = Malloc(sizeof(struct critnib_leaf));
		if (k == NULL)
			ERR("!Malloc");

		return k;
	}

	struct critnib_leaf *k = c->deleted_leaf;

	c->deleted_leaf = k->value;
	VALGRIND_ANNOTATE_NEW_MEMORY(k, sizeof(*k));

	return k;
}

/*
 * critnib_insert -- write a key:value pair to the critnib structure
 *
 * Returns:
 *  • 0 on success
 *  • EEXIST if such a key already exists
 *  • ENOMEM if we're out of memory
 *
 * Takes a global write lock but doesn't stall any readers.
 */
int
critnib_insert(struct critnib *c, uint64_t key, void *value)
{
	util_mutex_lock(&c->mutex);

	struct critnib_leaf *k = alloc_leaf(c);
	if (!k) {
		util_mutex_unlock(&c->mutex);

		return ENOMEM;
	}

	VALGRIND_HG_DRD_DISABLE_CHECKING(k, sizeof(struct critnib_leaf));

	k->key = key;
	k->value = value;

	struct critnib_node *kn = (void *)((uint64_t)k | 1);

	struct critnib_node *n = c->root;
	if (!n) {
		c->root = kn;

		util_mutex_unlock(&c->mutex);

		return 0;
	}

	struct critnib_node **parent = &c->root;
	struct critnib_node *prev = c->root;

	while (n && !is_leaf(n) && (key & path_mask(n->shift)) == n->path) {
		prev = n;
		parent = &n->child[slice_index(key, n->shift)];
		n = *parent;
	}

	if (!n) {
		n = prev;
		store(&n->child[slice_index(key, n->shift)], kn);

		util_mutex_unlock(&c->mutex);

		return 0;
	}

	uint64_t path = is_leaf(n) ? to_leaf(n)->key : n->path;
	/* Find where the path differs from our key. */
	uint64_t at = path ^ key;
	if (!at) {
		ASSERT(is_leaf(n));
		free_leaf(c, to_leaf(kn));
		/* fail instead of replacing */

		util_mutex_unlock(&c->mutex);

		return EEXIST;
	}

	/* and convert that to an index. */
	sh_t sh = util_mssb_index64(at) & (sh_t)~(SLICE - 1);

	struct critnib_node *m = alloc_node(c);
	if (!m) {
		free_leaf(c, to_leaf(kn));

		util_mutex_unlock(&c->mutex);

		return ENOMEM;
	}
	VALGRIND_HG_DRD_DISABLE_CHECKING(m, sizeof(struct critnib_node));

	for (int i = 0; i < SLNODES; i++)
		m->child[i] = NULL;

	m->child[slice_index(key, sh)] = kn;
	m->child[slice_index(path, sh)] = n;
	m->shift = sh;
	m->path = key & path_mask(sh);
	store(parent, m);

	util_mutex_unlock(&c->mutex);

	return 0;
}

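/*
 * The split step above, illustrated (hypothetical keys): inserting 0x1270
 * into a tree whose root is the single leaf 0x1230 gives at == 0x0040,
 * whose most significant set bit is bit 6, rounded down to sh == 4.  The
 * new internal node m gets path == 0x1200, shift == 4, child[7] == the
 * new leaf and child[3] == the old one.
 */
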
/*
 * critnib_remove -- delete a key from the critnib structure, return its value
 */
void *
critnib_remove(struct critnib *c, uint64_t key)
{
	struct critnib_leaf *k;
	void *value = NULL;

	util_mutex_lock(&c->mutex);

	struct critnib_node *n = c->root;
	if (!n)
		goto not_found;

	uint64_t del = util_fetch_and_add64(&c->remove_count, 1) % DELETED_LIFE;
	free_node(c, c->pending_del_nodes[del]);
	free_leaf(c, c->pending_del_leaves[del]);
	c->pending_del_nodes[del] = NULL;
	c->pending_del_leaves[del] = NULL;

	if (is_leaf(n)) {
		k = to_leaf(n);
		if (k->key == key) {
			store(&c->root, NULL);
			goto del_leaf;
		}

		goto not_found;
	}
	/*
	 * n and kn are a parent:child pair (after the first iteration);
	 * kn ends up as the leaf that holds the key we're deleting.
	 */
	struct critnib_node **k_parent = &c->root;
	struct critnib_node **n_parent = &c->root;
	struct critnib_node *kn = n;

	while (!is_leaf(kn)) {
		n_parent = k_parent;
		n = kn;
		k_parent = &kn->child[slice_index(key, kn->shift)];
		kn = *k_parent;

		if (!kn)
			goto not_found;
	}

	k = to_leaf(kn);
	if (k->key != key)
		goto not_found;

	store(&n->child[slice_index(key, n->shift)], NULL);

	/* Remove the node if there's only one remaining child. */
	int ochild = -1;
	for (int i = 0; i < SLNODES; i++) {
		if (n->child[i]) {
			if (ochild != -1)
				goto del_leaf;

			ochild = i;
		}
	}

	ASSERTne(ochild, -1);

	store(n_parent, n->child[ochild]);
	c->pending_del_nodes[del] = n;

del_leaf:
	value = k->value;
	c->pending_del_leaves[del] = k;

not_found:
	util_mutex_unlock(&c->mutex);
	return value;
}

/*
 * critnib_get -- query for a key ("==" match), returns value or NULL
 *
 * Doesn't need a lock, but if many deletes happened while our thread was
 * somehow stalled the query is restarted (as freed nodes remain unused only
 * for a grace period).
 *
 * Counterintuitively, it's pointless to return the most current answer --
 * we need only one that was valid at some point after the call started.
 */
void *
critnib_get(struct critnib *c, uint64_t key)
{
	uint64_t wrs1, wrs2;
	void *res;

	do {
		struct critnib_node *n;

		load(&c->remove_count, &wrs1);
		load(&c->root, &n);

		/*
		 * critbit algorithm: dive into the tree, looking at nothing
		 * but each node's critical bit^H^H^Hnibble. This means we
		 * risk going the wrong way if our path is missing, but
		 * that's ok...
		 */
		while (n && !is_leaf(n))
			load(&n->child[slice_index(key, n->shift)], &n);

		/* ... as we check it at the end. */
		struct critnib_leaf *k = to_leaf(n);
		res = (n && k->key == key) ? k->value : NULL;
		load(&c->remove_count, &wrs2);
	} while (wrs1 + DELETED_LIFE <= wrs2);

	return res;
}

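/*
 * Grace-period arithmetic, illustrated (values are hypothetical): a reader
 * that sees wrs1 == 100 at the start tolerates up to DELETED_LIFE - 1 == 15
 * concurrent removes; if it reads wrs2 >= 116 at the end, a node it walked
 * may already have been reused, so it redoes the whole dive.
 */
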
/*
 * internal: find_successor -- return the rightmost non-null node in a subtree
 */
static void *
find_successor(struct critnib_node *__restrict n)
{
	while (1) {
		int nib;
		for (nib = NIB; nib >= 0; nib--)
			if (n->child[nib])
				break;

		if (nib < 0)
			return NULL;

		n = n->child[nib];
		if (is_leaf(n))
			return to_leaf(n)->value;
	}
}

/*
 * internal: find_le -- recursively search <= in a subtree
 */
static void *
find_le(struct critnib_node *__restrict n, uint64_t key)
{
	if (!n)
		return NULL;

	if (is_leaf(n)) {
		struct critnib_leaf *k = to_leaf(n);

		return (k->key <= key) ? k->value : NULL;
	}

	/*
	 * is our key outside the subtree we're in?
	 *
	 * If we're inside, all bits above the nib will be identical; note
	 * that shift points at the nib's lower rather than upper edge, so it
	 * needs to be masked away as well.
	 */
	if ((key ^ n->path) >> (n->shift) & ~NIB) {
		/*
		 * subtree is too far to the left?
		 * -> its rightmost value is good
		 */
		if (n->path < key)
			return find_successor(n);

		/*
		 * subtree is too far to the right?
		 * -> it has nothing of interest to us
		 */
		return NULL;
	}

	unsigned nib = slice_index(key, n->shift);
	/* recursive call: follow the path */
	{
		struct critnib_node *m;
		load(&n->child[nib], &m);
		void *value = find_le(m, key);
		if (value)
			return value;
	}

	/*
	 * nothing in that subtree? We strayed from the path at this point,
	 * thus need to search every subtree to our left in this node. No
	 * need to dive into any but the first non-null, though.
	 */
	for (; nib > 0; nib--) {
		struct critnib_node *m;
		load(&n->child[nib - 1], &m);
		if (m) {
			n = m;
			if (is_leaf(n))
				return to_leaf(n)->value;

			return find_successor(n);
		}
	}

	return NULL;
}

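/*
 * find_le walk, illustrated (hypothetical keys): querying 0x1250 in a
 * node with children at nibs 3 (leaf 0x1230) and 7 (leaf 0x1270) dives
 * into child[5], finds nothing, then scans left and returns the value of
 * leaf 0x1230 -- the greatest key <= 0x1250.
 */
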
/*
 * critnib_find_le -- query for a key ("<=" match), returns value or NULL
 *
 * Same guarantees as critnib_get().
 */
void *
critnib_find_le(struct critnib *c, uint64_t key)
{
	uint64_t wrs1, wrs2;
	void *res;

	do {
		load(&c->remove_count, &wrs1);
		struct critnib_node *n; /* avoid a subtle TOCTOU */
		load(&c->root, &n);
		res = n ? find_le(n, key) : NULL;
		load(&c->remove_count, &wrs2);
	} while (wrs1 + DELETED_LIFE <= wrs2);

	return res;
}