]>
Commit | Line | Data |
---|---|---|
ca577779 PD |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * This file and its contents are supplied under the terms of the | |
5 | * Common Development and Distribution License ("CDDL"), version 1.0. | |
6 | * You may only use this file in accordance with the terms of version | |
7 | * 1.0 of the CDDL. | |
8 | * | |
9 | * A full copy of the text of the CDDL should have accompanied this | |
10 | * source. A copy of the CDDL is also available via the Internet at | |
11 | * http://www.illumos.org/license/CDDL. | |
12 | * | |
13 | * CDDL HEADER END | |
14 | */ | |
15 | /* | |
16 | * Copyright (c) 2019 by Delphix. All rights reserved. | |
17 | */ | |
18 | ||
19 | #include <sys/btree.h> | |
20 | #include <sys/bitops.h> | |
21 | #include <sys/zfs_context.h> | |
22 | ||
23 | kmem_cache_t *zfs_btree_leaf_cache; | |
24 | ||
25 | /* | |
26 | * Control the extent of the verification that occurs when zfs_btree_verify is | |
27 | * called. Primarily used for debugging when extending the btree logic and | |
28 | * functionality. As the intensity is increased, new verification steps are | |
29 | * added. These steps are cumulative; intensity = 3 includes the intensity = 1 | |
30 | * and intensity = 2 steps as well. | |
31 | * | |
32 | * Intensity 1: Verify that the tree's height is consistent throughout. | |
33 | * Intensity 2: Verify that a core node's children's parent pointers point | |
34 | * to the core node. | |
35 | * Intensity 3: Verify that the total number of elements in the tree matches the | |
36 | * sum of the number of elements in each node. Also verifies that each node's | |
37 | * count obeys the invariants (less than or equal to maximum value, greater than | |
38 | * or equal to half the maximum minus one). | |
39 | * Intensity 4: Verify that each element compares less than the element | |
40 | * immediately after it and greater than the one immediately before it using the | |
41 | * comparator function. For core nodes, also checks that each element is greater | |
42 | * than the last element in the first of the two nodes it separates, and less | |
43 | * than the first element in the second of the two nodes. | |
44 | * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside | |
45 | * of each node is poisoned appropriately. Note that poisoning always occurs if | |
46 | * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal | |
47 | * operation. | |
48 | * | |
49 | * Intensity 4 and 5 are particularly expensive to perform; the previous levels | |
50 | * are a few memory operations per node, while these levels require multiple | |
51 | * operations per element. In addition, when creating large btrees, these | |
52 | * operations are called at every step, resulting in extremely slow operation | |
53 | * (while the asymptotic complexity of the other steps is the same, the | |
54 | * importance of the constant factors cannot be denied). | |
55 | */ | |
b24d1c77 | 56 | uint_t zfs_btree_verify_intensity = 0; |
ca577779 PD |
57 | |
58 | /* | |
c0bf952c AM |
59 | * Convenience functions to silence warnings from memcpy/memmove's |
60 | * return values and change argument order to src, dest. | |
ca577779 | 61 | */ |
c0bf952c AM |
62 | static void |
63 | bcpy(const void *src, void *dest, size_t size) | |
64 | { | |
65 | (void) memcpy(dest, src, size); | |
66 | } | |
67 | ||
47d57dbc | 68 | static void |
ca577779 PD |
69 | bmov(const void *src, void *dest, size_t size) |
70 | { | |
71 | (void) memmove(dest, src, size); | |
72 | } | |
73 | ||
c0bf952c AM |
74 | static boolean_t |
75 | zfs_btree_is_core(struct zfs_btree_hdr *hdr) | |
76 | { | |
77 | return (hdr->bth_first == -1); | |
78 | } | |
79 | ||
ca577779 PD |
80 | #ifdef _ILP32 |
81 | #define BTREE_POISON 0xabadb10c | |
82 | #else | |
83 | #define BTREE_POISON 0xabadb10cdeadbeef | |
84 | #endif | |
85 | ||
86 | static void | |
87 | zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) | |
88 | { | |
89 | #ifdef ZFS_DEBUG | |
90 | size_t size = tree->bt_elem_size; | |
c0bf952c | 91 | if (zfs_btree_is_core(hdr)) { |
ca577779 | 92 | zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; |
c0bf952c AM |
93 | for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; |
94 | i++) { | |
ca577779 PD |
95 | node->btc_children[i] = |
96 | (zfs_btree_hdr_t *)BTREE_POISON; | |
97 | } | |
98 | (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f, | |
99 | (BTREE_CORE_ELEMS - hdr->bth_count) * size); | |
c0bf952c AM |
100 | } else { |
101 | zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; | |
102 | (void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size); | |
103 | (void) memset(leaf->btl_elems + | |
104 | (hdr->bth_first + hdr->bth_count) * size, 0x0f, | |
9dcdee78 | 105 | tree->bt_leaf_size - offsetof(zfs_btree_leaf_t, btl_elems) - |
c0bf952c | 106 | (hdr->bth_first + hdr->bth_count) * size); |
ca577779 PD |
107 | } |
108 | #endif | |
109 | } | |
110 | ||
111 | static inline void | |
112 | zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, | |
c0bf952c | 113 | uint32_t idx, uint32_t count) |
ca577779 PD |
114 | { |
115 | #ifdef ZFS_DEBUG | |
116 | size_t size = tree->bt_elem_size; | |
c0bf952c AM |
117 | if (zfs_btree_is_core(hdr)) { |
118 | ASSERT3U(idx, >=, hdr->bth_count); | |
119 | ASSERT3U(idx, <=, BTREE_CORE_ELEMS); | |
120 | ASSERT3U(idx + count, <=, BTREE_CORE_ELEMS); | |
ca577779 | 121 | zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; |
c0bf952c AM |
122 | for (uint32_t i = 1; i <= count; i++) { |
123 | node->btc_children[idx + i] = | |
124 | (zfs_btree_hdr_t *)BTREE_POISON; | |
125 | } | |
126 | (void) memset(node->btc_elems + idx * size, 0x0f, count * size); | |
127 | } else { | |
128 | ASSERT3U(idx, <=, tree->bt_leaf_cap); | |
129 | ASSERT3U(idx + count, <=, tree->bt_leaf_cap); | |
130 | zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; | |
131 | (void) memset(leaf->btl_elems + | |
132 | (hdr->bth_first + idx) * size, 0x0f, count * size); | |
ca577779 PD |
133 | } |
134 | #endif | |
135 | } | |
136 | ||
137 | static inline void | |
138 | zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, | |
c0bf952c | 139 | uint32_t idx) |
ca577779 PD |
140 | { |
141 | #ifdef ZFS_DEBUG | |
142 | size_t size = tree->bt_elem_size; | |
c0bf952c AM |
143 | if (zfs_btree_is_core(hdr)) { |
144 | ASSERT3U(idx, <, BTREE_CORE_ELEMS); | |
ca577779 PD |
145 | zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; |
146 | zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON; | |
c0bf952c AM |
147 | VERIFY3P(node->btc_children[idx + 1], ==, cval); |
148 | for (size_t i = 0; i < size; i++) | |
149 | VERIFY3U(node->btc_elems[idx * size + i], ==, 0x0f); | |
ca577779 | 150 | } else { |
c0bf952c | 151 | ASSERT3U(idx, <, tree->bt_leaf_cap); |
ca577779 | 152 | zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; |
c0bf952c AM |
153 | if (idx >= tree->bt_leaf_cap - hdr->bth_first) |
154 | return; | |
155 | for (size_t i = 0; i < size; i++) { | |
156 | VERIFY3U(leaf->btl_elems[(hdr->bth_first + idx) | |
157 | * size + i], ==, 0x0f); | |
158 | } | |
ca577779 PD |
159 | } |
160 | #endif | |
161 | } | |
162 | ||
163 | void | |
164 | zfs_btree_init(void) | |
165 | { | |
166 | zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache", | |
c0bf952c | 167 | BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); |
ca577779 PD |
168 | } |
169 | ||
170 | void | |
171 | zfs_btree_fini(void) | |
172 | { | |
173 | kmem_cache_destroy(zfs_btree_leaf_cache); | |
174 | } | |
175 | ||
9dcdee78 AM |
176 | static void * |
177 | zfs_btree_leaf_alloc(zfs_btree_t *tree) | |
178 | { | |
179 | if (tree->bt_leaf_size == BTREE_LEAF_SIZE) | |
180 | return (kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP)); | |
181 | else | |
182 | return (kmem_alloc(tree->bt_leaf_size, KM_SLEEP)); | |
183 | } | |
184 | ||
185 | static void | |
186 | zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr) | |
187 | { | |
188 | if (tree->bt_leaf_size == BTREE_LEAF_SIZE) | |
189 | return (kmem_cache_free(zfs_btree_leaf_cache, ptr)); | |
190 | else | |
191 | return (kmem_free(ptr, tree->bt_leaf_size)); | |
192 | } | |
193 | ||
ca577779 PD |
194 | void |
195 | zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), | |
677c6f84 | 196 | bt_find_in_buf_f bt_find_in_buf, size_t size) |
ca577779 | 197 | { |
677c6f84 RY |
198 | zfs_btree_create_custom(tree, compar, bt_find_in_buf, size, |
199 | BTREE_LEAF_SIZE); | |
9dcdee78 AM |
200 | } |
201 | ||
677c6f84 RY |
202 | static void * |
203 | zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, | |
204 | const void *value, zfs_btree_index_t *where); | |
205 | ||
9dcdee78 AM |
206 | void |
207 | zfs_btree_create_custom(zfs_btree_t *tree, | |
208 | int (*compar) (const void *, const void *), | |
677c6f84 | 209 | bt_find_in_buf_f bt_find_in_buf, |
9dcdee78 AM |
210 | size_t size, size_t lsize) |
211 | { | |
212 | size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems); | |
ca577779 | 213 | |
9dcdee78 | 214 | ASSERT3U(size, <=, esize / 2); |
861166b0 | 215 | memset(tree, 0, sizeof (*tree)); |
ca577779 | 216 | tree->bt_compar = compar; |
677c6f84 RY |
217 | tree->bt_find_in_buf = (bt_find_in_buf == NULL) ? |
218 | zfs_btree_find_in_buf : bt_find_in_buf; | |
ca577779 | 219 | tree->bt_elem_size = size; |
9dcdee78 AM |
220 | tree->bt_leaf_size = lsize; |
221 | tree->bt_leaf_cap = P2ALIGN(esize / size, 2); | |
ca577779 PD |
222 | tree->bt_height = -1; |
223 | tree->bt_bulk = NULL; | |
224 | } | |
225 | ||
226 | /* | |
227 | * Find value in the array of elements provided. Uses a simple binary search. | |
228 | */ | |
229 | static void * | |
c0bf952c | 230 | zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, |
ca577779 PD |
231 | const void *value, zfs_btree_index_t *where) |
232 | { | |
c0bf952c AM |
233 | uint32_t max = nelems; |
234 | uint32_t min = 0; | |
ca577779 | 235 | while (max > min) { |
c0bf952c | 236 | uint32_t idx = (min + max) / 2; |
ca577779 PD |
237 | uint8_t *cur = buf + idx * tree->bt_elem_size; |
238 | int comp = tree->bt_compar(cur, value); | |
c0bf952c | 239 | if (comp < 0) { |
ca577779 | 240 | min = idx + 1; |
c0bf952c | 241 | } else if (comp > 0) { |
ca577779 PD |
242 | max = idx; |
243 | } else { | |
ca577779 PD |
244 | where->bti_offset = idx; |
245 | where->bti_before = B_FALSE; | |
246 | return (cur); | |
247 | } | |
248 | } | |
249 | ||
250 | where->bti_offset = max; | |
251 | where->bti_before = B_TRUE; | |
252 | return (NULL); | |
253 | } | |
254 | ||
255 | /* | |
256 | * Find the given value in the tree. where may be passed as null to use as a | |
257 | * membership test or if the btree is being used as a map. | |
258 | */ | |
259 | void * | |
260 | zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) | |
261 | { | |
262 | if (tree->bt_height == -1) { | |
263 | if (where != NULL) { | |
264 | where->bti_node = NULL; | |
265 | where->bti_offset = 0; | |
266 | } | |
267 | ASSERT0(tree->bt_num_elems); | |
268 | return (NULL); | |
269 | } | |
270 | ||
271 | /* | |
272 | * If we're in bulk-insert mode, we check the last spot in the tree | |
273 | * and the last leaf in the tree before doing the normal search, | |
274 | * because for most workloads the vast majority of finds in | |
275 | * bulk-insert mode are to insert new elements. | |
276 | */ | |
277 | zfs_btree_index_t idx; | |
c0bf952c | 278 | size_t size = tree->bt_elem_size; |
ca577779 PD |
279 | if (tree->bt_bulk != NULL) { |
280 | zfs_btree_leaf_t *last_leaf = tree->bt_bulk; | |
c0bf952c AM |
281 | int comp = tree->bt_compar(last_leaf->btl_elems + |
282 | (last_leaf->btl_hdr.bth_first + | |
283 | last_leaf->btl_hdr.bth_count - 1) * size, value); | |
284 | if (comp < 0) { | |
ca577779 PD |
285 | /* |
286 | * If what they're looking for is after the last | |
287 | * element, it's not in the tree. | |
288 | */ | |
289 | if (where != NULL) { | |
290 | where->bti_node = (zfs_btree_hdr_t *)last_leaf; | |
291 | where->bti_offset = | |
292 | last_leaf->btl_hdr.bth_count; | |
293 | where->bti_before = B_TRUE; | |
294 | } | |
295 | return (NULL); | |
c0bf952c | 296 | } else if (comp == 0) { |
ca577779 PD |
297 | if (where != NULL) { |
298 | where->bti_node = (zfs_btree_hdr_t *)last_leaf; | |
299 | where->bti_offset = | |
300 | last_leaf->btl_hdr.bth_count - 1; | |
301 | where->bti_before = B_FALSE; | |
302 | } | |
303 | return (last_leaf->btl_elems + | |
c0bf952c AM |
304 | (last_leaf->btl_hdr.bth_first + |
305 | last_leaf->btl_hdr.bth_count - 1) * size); | |
ca577779 | 306 | } |
c0bf952c AM |
307 | if (tree->bt_compar(last_leaf->btl_elems + |
308 | last_leaf->btl_hdr.bth_first * size, value) <= 0) { | |
ca577779 PD |
309 | /* |
310 | * If what they're looking for is after the first | |
311 | * element in the last leaf, it's in the last leaf or | |
312 | * it's not in the tree. | |
313 | */ | |
677c6f84 | 314 | void *d = tree->bt_find_in_buf(tree, |
c0bf952c AM |
315 | last_leaf->btl_elems + |
316 | last_leaf->btl_hdr.bth_first * size, | |
317 | last_leaf->btl_hdr.bth_count, value, &idx); | |
ca577779 PD |
318 | |
319 | if (where != NULL) { | |
320 | idx.bti_node = (zfs_btree_hdr_t *)last_leaf; | |
321 | *where = idx; | |
322 | } | |
323 | return (d); | |
324 | } | |
325 | } | |
326 | ||
327 | zfs_btree_core_t *node = NULL; | |
c0bf952c | 328 | uint32_t child = 0; |
9dcdee78 | 329 | uint32_t depth = 0; |
ca577779 PD |
330 | |
331 | /* | |
332 | * Iterate down the tree, finding which child the value should be in | |
333 | * by comparing with the separators. | |
334 | */ | |
335 | for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height; | |
336 | node = (zfs_btree_core_t *)node->btc_children[child], depth++) { | |
337 | ASSERT3P(node, !=, NULL); | |
677c6f84 | 338 | void *d = tree->bt_find_in_buf(tree, node->btc_elems, |
ca577779 PD |
339 | node->btc_hdr.bth_count, value, &idx); |
340 | EQUIV(d != NULL, !idx.bti_before); | |
341 | if (d != NULL) { | |
342 | if (where != NULL) { | |
343 | idx.bti_node = (zfs_btree_hdr_t *)node; | |
344 | *where = idx; | |
345 | } | |
346 | return (d); | |
347 | } | |
348 | ASSERT(idx.bti_before); | |
349 | child = idx.bti_offset; | |
350 | } | |
351 | ||
352 | /* | |
353 | * The value is in this leaf, or it would be if it were in the | |
354 | * tree. Find its proper location and return it. | |
355 | */ | |
356 | zfs_btree_leaf_t *leaf = (depth == 0 ? | |
357 | (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node); | |
677c6f84 | 358 | void *d = tree->bt_find_in_buf(tree, leaf->btl_elems + |
c0bf952c | 359 | leaf->btl_hdr.bth_first * size, |
ca577779 PD |
360 | leaf->btl_hdr.bth_count, value, &idx); |
361 | ||
362 | if (where != NULL) { | |
363 | idx.bti_node = (zfs_btree_hdr_t *)leaf; | |
364 | *where = idx; | |
365 | } | |
366 | ||
367 | return (d); | |
368 | } | |
369 | ||
370 | /* | |
371 | * To explain the following functions, it is useful to understand the four | |
372 | * kinds of shifts used in btree operation. First, a shift is a movement of | |
373 | * elements within a node. It is used to create gaps for inserting new | |
374 | * elements and children, or cover gaps created when things are removed. A | |
375 | * shift has two fundamental properties, each of which can be one of two | |
376 | * values, making four types of shifts. There is the direction of the shift | |
377 | * (left or right) and the shape of the shift (parallelogram or isoceles | |
378 | * trapezoid (shortened to trapezoid hereafter)). The shape distinction only | |
379 | * applies to shifts of core nodes. | |
380 | * | |
381 | * The names derive from the following imagining of the layout of a node: | |
382 | * | |
383 | * Elements: * * * * * * * ... * * * | |
384 | * Children: * * * * * * * * ... * * * | |
385 | * | |
386 | * This layout follows from the fact that the elements act as separators | |
387 | * between pairs of children, and that children root subtrees "below" the | |
388 | * current node. A left and right shift are fairly self-explanatory; a left | |
389 | * shift moves things to the left, while a right shift moves things to the | |
390 | * right. A parallelogram shift is a shift with the same number of elements | |
391 | * and children being moved, while a trapezoid shift is a shift that moves one | |
392 | * more children than elements. An example follows: | |
393 | * | |
394 | * A parallelogram shift could contain the following: | |
395 | * _______________ | |
396 | * \* * * * \ * * * ... * * * | |
397 | * * \ * * * *\ * * * ... * * * | |
398 | * --------------- | |
399 | * A trapezoid shift could contain the following: | |
400 | * ___________ | |
401 | * * / * * * \ * * * ... * * * | |
402 | * * / * * * *\ * * * ... * * * | |
403 | * --------------- | |
404 | * | |
dd4bc569 | 405 | * Note that a parallelogram shift is always shaped like a "left-leaning" |
ca577779 PD |
406 | * parallelogram, where the starting index of the children being moved is |
407 | * always one higher than the starting index of the elements being moved. No | |
408 | * "right-leaning" parallelogram shifts are needed (shifts where the starting | |
409 | * element index and starting child index being moved are the same) to achieve | |
410 | * any btree operations, so we ignore them. | |
411 | */ | |
412 | ||
413 | enum bt_shift_shape { | |
414 | BSS_TRAPEZOID, | |
415 | BSS_PARALLELOGRAM | |
416 | }; | |
417 | ||
418 | enum bt_shift_direction { | |
419 | BSD_LEFT, | |
420 | BSD_RIGHT | |
421 | }; | |
422 | ||
423 | /* | |
424 | * Shift elements and children in the provided core node by off spots. The | |
425 | * first element moved is idx, and count elements are moved. The shape of the | |
426 | * shift is determined by shape. The direction is determined by dir. | |
427 | */ | |
428 | static inline void | |
c0bf952c AM |
429 | bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, |
430 | uint32_t count, uint32_t off, enum bt_shift_shape shape, | |
ca577779 PD |
431 | enum bt_shift_direction dir) |
432 | { | |
433 | size_t size = tree->bt_elem_size; | |
c0bf952c | 434 | ASSERT(zfs_btree_is_core(&node->btc_hdr)); |
ca577779 PD |
435 | |
436 | uint8_t *e_start = node->btc_elems + idx * size; | |
c0bf952c AM |
437 | uint8_t *e_out = (dir == BSD_LEFT ? e_start - off * size : |
438 | e_start + off * size); | |
439 | bmov(e_start, e_out, count * size); | |
ca577779 PD |
440 | |
441 | zfs_btree_hdr_t **c_start = node->btc_children + idx + | |
442 | (shape == BSS_TRAPEZOID ? 0 : 1); | |
443 | zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off : | |
444 | c_start + off); | |
c0bf952c | 445 | uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); |
ca577779 PD |
446 | bmov(c_start, c_out, c_count * sizeof (*c_start)); |
447 | } | |
448 | ||
449 | /* | |
450 | * Shift elements and children in the provided core node left by one spot. | |
451 | * The first element moved is idx, and count elements are moved. The | |
452 | * shape of the shift is determined by trap; true if the shift is a trapezoid, | |
453 | * false if it is a parallelogram. | |
454 | */ | |
455 | static inline void | |
c0bf952c AM |
456 | bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, |
457 | uint32_t count, enum bt_shift_shape shape) | |
ca577779 PD |
458 | { |
459 | bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT); | |
460 | } | |
461 | ||
462 | /* | |
463 | * Shift elements and children in the provided core node right by one spot. | |
464 | * Starts with elements[idx] and children[idx] and one more child than element. | |
465 | */ | |
466 | static inline void | |
c0bf952c AM |
467 | bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, |
468 | uint32_t count, enum bt_shift_shape shape) | |
ca577779 PD |
469 | { |
470 | bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT); | |
471 | } | |
472 | ||
473 | /* | |
474 | * Shift elements and children in the provided leaf node by off spots. | |
475 | * The first element moved is idx, and count elements are moved. The direction | |
476 | * is determined by left. | |
477 | */ | |
478 | static inline void | |
c0bf952c AM |
479 | bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint32_t idx, |
480 | uint32_t count, uint32_t off, enum bt_shift_direction dir) | |
ca577779 PD |
481 | { |
482 | size_t size = tree->bt_elem_size; | |
c0bf952c AM |
483 | zfs_btree_hdr_t *hdr = &node->btl_hdr; |
484 | ASSERT(!zfs_btree_is_core(hdr)); | |
ca577779 | 485 | |
c0bf952c AM |
486 | if (count == 0) |
487 | return; | |
488 | uint8_t *start = node->btl_elems + (hdr->bth_first + idx) * size; | |
489 | uint8_t *out = (dir == BSD_LEFT ? start - off * size : | |
490 | start + off * size); | |
ca577779 PD |
491 | bmov(start, out, count * size); |
492 | } | |
493 | ||
c0bf952c AM |
494 | /* |
495 | * Grow leaf for n new elements before idx. | |
496 | */ | |
497 | static void | |
498 | bt_grow_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx, | |
499 | uint32_t n) | |
ca577779 | 500 | { |
c0bf952c AM |
501 | zfs_btree_hdr_t *hdr = &leaf->btl_hdr; |
502 | ASSERT(!zfs_btree_is_core(hdr)); | |
503 | ASSERT3U(idx, <=, hdr->bth_count); | |
504 | uint32_t capacity = tree->bt_leaf_cap; | |
505 | ASSERT3U(hdr->bth_count + n, <=, capacity); | |
506 | boolean_t cl = (hdr->bth_first >= n); | |
507 | boolean_t cr = (hdr->bth_first + hdr->bth_count + n <= capacity); | |
508 | ||
509 | if (cl && (!cr || idx <= hdr->bth_count / 2)) { | |
510 | /* Grow left. */ | |
511 | hdr->bth_first -= n; | |
512 | bt_shift_leaf(tree, leaf, n, idx, n, BSD_LEFT); | |
513 | } else if (cr) { | |
514 | /* Grow right. */ | |
515 | bt_shift_leaf(tree, leaf, idx, hdr->bth_count - idx, n, | |
516 | BSD_RIGHT); | |
517 | } else { | |
518 | /* Grow both ways. */ | |
519 | uint32_t fn = hdr->bth_first - | |
520 | (capacity - (hdr->bth_count + n)) / 2; | |
521 | hdr->bth_first -= fn; | |
522 | bt_shift_leaf(tree, leaf, fn, idx, fn, BSD_LEFT); | |
523 | bt_shift_leaf(tree, leaf, fn + idx, hdr->bth_count - idx, | |
524 | n - fn, BSD_RIGHT); | |
525 | } | |
526 | hdr->bth_count += n; | |
ca577779 PD |
527 | } |
528 | ||
c0bf952c AM |
529 | /* |
530 | * Shrink leaf for count elements starting from idx. | |
531 | */ | |
532 | static void | |
533 | bt_shrink_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx, | |
534 | uint32_t n) | |
ca577779 | 535 | { |
c0bf952c AM |
536 | zfs_btree_hdr_t *hdr = &leaf->btl_hdr; |
537 | ASSERT(!zfs_btree_is_core(hdr)); | |
538 | ASSERT3U(idx, <=, hdr->bth_count); | |
539 | ASSERT3U(idx + n, <=, hdr->bth_count); | |
540 | ||
541 | if (idx <= (hdr->bth_count - n) / 2) { | |
542 | bt_shift_leaf(tree, leaf, 0, idx, n, BSD_RIGHT); | |
543 | zfs_btree_poison_node_at(tree, hdr, 0, n); | |
544 | hdr->bth_first += n; | |
545 | } else { | |
546 | bt_shift_leaf(tree, leaf, idx + n, hdr->bth_count - idx - n, n, | |
547 | BSD_LEFT); | |
548 | zfs_btree_poison_node_at(tree, hdr, hdr->bth_count - n, n); | |
549 | } | |
550 | hdr->bth_count -= n; | |
ca577779 PD |
551 | } |
552 | ||
553 | /* | |
554 | * Move children and elements from one core node to another. The shape | |
555 | * parameter behaves the same as it does in the shift logic. | |
556 | */ | |
557 | static inline void | |
c0bf952c AM |
558 | bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint32_t sidx, |
559 | uint32_t count, zfs_btree_core_t *dest, uint32_t didx, | |
ca577779 PD |
560 | enum bt_shift_shape shape) |
561 | { | |
562 | size_t size = tree->bt_elem_size; | |
c0bf952c AM |
563 | ASSERT(zfs_btree_is_core(&source->btc_hdr)); |
564 | ASSERT(zfs_btree_is_core(&dest->btc_hdr)); | |
ca577779 | 565 | |
c0bf952c | 566 | bcpy(source->btc_elems + sidx * size, dest->btc_elems + didx * size, |
ca577779 PD |
567 | count * size); |
568 | ||
c0bf952c AM |
569 | uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); |
570 | bcpy(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1), | |
ca577779 PD |
571 | dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1), |
572 | c_count * sizeof (*source->btc_children)); | |
573 | } | |
574 | ||
575 | static inline void | |
c0bf952c AM |
576 | bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint32_t sidx, |
577 | uint32_t count, zfs_btree_leaf_t *dest, uint32_t didx) | |
ca577779 PD |
578 | { |
579 | size_t size = tree->bt_elem_size; | |
c0bf952c AM |
580 | ASSERT(!zfs_btree_is_core(&source->btl_hdr)); |
581 | ASSERT(!zfs_btree_is_core(&dest->btl_hdr)); | |
ca577779 | 582 | |
c0bf952c AM |
583 | bcpy(source->btl_elems + (source->btl_hdr.bth_first + sidx) * size, |
584 | dest->btl_elems + (dest->btl_hdr.bth_first + didx) * size, | |
ca577779 PD |
585 | count * size); |
586 | } | |
587 | ||
588 | /* | |
589 | * Find the first element in the subtree rooted at hdr, return its value and | |
590 | * put its location in where if non-null. | |
591 | */ | |
592 | static void * | |
c0bf952c AM |
593 | zfs_btree_first_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, |
594 | zfs_btree_index_t *where) | |
ca577779 PD |
595 | { |
596 | zfs_btree_hdr_t *node; | |
597 | ||
c0bf952c AM |
598 | for (node = hdr; zfs_btree_is_core(node); |
599 | node = ((zfs_btree_core_t *)node)->btc_children[0]) | |
ca577779 PD |
600 | ; |
601 | ||
c0bf952c | 602 | ASSERT(!zfs_btree_is_core(node)); |
ca577779 PD |
603 | zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; |
604 | if (where != NULL) { | |
605 | where->bti_node = node; | |
606 | where->bti_offset = 0; | |
607 | where->bti_before = B_FALSE; | |
608 | } | |
c0bf952c | 609 | return (&leaf->btl_elems[node->bth_first * tree->bt_elem_size]); |
ca577779 PD |
610 | } |
611 | ||
612 | /* Insert an element and a child into a core node at the given offset. */ | |
613 | static void | |
614 | zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent, | |
c0bf952c | 615 | uint32_t offset, zfs_btree_hdr_t *new_node, void *buf) |
ca577779 | 616 | { |
c0bf952c | 617 | size_t size = tree->bt_elem_size; |
ca577779 PD |
618 | zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; |
619 | ASSERT3P(par_hdr, ==, new_node->bth_parent); | |
620 | ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS); | |
621 | ||
622 | if (zfs_btree_verify_intensity >= 5) { | |
623 | zfs_btree_verify_poison_at(tree, par_hdr, | |
624 | par_hdr->bth_count); | |
625 | } | |
626 | /* Shift existing elements and children */ | |
c0bf952c | 627 | uint32_t count = par_hdr->bth_count - offset; |
ca577779 PD |
628 | bt_shift_core_right(tree, parent, offset, count, |
629 | BSS_PARALLELOGRAM); | |
630 | ||
631 | /* Insert new values */ | |
632 | parent->btc_children[offset + 1] = new_node; | |
c0bf952c | 633 | bcpy(buf, parent->btc_elems + offset * size, size); |
ca577779 PD |
634 | par_hdr->bth_count++; |
635 | } | |
636 | ||
637 | /* | |
638 | * Insert new_node into the parent of old_node directly after old_node, with | |
639 | * buf as the dividing element between the two. | |
640 | */ | |
641 | static void | |
642 | zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, | |
643 | zfs_btree_hdr_t *new_node, void *buf) | |
644 | { | |
645 | ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent); | |
c0bf952c | 646 | size_t size = tree->bt_elem_size; |
ca577779 | 647 | zfs_btree_core_t *parent = old_node->bth_parent; |
ca577779 PD |
648 | |
649 | /* | |
650 | * If this is the root node we were splitting, we create a new root | |
651 | * and increase the height of the tree. | |
652 | */ | |
653 | if (parent == NULL) { | |
654 | ASSERT3P(old_node, ==, tree->bt_root); | |
655 | tree->bt_num_nodes++; | |
656 | zfs_btree_core_t *new_root = | |
657 | kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * | |
658 | size, KM_SLEEP); | |
659 | zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr; | |
660 | new_root_hdr->bth_parent = NULL; | |
c0bf952c | 661 | new_root_hdr->bth_first = -1; |
ca577779 PD |
662 | new_root_hdr->bth_count = 1; |
663 | ||
664 | old_node->bth_parent = new_node->bth_parent = new_root; | |
665 | new_root->btc_children[0] = old_node; | |
666 | new_root->btc_children[1] = new_node; | |
c0bf952c | 667 | bcpy(buf, new_root->btc_elems, size); |
ca577779 PD |
668 | |
669 | tree->bt_height++; | |
670 | tree->bt_root = new_root_hdr; | |
671 | zfs_btree_poison_node(tree, new_root_hdr); | |
672 | return; | |
673 | } | |
674 | ||
675 | /* | |
676 | * Since we have the new separator, binary search for where to put | |
677 | * new_node. | |
678 | */ | |
63652e15 | 679 | zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; |
ca577779 | 680 | zfs_btree_index_t idx; |
c0bf952c | 681 | ASSERT(zfs_btree_is_core(par_hdr)); |
677c6f84 | 682 | VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems, |
ca577779 PD |
683 | par_hdr->bth_count, buf, &idx), ==, NULL); |
684 | ASSERT(idx.bti_before); | |
c0bf952c | 685 | uint32_t offset = idx.bti_offset; |
ca577779 PD |
686 | ASSERT3U(offset, <=, par_hdr->bth_count); |
687 | ASSERT3P(parent->btc_children[offset], ==, old_node); | |
688 | ||
689 | /* | |
dd4bc569 | 690 | * If the parent isn't full, shift things to accommodate our insertions |
ca577779 PD |
691 | * and return. |
692 | */ | |
693 | if (par_hdr->bth_count != BTREE_CORE_ELEMS) { | |
694 | zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf); | |
695 | return; | |
696 | } | |
697 | ||
698 | /* | |
699 | * We need to split this core node into two. Currently there are | |
700 | * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for | |
701 | * BTREE_CORE_ELEMS + 2. Some of the children will be part of the | |
702 | * current node, and the others will be moved to the new core node. | |
703 | * There are BTREE_CORE_ELEMS + 1 elements including the new one. One | |
704 | * will be used as the new separator in our parent, and the others | |
705 | * will be split among the two core nodes. | |
706 | * | |
707 | * Usually we will split the node in half evenly, with | |
708 | * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we | |
709 | * instead move only about a quarter of the elements (and children) to | |
710 | * the new node. Since the average state after a long time is a 3/4 | |
711 | * full node, shortcutting directly to that state improves efficiency. | |
712 | * | |
713 | * We do this in two stages: first we split into two nodes, and then we | |
714 | * reuse our existing logic to insert the new element and child. | |
715 | */ | |
c0bf952c | 716 | uint32_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ? |
ca577779 | 717 | 2 : 4)) - 1, 2); |
c0bf952c | 718 | uint32_t keep_count = BTREE_CORE_ELEMS - move_count - 1; |
ca577779 PD |
719 | ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2); |
720 | tree->bt_num_nodes++; | |
721 | zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) + | |
722 | BTREE_CORE_ELEMS * size, KM_SLEEP); | |
723 | zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr; | |
724 | new_par_hdr->bth_parent = par_hdr->bth_parent; | |
c0bf952c | 725 | new_par_hdr->bth_first = -1; |
ca577779 PD |
726 | new_par_hdr->bth_count = move_count; |
727 | zfs_btree_poison_node(tree, new_par_hdr); | |
728 | ||
729 | par_hdr->bth_count = keep_count; | |
730 | ||
731 | bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent, | |
732 | 0, BSS_TRAPEZOID); | |
733 | ||
734 | /* Store the new separator in a buffer. */ | |
735 | uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); | |
c0bf952c | 736 | bcpy(parent->btc_elems + keep_count * size, tmp_buf, |
ca577779 PD |
737 | size); |
738 | zfs_btree_poison_node(tree, par_hdr); | |
739 | ||
740 | if (offset < keep_count) { | |
741 | /* Insert the new node into the left half */ | |
742 | zfs_btree_insert_core_impl(tree, parent, offset, new_node, | |
743 | buf); | |
744 | ||
745 | /* | |
746 | * Move the new separator to the existing buffer. | |
747 | */ | |
c0bf952c | 748 | bcpy(tmp_buf, buf, size); |
ca577779 PD |
749 | } else if (offset > keep_count) { |
750 | /* Insert the new node into the right half */ | |
751 | new_node->bth_parent = new_parent; | |
752 | zfs_btree_insert_core_impl(tree, new_parent, | |
753 | offset - keep_count - 1, new_node, buf); | |
754 | ||
755 | /* | |
756 | * Move the new separator to the existing buffer. | |
757 | */ | |
c0bf952c | 758 | bcpy(tmp_buf, buf, size); |
ca577779 PD |
759 | } else { |
760 | /* | |
761 | * Move the new separator into the right half, and replace it | |
762 | * with buf. We also need to shift back the elements in the | |
dd4bc569 | 763 | * right half to accommodate new_node. |
ca577779 PD |
764 | */ |
765 | bt_shift_core_right(tree, new_parent, 0, move_count, | |
766 | BSS_TRAPEZOID); | |
767 | new_parent->btc_children[0] = new_node; | |
c0bf952c | 768 | bcpy(tmp_buf, new_parent->btc_elems, size); |
ca577779 PD |
769 | new_par_hdr->bth_count++; |
770 | } | |
771 | kmem_free(tmp_buf, size); | |
772 | zfs_btree_poison_node(tree, par_hdr); | |
773 | ||
c0bf952c | 774 | for (uint32_t i = 0; i <= new_parent->btc_hdr.bth_count; i++) |
ca577779 PD |
775 | new_parent->btc_children[i]->bth_parent = new_parent; |
776 | ||
c0bf952c | 777 | for (uint32_t i = 0; i <= parent->btc_hdr.bth_count; i++) |
ca577779 PD |
778 | ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent); |
779 | ||
780 | /* | |
781 | * Now that the node is split, we need to insert the new node into its | |
782 | * parent. This may cause further splitting. | |
783 | */ | |
784 | zfs_btree_insert_into_parent(tree, &parent->btc_hdr, | |
785 | &new_parent->btc_hdr, buf); | |
786 | } | |
787 | ||
788 | /* Insert an element into a leaf node at the given offset. */ | |
789 | static void | |
790 | zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, | |
c0bf952c | 791 | uint32_t idx, const void *value) |
ca577779 | 792 | { |
c0bf952c | 793 | size_t size = tree->bt_elem_size; |
ca577779 | 794 | zfs_btree_hdr_t *hdr = &leaf->btl_hdr; |
c0bf952c | 795 | ASSERT3U(leaf->btl_hdr.bth_count, <, tree->bt_leaf_cap); |
ca577779 PD |
796 | |
797 | if (zfs_btree_verify_intensity >= 5) { | |
798 | zfs_btree_verify_poison_at(tree, &leaf->btl_hdr, | |
799 | leaf->btl_hdr.bth_count); | |
800 | } | |
801 | ||
c0bf952c AM |
802 | bt_grow_leaf(tree, leaf, idx, 1); |
803 | uint8_t *start = leaf->btl_elems + (hdr->bth_first + idx) * size; | |
804 | bcpy(value, start, size); | |
ca577779 PD |
805 | } |
806 | ||
c0bf952c AM |
807 | static void |
808 | zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr); | |
809 | ||
ca577779 PD |
810 | /* Helper function for inserting a new value into leaf at the given index. */ |
811 | static void | |
812 | zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, | |
c0bf952c | 813 | const void *value, uint32_t idx) |
ca577779 | 814 | { |
c0bf952c AM |
815 | size_t size = tree->bt_elem_size; |
816 | uint32_t capacity = tree->bt_leaf_cap; | |
ca577779 PD |
817 | |
818 | /* | |
819 | * If the leaf isn't full, shift the elements after idx and insert | |
820 | * value. | |
821 | */ | |
822 | if (leaf->btl_hdr.bth_count != capacity) { | |
823 | zfs_btree_insert_leaf_impl(tree, leaf, idx, value); | |
824 | return; | |
825 | } | |
826 | ||
827 | /* | |
828 | * Otherwise, we split the leaf node into two nodes. If we're not bulk | |
829 | * inserting, each is of size (capacity / 2). If we are bulk | |
830 | * inserting, we move a quarter of the elements to the new node so | |
831 | * inserts into the old node don't cause immediate splitting but the | |
832 | * tree stays relatively dense. Since the average state after a long | |
833 | * time is a 3/4 full node, shortcutting directly to that state | |
834 | * improves efficiency. At the end of the bulk insertion process | |
835 | * we'll need to go through and fix up any nodes (the last leaf and | |
836 | * its ancestors, potentially) that are below the minimum. | |
837 | * | |
838 | * In either case, we're left with one extra element. The leftover | |
839 | * element will become the new dividing element between the two nodes. | |
840 | */ | |
c0bf952c AM |
841 | uint32_t move_count = MAX(capacity / (tree->bt_bulk ? 4 : 2), 1) - 1; |
842 | uint32_t keep_count = capacity - move_count - 1; | |
843 | ASSERT3U(keep_count, >=, 1); | |
844 | /* If we insert on left. move one more to keep leaves balanced. */ | |
845 | if (idx < keep_count) { | |
846 | keep_count--; | |
847 | move_count++; | |
848 | } | |
ca577779 | 849 | tree->bt_num_nodes++; |
9dcdee78 | 850 | zfs_btree_leaf_t *new_leaf = zfs_btree_leaf_alloc(tree); |
ca577779 PD |
851 | zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr; |
852 | new_hdr->bth_parent = leaf->btl_hdr.bth_parent; | |
c0bf952c AM |
853 | new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) + |
854 | (idx >= keep_count && idx <= keep_count + move_count / 2); | |
ca577779 PD |
855 | new_hdr->bth_count = move_count; |
856 | zfs_btree_poison_node(tree, new_hdr); | |
857 | ||
ca577779 PD |
858 | if (tree->bt_bulk != NULL && leaf == tree->bt_bulk) |
859 | tree->bt_bulk = new_leaf; | |
860 | ||
861 | /* Copy the back part to the new leaf. */ | |
c0bf952c | 862 | bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, 0); |
ca577779 PD |
863 | |
864 | /* We store the new separator in a buffer we control for simplicity. */ | |
865 | uint8_t *buf = kmem_alloc(size, KM_SLEEP); | |
c0bf952c AM |
866 | bcpy(leaf->btl_elems + (leaf->btl_hdr.bth_first + keep_count) * size, |
867 | buf, size); | |
868 | ||
869 | bt_shrink_leaf(tree, leaf, keep_count, 1 + move_count); | |
ca577779 PD |
870 | |
871 | if (idx < keep_count) { | |
872 | /* Insert into the existing leaf. */ | |
873 | zfs_btree_insert_leaf_impl(tree, leaf, idx, value); | |
874 | } else if (idx > keep_count) { | |
875 | /* Insert into the new leaf. */ | |
876 | zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count - | |
877 | 1, value); | |
878 | } else { | |
879 | /* | |
c0bf952c AM |
880 | * Insert planned separator into the new leaf, and use |
881 | * the new value as the new separator. | |
ca577779 | 882 | */ |
c0bf952c AM |
883 | zfs_btree_insert_leaf_impl(tree, new_leaf, 0, buf); |
884 | bcpy(value, buf, size); | |
ca577779 PD |
885 | } |
886 | ||
887 | /* | |
888 | * Now that the node is split, we need to insert the new node into its | |
889 | * parent. This may cause further splitting, bur only of core nodes. | |
890 | */ | |
891 | zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr, | |
892 | buf); | |
893 | kmem_free(buf, size); | |
894 | } | |
895 | ||
c0bf952c | 896 | static uint32_t |
ca577779 PD |
897 | zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) |
898 | { | |
899 | void *buf; | |
c0bf952c | 900 | if (zfs_btree_is_core(hdr)) { |
ca577779 PD |
901 | buf = ((zfs_btree_core_t *)hdr)->btc_elems; |
902 | } else { | |
c0bf952c AM |
903 | buf = ((zfs_btree_leaf_t *)hdr)->btl_elems + |
904 | hdr->bth_first * tree->bt_elem_size; | |
ca577779 PD |
905 | } |
906 | zfs_btree_index_t idx; | |
907 | zfs_btree_core_t *parent = hdr->bth_parent; | |
677c6f84 | 908 | VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems, |
ca577779 PD |
909 | parent->btc_hdr.bth_count, buf, &idx), ==, NULL); |
910 | ASSERT(idx.bti_before); | |
911 | ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count); | |
912 | ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr); | |
913 | return (idx.bti_offset); | |
914 | } | |
915 | ||
916 | /* | |
917 | * Take the b-tree out of bulk insert mode. During bulk-insert mode, some | |
918 | * nodes may violate the invariant that non-root nodes must be at least half | |
919 | * full. All nodes violating this invariant should be the last node in their | |
920 | * particular level. To correct the invariant, we take values from their left | |
921 | * neighbor until they are half full. They must have a left neighbor at their | |
922 | * level because the last node at a level is not the first node unless it's | |
923 | * the root. | |
924 | */ | |
925 | static void | |
926 | zfs_btree_bulk_finish(zfs_btree_t *tree) | |
927 | { | |
928 | ASSERT3P(tree->bt_bulk, !=, NULL); | |
929 | ASSERT3P(tree->bt_root, !=, NULL); | |
930 | zfs_btree_leaf_t *leaf = tree->bt_bulk; | |
931 | zfs_btree_hdr_t *hdr = &leaf->btl_hdr; | |
932 | zfs_btree_core_t *parent = hdr->bth_parent; | |
c0bf952c AM |
933 | size_t size = tree->bt_elem_size; |
934 | uint32_t capacity = tree->bt_leaf_cap; | |
ca577779 PD |
935 | |
936 | /* | |
937 | * The invariant doesn't apply to the root node, if that's the only | |
938 | * node in the tree we're done. | |
939 | */ | |
940 | if (parent == NULL) { | |
941 | tree->bt_bulk = NULL; | |
942 | return; | |
943 | } | |
944 | ||
945 | /* First, take elements to rebalance the leaf node. */ | |
946 | if (hdr->bth_count < capacity / 2) { | |
947 | /* | |
948 | * First, find the left neighbor. The simplest way to do this | |
949 | * is to call zfs_btree_prev twice; the first time finds some | |
950 | * ancestor of this node, and the second time finds the left | |
951 | * neighbor. The ancestor found is the lowest common ancestor | |
952 | * of leaf and the neighbor. | |
953 | */ | |
954 | zfs_btree_index_t idx = { | |
955 | .bti_node = hdr, | |
956 | .bti_offset = 0 | |
957 | }; | |
958 | VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); | |
c0bf952c | 959 | ASSERT(zfs_btree_is_core(idx.bti_node)); |
ca577779 | 960 | zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node; |
c0bf952c | 961 | uint32_t common_idx = idx.bti_offset; |
ca577779 PD |
962 | |
963 | VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); | |
c0bf952c | 964 | ASSERT(!zfs_btree_is_core(idx.bti_node)); |
ca577779 PD |
965 | zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node; |
966 | zfs_btree_hdr_t *l_hdr = idx.bti_node; | |
c0bf952c | 967 | uint32_t move_count = (capacity / 2) - hdr->bth_count; |
ca577779 PD |
968 | ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=, |
969 | capacity / 2); | |
970 | ||
971 | if (zfs_btree_verify_intensity >= 5) { | |
c0bf952c | 972 | for (uint32_t i = 0; i < move_count; i++) { |
ca577779 PD |
973 | zfs_btree_verify_poison_at(tree, hdr, |
974 | leaf->btl_hdr.bth_count + i); | |
975 | } | |
976 | } | |
977 | ||
978 | /* First, shift elements in leaf back. */ | |
c0bf952c | 979 | bt_grow_leaf(tree, leaf, 0, move_count); |
ca577779 PD |
980 | |
981 | /* Next, move the separator from the common ancestor to leaf. */ | |
c0bf952c AM |
982 | uint8_t *separator = common->btc_elems + common_idx * size; |
983 | uint8_t *out = leaf->btl_elems + | |
984 | (hdr->bth_first + move_count - 1) * size; | |
985 | bcpy(separator, out, size); | |
ca577779 PD |
986 | |
987 | /* | |
988 | * Now we move elements from the tail of the left neighbor to | |
989 | * fill the remaining spots in leaf. | |
990 | */ | |
991 | bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count - | |
c0bf952c | 992 | (move_count - 1), move_count - 1, leaf, 0); |
ca577779 PD |
993 | |
994 | /* | |
995 | * Finally, move the new last element in the left neighbor to | |
996 | * the separator. | |
997 | */ | |
c0bf952c AM |
998 | bcpy(l_neighbor->btl_elems + (l_hdr->bth_first + |
999 | l_hdr->bth_count - move_count) * size, separator, size); | |
ca577779 PD |
1000 | |
1001 | /* Adjust the node's counts, and we're done. */ | |
c0bf952c AM |
1002 | bt_shrink_leaf(tree, l_neighbor, l_hdr->bth_count - move_count, |
1003 | move_count); | |
ca577779 PD |
1004 | |
1005 | ASSERT3U(l_hdr->bth_count, >=, capacity / 2); | |
1006 | ASSERT3U(hdr->bth_count, >=, capacity / 2); | |
ca577779 PD |
1007 | } |
1008 | ||
1009 | /* | |
1010 | * Now we have to rebalance any ancestors of leaf that may also | |
1011 | * violate the invariant. | |
1012 | */ | |
1013 | capacity = BTREE_CORE_ELEMS; | |
1014 | while (parent->btc_hdr.bth_parent != NULL) { | |
1015 | zfs_btree_core_t *cur = parent; | |
1016 | zfs_btree_hdr_t *hdr = &cur->btc_hdr; | |
1017 | parent = hdr->bth_parent; | |
1018 | /* | |
1019 | * If the invariant isn't violated, move on to the next | |
1020 | * ancestor. | |
1021 | */ | |
1022 | if (hdr->bth_count >= capacity / 2) | |
1023 | continue; | |
1024 | ||
1025 | /* | |
1026 | * Because the smallest number of nodes we can move when | |
1027 | * splitting is 2, we never need to worry about not having a | |
1028 | * left sibling (a sibling is a neighbor with the same parent). | |
1029 | */ | |
c0bf952c | 1030 | uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); |
ca577779 PD |
1031 | ASSERT3U(parent_idx, >, 0); |
1032 | zfs_btree_core_t *l_neighbor = | |
1033 | (zfs_btree_core_t *)parent->btc_children[parent_idx - 1]; | |
c0bf952c | 1034 | uint32_t move_count = (capacity / 2) - hdr->bth_count; |
ca577779 PD |
1035 | ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=, |
1036 | capacity / 2); | |
1037 | ||
1038 | if (zfs_btree_verify_intensity >= 5) { | |
c0bf952c | 1039 | for (uint32_t i = 0; i < move_count; i++) { |
ca577779 PD |
1040 | zfs_btree_verify_poison_at(tree, hdr, |
1041 | hdr->bth_count + i); | |
1042 | } | |
1043 | } | |
1044 | /* First, shift things in the right node back. */ | |
1045 | bt_shift_core(tree, cur, 0, hdr->bth_count, move_count, | |
1046 | BSS_TRAPEZOID, BSD_RIGHT); | |
1047 | ||
1048 | /* Next, move the separator to the right node. */ | |
1049 | uint8_t *separator = parent->btc_elems + ((parent_idx - 1) * | |
1050 | size); | |
1051 | uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size); | |
c0bf952c | 1052 | bcpy(separator, e_out, size); |
ca577779 PD |
1053 | |
1054 | /* | |
1055 | * Now, move elements and children from the left node to the | |
1056 | * right. We move one more child than elements. | |
1057 | */ | |
1058 | move_count--; | |
c0bf952c | 1059 | uint32_t move_idx = l_neighbor->btc_hdr.bth_count - move_count; |
ca577779 PD |
1060 | bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0, |
1061 | BSS_TRAPEZOID); | |
1062 | ||
1063 | /* | |
1064 | * Finally, move the last element in the left node to the | |
1065 | * separator's position. | |
1066 | */ | |
1067 | move_idx--; | |
c0bf952c | 1068 | bcpy(l_neighbor->btc_elems + move_idx * size, separator, size); |
ca577779 PD |
1069 | |
1070 | l_neighbor->btc_hdr.bth_count -= move_count + 1; | |
1071 | hdr->bth_count += move_count + 1; | |
1072 | ||
1073 | ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2); | |
1074 | ASSERT3U(hdr->bth_count, >=, capacity / 2); | |
1075 | ||
1076 | zfs_btree_poison_node(tree, &l_neighbor->btc_hdr); | |
1077 | ||
c0bf952c | 1078 | for (uint32_t i = 0; i <= hdr->bth_count; i++) |
ca577779 PD |
1079 | cur->btc_children[i]->bth_parent = cur; |
1080 | } | |
1081 | ||
1082 | tree->bt_bulk = NULL; | |
c0bf952c | 1083 | zfs_btree_verify(tree); |
ca577779 PD |
1084 | } |
1085 | ||
1086 | /* | |
1087 | * Insert value into tree at the location specified by where. | |
1088 | */ | |
1089 | void | |
516a83f8 | 1090 | zfs_btree_add_idx(zfs_btree_t *tree, const void *value, |
ca577779 PD |
1091 | const zfs_btree_index_t *where) |
1092 | { | |
1093 | zfs_btree_index_t idx = {0}; | |
1094 | ||
1095 | /* If we're not inserting in the last leaf, end bulk insert mode. */ | |
1096 | if (tree->bt_bulk != NULL) { | |
1097 | if (where->bti_node != &tree->bt_bulk->btl_hdr) { | |
1098 | zfs_btree_bulk_finish(tree); | |
1099 | VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL); | |
1100 | where = &idx; | |
1101 | } | |
1102 | } | |
1103 | ||
1104 | tree->bt_num_elems++; | |
1105 | /* | |
1106 | * If this is the first element in the tree, create a leaf root node | |
1107 | * and add the value to it. | |
1108 | */ | |
1109 | if (where->bti_node == NULL) { | |
1110 | ASSERT3U(tree->bt_num_elems, ==, 1); | |
1111 | ASSERT3S(tree->bt_height, ==, -1); | |
1112 | ASSERT3P(tree->bt_root, ==, NULL); | |
1113 | ASSERT0(where->bti_offset); | |
1114 | ||
1115 | tree->bt_num_nodes++; | |
9dcdee78 | 1116 | zfs_btree_leaf_t *leaf = zfs_btree_leaf_alloc(tree); |
ca577779 PD |
1117 | tree->bt_root = &leaf->btl_hdr; |
1118 | tree->bt_height++; | |
1119 | ||
1120 | zfs_btree_hdr_t *hdr = &leaf->btl_hdr; | |
1121 | hdr->bth_parent = NULL; | |
c0bf952c | 1122 | hdr->bth_first = 0; |
ca577779 PD |
1123 | hdr->bth_count = 0; |
1124 | zfs_btree_poison_node(tree, hdr); | |
1125 | ||
1126 | zfs_btree_insert_into_leaf(tree, leaf, value, 0); | |
1127 | tree->bt_bulk = leaf; | |
c0bf952c | 1128 | } else if (!zfs_btree_is_core(where->bti_node)) { |
ca577779 PD |
1129 | /* |
1130 | * If we're inserting into a leaf, go directly to the helper | |
1131 | * function. | |
1132 | */ | |
1133 | zfs_btree_insert_into_leaf(tree, | |
1134 | (zfs_btree_leaf_t *)where->bti_node, value, | |
1135 | where->bti_offset); | |
1136 | } else { | |
1137 | /* | |
1138 | * If we're inserting into a core node, we can't just shift | |
1139 | * the existing element in that slot in the same node without | |
1140 | * breaking our ordering invariants. Instead we place the new | |
1141 | * value in the node at that spot and then insert the old | |
1142 | * separator into the first slot in the subtree to the right. | |
1143 | */ | |
ca577779 PD |
1144 | zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node; |
1145 | ||
1146 | /* | |
1147 | * We can ignore bti_before, because either way the value | |
1148 | * should end up in bti_offset. | |
1149 | */ | |
c0bf952c | 1150 | uint32_t off = where->bti_offset; |
ca577779 PD |
1151 | zfs_btree_hdr_t *subtree = node->btc_children[off + 1]; |
1152 | size_t size = tree->bt_elem_size; | |
1153 | uint8_t *buf = kmem_alloc(size, KM_SLEEP); | |
c0bf952c AM |
1154 | bcpy(node->btc_elems + off * size, buf, size); |
1155 | bcpy(value, node->btc_elems + off * size, size); | |
ca577779 PD |
1156 | |
1157 | /* | |
1158 | * Find the first slot in the subtree to the right, insert | |
1159 | * there. | |
1160 | */ | |
1161 | zfs_btree_index_t new_idx; | |
c0bf952c AM |
1162 | VERIFY3P(zfs_btree_first_helper(tree, subtree, &new_idx), !=, |
1163 | NULL); | |
ca577779 | 1164 | ASSERT0(new_idx.bti_offset); |
c0bf952c | 1165 | ASSERT(!zfs_btree_is_core(new_idx.bti_node)); |
ca577779 PD |
1166 | zfs_btree_insert_into_leaf(tree, |
1167 | (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0); | |
1168 | kmem_free(buf, size); | |
1169 | } | |
1170 | zfs_btree_verify(tree); | |
1171 | } | |
1172 | ||
1173 | /* | |
1174 | * Return the first element in the tree, and put its location in where if | |
1175 | * non-null. | |
1176 | */ | |
1177 | void * | |
1178 | zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where) | |
1179 | { | |
1180 | if (tree->bt_height == -1) { | |
1181 | ASSERT0(tree->bt_num_elems); | |
1182 | return (NULL); | |
1183 | } | |
c0bf952c | 1184 | return (zfs_btree_first_helper(tree, tree->bt_root, where)); |
ca577779 PD |
1185 | } |
1186 | ||
1187 | /* | |
1188 | * Find the last element in the subtree rooted at hdr, return its value and | |
1189 | * put its location in where if non-null. | |
1190 | */ | |
1191 | static void * | |
1192 | zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr, | |
1193 | zfs_btree_index_t *where) | |
1194 | { | |
1195 | zfs_btree_hdr_t *node; | |
1196 | ||
c0bf952c | 1197 | for (node = hdr; zfs_btree_is_core(node); node = |
ca577779 PD |
1198 | ((zfs_btree_core_t *)node)->btc_children[node->bth_count]) |
1199 | ; | |
1200 | ||
1201 | zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; | |
1202 | if (where != NULL) { | |
1203 | where->bti_node = node; | |
1204 | where->bti_offset = node->bth_count - 1; | |
1205 | where->bti_before = B_FALSE; | |
1206 | } | |
c0bf952c AM |
1207 | return (leaf->btl_elems + (node->bth_first + node->bth_count - 1) * |
1208 | btree->bt_elem_size); | |
ca577779 PD |
1209 | } |
1210 | ||
1211 | /* | |
1212 | * Return the last element in the tree, and put its location in where if | |
1213 | * non-null. | |
1214 | */ | |
1215 | void * | |
1216 | zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where) | |
1217 | { | |
1218 | if (tree->bt_height == -1) { | |
1219 | ASSERT0(tree->bt_num_elems); | |
1220 | return (NULL); | |
1221 | } | |
1222 | return (zfs_btree_last_helper(tree, tree->bt_root, where)); | |
1223 | } | |
1224 | ||
1225 | /* | |
1226 | * This function contains the logic to find the next node in the tree. A | |
1227 | * helper function is used because there are multiple internal consumemrs of | |
1228 | * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each | |
1229 | * node after we've finished with it. | |
1230 | */ | |
1231 | static void * | |
1232 | zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx, | |
1233 | zfs_btree_index_t *out_idx, | |
1234 | void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *)) | |
1235 | { | |
1236 | if (idx->bti_node == NULL) { | |
1237 | ASSERT3S(tree->bt_height, ==, -1); | |
1238 | return (NULL); | |
1239 | } | |
1240 | ||
c0bf952c AM |
1241 | uint32_t offset = idx->bti_offset; |
1242 | if (!zfs_btree_is_core(idx->bti_node)) { | |
ca577779 PD |
1243 | /* |
1244 | * When finding the next element of an element in a leaf, | |
1245 | * there are two cases. If the element isn't the last one in | |
1246 | * the leaf, in which case we just return the next element in | |
1247 | * the leaf. Otherwise, we need to traverse up our parents | |
1248 | * until we find one where our ancestor isn't the last child | |
1249 | * of its parent. Once we do, the next element is the | |
1250 | * separator after our ancestor in its parent. | |
1251 | */ | |
1252 | zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; | |
c0bf952c | 1253 | uint32_t new_off = offset + (idx->bti_before ? 0 : 1); |
ca577779 PD |
1254 | if (leaf->btl_hdr.bth_count > new_off) { |
1255 | out_idx->bti_node = &leaf->btl_hdr; | |
1256 | out_idx->bti_offset = new_off; | |
1257 | out_idx->bti_before = B_FALSE; | |
c0bf952c AM |
1258 | return (leaf->btl_elems + (leaf->btl_hdr.bth_first + |
1259 | new_off) * tree->bt_elem_size); | |
ca577779 PD |
1260 | } |
1261 | ||
1262 | zfs_btree_hdr_t *prev = &leaf->btl_hdr; | |
1263 | for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; | |
1264 | node != NULL; node = node->btc_hdr.bth_parent) { | |
1265 | zfs_btree_hdr_t *hdr = &node->btc_hdr; | |
c0bf952c AM |
1266 | ASSERT(zfs_btree_is_core(hdr)); |
1267 | uint32_t i = zfs_btree_find_parent_idx(tree, prev); | |
ca577779 PD |
1268 | if (done_func != NULL) |
1269 | done_func(tree, prev); | |
1270 | if (i == hdr->bth_count) { | |
1271 | prev = hdr; | |
1272 | continue; | |
1273 | } | |
1274 | out_idx->bti_node = hdr; | |
1275 | out_idx->bti_offset = i; | |
1276 | out_idx->bti_before = B_FALSE; | |
1277 | return (node->btc_elems + i * tree->bt_elem_size); | |
1278 | } | |
1279 | if (done_func != NULL) | |
1280 | done_func(tree, prev); | |
1281 | /* | |
1282 | * We've traversed all the way up and been at the end of the | |
1283 | * node every time, so this was the last element in the tree. | |
1284 | */ | |
1285 | return (NULL); | |
1286 | } | |
1287 | ||
1288 | /* If we were before an element in a core node, return that element. */ | |
c0bf952c | 1289 | ASSERT(zfs_btree_is_core(idx->bti_node)); |
ca577779 PD |
1290 | zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; |
1291 | if (idx->bti_before) { | |
1292 | out_idx->bti_before = B_FALSE; | |
1293 | return (node->btc_elems + offset * tree->bt_elem_size); | |
1294 | } | |
1295 | ||
1296 | /* | |
1297 | * The next element from one in a core node is the first element in | |
1298 | * the subtree just to the right of the separator. | |
1299 | */ | |
1300 | zfs_btree_hdr_t *child = node->btc_children[offset + 1]; | |
c0bf952c | 1301 | return (zfs_btree_first_helper(tree, child, out_idx)); |
ca577779 PD |
1302 | } |
1303 | ||
1304 | /* | |
1305 | * Return the next valued node in the tree. The same address can be safely | |
1306 | * passed for idx and out_idx. | |
1307 | */ | |
1308 | void * | |
1309 | zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx, | |
1310 | zfs_btree_index_t *out_idx) | |
1311 | { | |
1312 | return (zfs_btree_next_helper(tree, idx, out_idx, NULL)); | |
1313 | } | |
1314 | ||
1315 | /* | |
1316 | * Return the previous valued node in the tree. The same value can be safely | |
1317 | * passed for idx and out_idx. | |
1318 | */ | |
1319 | void * | |
1320 | zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx, | |
1321 | zfs_btree_index_t *out_idx) | |
1322 | { | |
1323 | if (idx->bti_node == NULL) { | |
1324 | ASSERT3S(tree->bt_height, ==, -1); | |
1325 | return (NULL); | |
1326 | } | |
1327 | ||
c0bf952c AM |
1328 | uint32_t offset = idx->bti_offset; |
1329 | if (!zfs_btree_is_core(idx->bti_node)) { | |
ca577779 PD |
1330 | /* |
1331 | * When finding the previous element of an element in a leaf, | |
1332 | * there are two cases. If the element isn't the first one in | |
1333 | * the leaf, in which case we just return the previous element | |
1334 | * in the leaf. Otherwise, we need to traverse up our parents | |
1335 | * until we find one where our previous ancestor isn't the | |
1336 | * first child. Once we do, the previous element is the | |
1337 | * separator after our previous ancestor. | |
1338 | */ | |
1339 | zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; | |
1340 | if (offset != 0) { | |
1341 | out_idx->bti_node = &leaf->btl_hdr; | |
1342 | out_idx->bti_offset = offset - 1; | |
1343 | out_idx->bti_before = B_FALSE; | |
c0bf952c AM |
1344 | return (leaf->btl_elems + (leaf->btl_hdr.bth_first + |
1345 | offset - 1) * tree->bt_elem_size); | |
ca577779 PD |
1346 | } |
1347 | zfs_btree_hdr_t *prev = &leaf->btl_hdr; | |
1348 | for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; | |
1349 | node != NULL; node = node->btc_hdr.bth_parent) { | |
1350 | zfs_btree_hdr_t *hdr = &node->btc_hdr; | |
c0bf952c AM |
1351 | ASSERT(zfs_btree_is_core(hdr)); |
1352 | uint32_t i = zfs_btree_find_parent_idx(tree, prev); | |
ca577779 PD |
1353 | if (i == 0) { |
1354 | prev = hdr; | |
1355 | continue; | |
1356 | } | |
1357 | out_idx->bti_node = hdr; | |
1358 | out_idx->bti_offset = i - 1; | |
1359 | out_idx->bti_before = B_FALSE; | |
1360 | return (node->btc_elems + (i - 1) * tree->bt_elem_size); | |
1361 | } | |
1362 | /* | |
1363 | * We've traversed all the way up and been at the start of the | |
1364 | * node every time, so this was the first node in the tree. | |
1365 | */ | |
1366 | return (NULL); | |
1367 | } | |
1368 | ||
1369 | /* | |
1370 | * The previous element from one in a core node is the last element in | |
1371 | * the subtree just to the left of the separator. | |
1372 | */ | |
c0bf952c | 1373 | ASSERT(zfs_btree_is_core(idx->bti_node)); |
ca577779 PD |
1374 | zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; |
1375 | zfs_btree_hdr_t *child = node->btc_children[offset]; | |
1376 | return (zfs_btree_last_helper(tree, child, out_idx)); | |
1377 | } | |
1378 | ||
1379 | /* | |
1380 | * Get the value at the provided index in the tree. | |
1381 | * | |
1382 | * Note that the value returned from this function can be mutated, but only | |
1383 | * if it will not change the ordering of the element with respect to any other | |
1384 | * elements that could be in the tree. | |
1385 | */ | |
1386 | void * | |
1387 | zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx) | |
1388 | { | |
1389 | ASSERT(!idx->bti_before); | |
c0bf952c AM |
1390 | size_t size = tree->bt_elem_size; |
1391 | if (!zfs_btree_is_core(idx->bti_node)) { | |
ca577779 | 1392 | zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; |
c0bf952c AM |
1393 | return (leaf->btl_elems + (leaf->btl_hdr.bth_first + |
1394 | idx->bti_offset) * size); | |
ca577779 | 1395 | } |
ca577779 | 1396 | zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; |
c0bf952c | 1397 | return (node->btc_elems + idx->bti_offset * size); |
ca577779 PD |
1398 | } |
1399 | ||
1400 | /* Add the given value to the tree. Must not already be in the tree. */ | |
1401 | void | |
1402 | zfs_btree_add(zfs_btree_t *tree, const void *node) | |
1403 | { | |
1404 | zfs_btree_index_t where = {0}; | |
1405 | VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL); | |
516a83f8 | 1406 | zfs_btree_add_idx(tree, node, &where); |
ca577779 PD |
1407 | } |
1408 | ||
1409 | /* Helper function to free a tree node. */ | |
1410 | static void | |
1411 | zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node) | |
1412 | { | |
1413 | tree->bt_num_nodes--; | |
c0bf952c | 1414 | if (!zfs_btree_is_core(node)) { |
9dcdee78 | 1415 | zfs_btree_leaf_free(tree, node); |
ca577779 PD |
1416 | } else { |
1417 | kmem_free(node, sizeof (zfs_btree_core_t) + | |
1418 | BTREE_CORE_ELEMS * tree->bt_elem_size); | |
1419 | } | |
1420 | } | |
1421 | ||
1422 | /* | |
1423 | * Remove the rm_hdr and the separator to its left from the parent node. The | |
1424 | * buffer that rm_hdr was stored in may already be freed, so its contents | |
1425 | * cannot be accessed. | |
1426 | */ | |
1427 | static void | |
1428 | zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, | |
1429 | zfs_btree_hdr_t *rm_hdr) | |
1430 | { | |
1431 | size_t size = tree->bt_elem_size; | |
c0bf952c | 1432 | uint32_t min_count = (BTREE_CORE_ELEMS / 2) - 1; |
ca577779 PD |
1433 | zfs_btree_hdr_t *hdr = &node->btc_hdr; |
1434 | /* | |
1435 | * If the node is the root node and rm_hdr is one of two children, | |
1436 | * promote the other child to the root. | |
1437 | */ | |
1438 | if (hdr->bth_parent == NULL && hdr->bth_count <= 1) { | |
1439 | ASSERT3U(hdr->bth_count, ==, 1); | |
1440 | ASSERT3P(tree->bt_root, ==, node); | |
1441 | ASSERT3P(node->btc_children[1], ==, rm_hdr); | |
1442 | tree->bt_root = node->btc_children[0]; | |
1443 | node->btc_children[0]->bth_parent = NULL; | |
1444 | zfs_btree_node_destroy(tree, hdr); | |
1445 | tree->bt_height--; | |
1446 | return; | |
1447 | } | |
1448 | ||
c0bf952c | 1449 | uint32_t idx; |
ca577779 PD |
1450 | for (idx = 0; idx <= hdr->bth_count; idx++) { |
1451 | if (node->btc_children[idx] == rm_hdr) | |
1452 | break; | |
1453 | } | |
1454 | ASSERT3U(idx, <=, hdr->bth_count); | |
1455 | ||
1456 | /* | |
1457 | * If the node is the root or it has more than the minimum number of | |
1458 | * children, just remove the child and separator, and return. | |
1459 | */ | |
1460 | if (hdr->bth_parent == NULL || | |
1461 | hdr->bth_count > min_count) { | |
1462 | /* | |
1463 | * Shift the element and children to the right of rm_hdr to | |
1464 | * the left by one spot. | |
1465 | */ | |
1466 | bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, | |
1467 | BSS_PARALLELOGRAM); | |
1468 | hdr->bth_count--; | |
c0bf952c | 1469 | zfs_btree_poison_node_at(tree, hdr, hdr->bth_count, 1); |
ca577779 PD |
1470 | return; |
1471 | } | |
1472 | ||
1473 | ASSERT3U(hdr->bth_count, ==, min_count); | |
1474 | ||
1475 | /* | |
1476 | * Now we try to take a node from a neighbor. We check left, then | |
1477 | * right. If the neighbor exists and has more than the minimum number | |
dd4bc569 | 1478 | * of elements, we move the separator between us and them to our |
ca577779 PD |
1479 | * node, move their closest element (last for left, first for right) |
1480 | * to the separator, and move their closest child to our node. Along | |
1481 | * the way we need to collapse the gap made by idx, and (for our right | |
1482 | * neighbor) the gap made by removing their first element and child. | |
1483 | * | |
1484 | * Note: this logic currently doesn't support taking from a neighbor | |
1485 | * that isn't a sibling (i.e. a neighbor with a different | |
1486 | * parent). This isn't critical functionality, but may be worth | |
1487 | * implementing in the future for completeness' sake. | |
1488 | */ | |
1489 | zfs_btree_core_t *parent = hdr->bth_parent; | |
c0bf952c | 1490 | uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); |
ca577779 PD |
1491 | |
1492 | zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : | |
1493 | parent->btc_children[parent_idx - 1]); | |
1494 | if (l_hdr != NULL && l_hdr->bth_count > min_count) { | |
1495 | /* We can take a node from the left neighbor. */ | |
c0bf952c | 1496 | ASSERT(zfs_btree_is_core(l_hdr)); |
ca577779 PD |
1497 | zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr; |
1498 | ||
1499 | /* | |
1500 | * Start by shifting the elements and children in the current | |
1501 | * node to the right by one spot. | |
1502 | */ | |
1503 | bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID); | |
1504 | ||
1505 | /* | |
1506 | * Move the separator between node and neighbor to the first | |
1507 | * element slot in the current node. | |
1508 | */ | |
1509 | uint8_t *separator = parent->btc_elems + (parent_idx - 1) * | |
1510 | size; | |
c0bf952c | 1511 | bcpy(separator, node->btc_elems, size); |
ca577779 PD |
1512 | |
1513 | /* Move the last child of neighbor to our first child slot. */ | |
c0bf952c AM |
1514 | node->btc_children[0] = |
1515 | neighbor->btc_children[l_hdr->bth_count]; | |
ca577779 PD |
1516 | node->btc_children[0]->bth_parent = node; |
1517 | ||
1518 | /* Move the last element of neighbor to the separator spot. */ | |
1519 | uint8_t *take_elem = neighbor->btc_elems + | |
1520 | (l_hdr->bth_count - 1) * size; | |
c0bf952c | 1521 | bcpy(take_elem, separator, size); |
ca577779 | 1522 | l_hdr->bth_count--; |
c0bf952c | 1523 | zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count, 1); |
ca577779 PD |
1524 | return; |
1525 | } | |
1526 | ||
1527 | zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? | |
1528 | NULL : parent->btc_children[parent_idx + 1]); | |
1529 | if (r_hdr != NULL && r_hdr->bth_count > min_count) { | |
1530 | /* We can take a node from the right neighbor. */ | |
c0bf952c | 1531 | ASSERT(zfs_btree_is_core(r_hdr)); |
ca577779 PD |
1532 | zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr; |
1533 | ||
1534 | /* | |
1535 | * Shift elements in node left by one spot to overwrite rm_hdr | |
1536 | * and the separator before it. | |
1537 | */ | |
1538 | bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, | |
1539 | BSS_PARALLELOGRAM); | |
1540 | ||
1541 | /* | |
1542 | * Move the separator between node and neighbor to the last | |
1543 | * element spot in node. | |
1544 | */ | |
1545 | uint8_t *separator = parent->btc_elems + parent_idx * size; | |
c0bf952c | 1546 | bcpy(separator, node->btc_elems + (hdr->bth_count - 1) * size, |
ca577779 PD |
1547 | size); |
1548 | ||
1549 | /* | |
1550 | * Move the first child of neighbor to the last child spot in | |
1551 | * node. | |
1552 | */ | |
c0bf952c | 1553 | node->btc_children[hdr->bth_count] = neighbor->btc_children[0]; |
ca577779 PD |
1554 | node->btc_children[hdr->bth_count]->bth_parent = node; |
1555 | ||
1556 | /* Move the first element of neighbor to the separator spot. */ | |
1557 | uint8_t *take_elem = neighbor->btc_elems; | |
c0bf952c | 1558 | bcpy(take_elem, separator, size); |
ca577779 PD |
1559 | r_hdr->bth_count--; |
1560 | ||
1561 | /* | |
1562 | * Shift the elements and children of neighbor to cover the | |
1563 | * stolen elements. | |
1564 | */ | |
1565 | bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count, | |
1566 | BSS_TRAPEZOID); | |
c0bf952c | 1567 | zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count, 1); |
ca577779 PD |
1568 | return; |
1569 | } | |
1570 | ||
1571 | /* | |
1572 | * In this case, neither of our neighbors can spare an element, so we | |
1573 | * need to merge with one of them. We prefer the left one, | |
b0099072 | 1574 | * arbitrarily. Move the separator into the leftmost merging node |
ca577779 PD |
1575 | * (which may be us or the left neighbor), and then move the right |
1576 | * merging node's elements. Once that's done, we go back and delete | |
1577 | * the element we're removing. Finally, go into the parent and delete | |
1578 | * the right merging node and the separator. This may cause further | |
1579 | * merging. | |
1580 | */ | |
1581 | zfs_btree_hdr_t *new_rm_hdr, *keep_hdr; | |
c0bf952c | 1582 | uint32_t new_idx = idx; |
ca577779 PD |
1583 | if (l_hdr != NULL) { |
1584 | keep_hdr = l_hdr; | |
1585 | new_rm_hdr = hdr; | |
1586 | new_idx += keep_hdr->bth_count + 1; | |
1587 | } else { | |
1588 | ASSERT3P(r_hdr, !=, NULL); | |
1589 | keep_hdr = hdr; | |
1590 | new_rm_hdr = r_hdr; | |
1591 | parent_idx++; | |
1592 | } | |
1593 | ||
c0bf952c AM |
1594 | ASSERT(zfs_btree_is_core(keep_hdr)); |
1595 | ASSERT(zfs_btree_is_core(new_rm_hdr)); | |
ca577779 PD |
1596 | |
1597 | zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr; | |
1598 | zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr; | |
1599 | ||
1600 | if (zfs_btree_verify_intensity >= 5) { | |
c0bf952c | 1601 | for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) { |
ca577779 PD |
1602 | zfs_btree_verify_poison_at(tree, keep_hdr, |
1603 | keep_hdr->bth_count + i); | |
1604 | } | |
1605 | } | |
1606 | ||
1607 | /* Move the separator into the left node. */ | |
1608 | uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size; | |
1609 | uint8_t *separator = parent->btc_elems + (parent_idx - 1) * | |
1610 | size; | |
c0bf952c | 1611 | bcpy(separator, e_out, size); |
ca577779 PD |
1612 | keep_hdr->bth_count++; |
1613 | ||
1614 | /* Move all our elements and children into the left node. */ | |
1615 | bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep, | |
1616 | keep_hdr->bth_count, BSS_TRAPEZOID); | |
1617 | ||
c0bf952c | 1618 | uint32_t old_count = keep_hdr->bth_count; |
ca577779 PD |
1619 | |
1620 | /* Update bookkeeping */ | |
1621 | keep_hdr->bth_count += new_rm_hdr->bth_count; | |
1622 | ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1); | |
1623 | ||
1624 | /* | |
1625 | * Shift the element and children to the right of rm_hdr to | |
1626 | * the left by one spot. | |
1627 | */ | |
1628 | ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr); | |
1629 | bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx, | |
1630 | BSS_PARALLELOGRAM); | |
1631 | keep_hdr->bth_count--; | |
1632 | ||
1633 | /* Reparent all our children to point to the left node. */ | |
1634 | zfs_btree_hdr_t **new_start = keep->btc_children + | |
1635 | old_count - 1; | |
c0bf952c | 1636 | for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) |
ca577779 | 1637 | new_start[i]->bth_parent = keep; |
c0bf952c | 1638 | for (uint32_t i = 0; i <= keep_hdr->bth_count; i++) { |
ca577779 PD |
1639 | ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep); |
1640 | ASSERT3P(keep->btc_children[i], !=, rm_hdr); | |
1641 | } | |
c0bf952c | 1642 | zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1); |
ca577779 PD |
1643 | |
1644 | new_rm_hdr->bth_count = 0; | |
ca577779 | 1645 | zfs_btree_remove_from_node(tree, parent, new_rm_hdr); |
13f2b8fb | 1646 | zfs_btree_node_destroy(tree, new_rm_hdr); |
ca577779 PD |
1647 | } |
1648 | ||
1649 | /* Remove the element at the specific location. */ | |
1650 | void | |
516a83f8 | 1651 | zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) |
ca577779 PD |
1652 | { |
1653 | size_t size = tree->bt_elem_size; | |
1654 | zfs_btree_hdr_t *hdr = where->bti_node; | |
c0bf952c | 1655 | uint32_t idx = where->bti_offset; |
ca577779 PD |
1656 | |
1657 | ASSERT(!where->bti_before); | |
1658 | if (tree->bt_bulk != NULL) { | |
1659 | /* | |
1660 | * Leave bulk insert mode. Note that our index would be | |
1661 | * invalid after we correct the tree, so we copy the value | |
1662 | * we're planning to remove and find it again after | |
1663 | * bulk_finish. | |
1664 | */ | |
1665 | uint8_t *value = zfs_btree_get(tree, where); | |
1666 | uint8_t *tmp = kmem_alloc(size, KM_SLEEP); | |
c0bf952c | 1667 | bcpy(value, tmp, size); |
ca577779 PD |
1668 | zfs_btree_bulk_finish(tree); |
1669 | VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL); | |
1670 | kmem_free(tmp, size); | |
1671 | hdr = where->bti_node; | |
1672 | idx = where->bti_offset; | |
1673 | } | |
1674 | ||
1675 | tree->bt_num_elems--; | |
1676 | /* | |
1677 | * If the element happens to be in a core node, we move a leaf node's | |
1678 | * element into its place and then remove the leaf node element. This | |
1679 | * makes the rebalance logic not need to be recursive both upwards and | |
1680 | * downwards. | |
1681 | */ | |
c0bf952c | 1682 | if (zfs_btree_is_core(hdr)) { |
ca577779 PD |
1683 | zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; |
1684 | zfs_btree_hdr_t *left_subtree = node->btc_children[idx]; | |
1685 | void *new_value = zfs_btree_last_helper(tree, left_subtree, | |
1686 | where); | |
1687 | ASSERT3P(new_value, !=, NULL); | |
1688 | ||
c0bf952c | 1689 | bcpy(new_value, node->btc_elems + idx * size, size); |
ca577779 PD |
1690 | |
1691 | hdr = where->bti_node; | |
1692 | idx = where->bti_offset; | |
1693 | ASSERT(!where->bti_before); | |
1694 | } | |
1695 | ||
1696 | /* | |
1697 | * First, we'll update the leaf's metadata. Then, we shift any | |
1698 | * elements after the idx to the left. After that, we rebalance if | |
1699 | * needed. | |
1700 | */ | |
c0bf952c | 1701 | ASSERT(!zfs_btree_is_core(hdr)); |
ca577779 PD |
1702 | zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; |
1703 | ASSERT3U(hdr->bth_count, >, 0); | |
1704 | ||
c0bf952c | 1705 | uint32_t min_count = (tree->bt_leaf_cap / 2) - 1; |
ca577779 PD |
1706 | |
1707 | /* | |
1708 | * If we're over the minimum size or this is the root, just overwrite | |
1709 | * the value and return. | |
1710 | */ | |
1711 | if (hdr->bth_count > min_count || hdr->bth_parent == NULL) { | |
c0bf952c | 1712 | bt_shrink_leaf(tree, leaf, idx, 1); |
ca577779 PD |
1713 | if (hdr->bth_parent == NULL) { |
1714 | ASSERT0(tree->bt_height); | |
1715 | if (hdr->bth_count == 0) { | |
1716 | tree->bt_root = NULL; | |
1717 | tree->bt_height--; | |
1718 | zfs_btree_node_destroy(tree, &leaf->btl_hdr); | |
1719 | } | |
1720 | } | |
ca577779 PD |
1721 | zfs_btree_verify(tree); |
1722 | return; | |
1723 | } | |
1724 | ASSERT3U(hdr->bth_count, ==, min_count); | |
1725 | ||
1726 | /* | |
1727 | * Now we try to take a node from a sibling. We check left, then | |
1728 | * right. If they exist and have more than the minimum number of | |
dd4bc569 | 1729 | * elements, we move the separator between us and them to our node |
ca577779 PD |
1730 | * and move their closest element (last for left, first for right) to |
1731 | * the separator. Along the way we need to collapse the gap made by | |
1732 | * idx, and (for our right neighbor) the gap made by removing their | |
1733 | * first element. | |
1734 | * | |
1735 | * Note: this logic currently doesn't support taking from a neighbor | |
1736 | * that isn't a sibling. This isn't critical functionality, but may be | |
1737 | * worth implementing in the future for completeness' sake. | |
1738 | */ | |
1739 | zfs_btree_core_t *parent = hdr->bth_parent; | |
c0bf952c | 1740 | uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); |
ca577779 PD |
1741 | |
1742 | zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : | |
1743 | parent->btc_children[parent_idx - 1]); | |
1744 | if (l_hdr != NULL && l_hdr->bth_count > min_count) { | |
1745 | /* We can take a node from the left neighbor. */ | |
c0bf952c AM |
1746 | ASSERT(!zfs_btree_is_core(l_hdr)); |
1747 | zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)l_hdr; | |
ca577779 PD |
1748 | |
1749 | /* | |
1750 | * Move our elements back by one spot to make room for the | |
1751 | * stolen element and overwrite the element being removed. | |
1752 | */ | |
c0bf952c AM |
1753 | bt_shift_leaf(tree, leaf, 0, idx, 1, BSD_RIGHT); |
1754 | ||
1755 | /* Move the separator to our first spot. */ | |
ca577779 PD |
1756 | uint8_t *separator = parent->btc_elems + (parent_idx - 1) * |
1757 | size; | |
c0bf952c | 1758 | bcpy(separator, leaf->btl_elems + hdr->bth_first * size, size); |
ca577779 PD |
1759 | |
1760 | /* Move our neighbor's last element to the separator. */ | |
c0bf952c AM |
1761 | uint8_t *take_elem = neighbor->btl_elems + |
1762 | (l_hdr->bth_first + l_hdr->bth_count - 1) * size; | |
1763 | bcpy(take_elem, separator, size); | |
ca577779 | 1764 | |
c0bf952c AM |
1765 | /* Delete our neighbor's last element. */ |
1766 | bt_shrink_leaf(tree, neighbor, l_hdr->bth_count - 1, 1); | |
ca577779 PD |
1767 | zfs_btree_verify(tree); |
1768 | return; | |
1769 | } | |
1770 | ||
1771 | zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? | |
1772 | NULL : parent->btc_children[parent_idx + 1]); | |
1773 | if (r_hdr != NULL && r_hdr->bth_count > min_count) { | |
1774 | /* We can take a node from the right neighbor. */ | |
c0bf952c | 1775 | ASSERT(!zfs_btree_is_core(r_hdr)); |
ca577779 PD |
1776 | zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr; |
1777 | ||
1778 | /* | |
1779 | * Move our elements after the element being removed forwards | |
1780 | * by one spot to make room for the stolen element and | |
1781 | * overwrite the element being removed. | |
1782 | */ | |
c0bf952c AM |
1783 | bt_shift_leaf(tree, leaf, idx + 1, hdr->bth_count - idx - 1, |
1784 | 1, BSD_LEFT); | |
ca577779 | 1785 | |
ca577779 | 1786 | /* Move the separator between us to our last spot. */ |
c0bf952c AM |
1787 | uint8_t *separator = parent->btc_elems + parent_idx * size; |
1788 | bcpy(separator, leaf->btl_elems + (hdr->bth_first + | |
1789 | hdr->bth_count - 1) * size, size); | |
ca577779 PD |
1790 | |
1791 | /* Move our neighbor's first element to the separator. */ | |
c0bf952c AM |
1792 | uint8_t *take_elem = neighbor->btl_elems + |
1793 | r_hdr->bth_first * size; | |
1794 | bcpy(take_elem, separator, size); | |
ca577779 | 1795 | |
c0bf952c AM |
1796 | /* Delete our neighbor's first element. */ |
1797 | bt_shrink_leaf(tree, neighbor, 0, 1); | |
ca577779 PD |
1798 | zfs_btree_verify(tree); |
1799 | return; | |
1800 | } | |
1801 | ||
1802 | /* | |
1803 | * In this case, neither of our neighbors can spare an element, so we | |
c0bf952c AM |
1804 | * need to merge with one of them. We prefer the left one, arbitrarily. |
1805 | * After remove we move the separator into the leftmost merging node | |
ca577779 PD |
1806 | * (which may be us or the left neighbor), and then move the right |
1807 | * merging node's elements. Once that's done, we go back and delete | |
1808 | * the element we're removing. Finally, go into the parent and delete | |
1809 | * the right merging node and the separator. This may cause further | |
1810 | * merging. | |
1811 | */ | |
c0bf952c | 1812 | zfs_btree_hdr_t *rm_hdr, *k_hdr; |
ca577779 | 1813 | if (l_hdr != NULL) { |
c0bf952c | 1814 | k_hdr = l_hdr; |
ca577779 | 1815 | rm_hdr = hdr; |
ca577779 PD |
1816 | } else { |
1817 | ASSERT3P(r_hdr, !=, NULL); | |
c0bf952c | 1818 | k_hdr = hdr; |
ca577779 PD |
1819 | rm_hdr = r_hdr; |
1820 | parent_idx++; | |
1821 | } | |
c0bf952c AM |
1822 | ASSERT(!zfs_btree_is_core(k_hdr)); |
1823 | ASSERT(!zfs_btree_is_core(rm_hdr)); | |
1824 | ASSERT3U(k_hdr->bth_count, ==, min_count); | |
ca577779 | 1825 | ASSERT3U(rm_hdr->bth_count, ==, min_count); |
c0bf952c | 1826 | zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)k_hdr; |
ca577779 PD |
1827 | zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr; |
1828 | ||
1829 | if (zfs_btree_verify_intensity >= 5) { | |
c0bf952c AM |
1830 | for (uint32_t i = 0; i < rm_hdr->bth_count + 1; i++) { |
1831 | zfs_btree_verify_poison_at(tree, k_hdr, | |
1832 | k_hdr->bth_count + i); | |
ca577779 PD |
1833 | } |
1834 | } | |
c0bf952c | 1835 | |
ca577779 | 1836 | /* |
c0bf952c AM |
1837 | * Remove the value from the node. It will go below the minimum, |
1838 | * but we'll fix it in no time. | |
ca577779 | 1839 | */ |
c0bf952c | 1840 | bt_shrink_leaf(tree, leaf, idx, 1); |
ca577779 | 1841 | |
c0bf952c AM |
1842 | /* Prepare space for elements to be moved from the right. */ |
1843 | uint32_t k_count = k_hdr->bth_count; | |
1844 | bt_grow_leaf(tree, keep, k_count, 1 + rm_hdr->bth_count); | |
1845 | ASSERT3U(k_hdr->bth_count, ==, min_count * 2); | |
ca577779 | 1846 | |
c0bf952c AM |
1847 | /* Move the separator into the first open spot. */ |
1848 | uint8_t *out = keep->btl_elems + (k_hdr->bth_first + k_count) * size; | |
1849 | uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; | |
1850 | bcpy(separator, out, size); | |
ca577779 | 1851 | |
c0bf952c AM |
1852 | /* Move our elements to the left neighbor. */ |
1853 | bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1); | |
c0bf952c | 1854 | |
ca577779 PD |
1855 | /* Remove the emptied node from the parent. */ |
1856 | zfs_btree_remove_from_node(tree, parent, rm_hdr); | |
13f2b8fb | 1857 | zfs_btree_node_destroy(tree, rm_hdr); |
ca577779 PD |
1858 | zfs_btree_verify(tree); |
1859 | } | |
1860 | ||
1861 | /* Remove the given value from the tree. */ | |
1862 | void | |
1863 | zfs_btree_remove(zfs_btree_t *tree, const void *value) | |
1864 | { | |
1865 | zfs_btree_index_t where = {0}; | |
1866 | VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL); | |
516a83f8 | 1867 | zfs_btree_remove_idx(tree, &where); |
ca577779 PD |
1868 | } |
1869 | ||
1870 | /* Return the number of elements in the tree. */ | |
1871 | ulong_t | |
1872 | zfs_btree_numnodes(zfs_btree_t *tree) | |
1873 | { | |
1874 | return (tree->bt_num_elems); | |
1875 | } | |
1876 | ||
1877 | /* | |
1878 | * This function is used to visit all the elements in the tree before | |
1879 | * destroying the tree. This allows the calling code to perform any cleanup it | |
1880 | * needs to do. This is more efficient than just removing the first element | |
1881 | * over and over, because it removes all rebalancing. Once the destroy_nodes() | |
1882 | * function has been called, no other btree operations are valid until it | |
1883 | * returns NULL, which point the only valid operation is zfs_btree_destroy(). | |
1884 | * | |
1885 | * example: | |
1886 | * | |
1887 | * zfs_btree_index_t *cookie = NULL; | |
1888 | * my_data_t *node; | |
1889 | * | |
1890 | * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL) | |
1891 | * free(node->ptr); | |
1892 | * zfs_btree_destroy(tree); | |
1893 | * | |
1894 | */ | |
1895 | void * | |
1896 | zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie) | |
1897 | { | |
1898 | if (*cookie == NULL) { | |
1899 | if (tree->bt_height == -1) | |
1900 | return (NULL); | |
1901 | *cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP); | |
1902 | return (zfs_btree_first(tree, *cookie)); | |
1903 | } | |
1904 | ||
1905 | void *rval = zfs_btree_next_helper(tree, *cookie, *cookie, | |
1906 | zfs_btree_node_destroy); | |
1907 | if (rval == NULL) { | |
1908 | tree->bt_root = NULL; | |
1909 | tree->bt_height = -1; | |
1910 | tree->bt_num_elems = 0; | |
1911 | kmem_free(*cookie, sizeof (**cookie)); | |
1912 | tree->bt_bulk = NULL; | |
1913 | } | |
1914 | return (rval); | |
1915 | } | |
1916 | ||
1917 | static void | |
1918 | zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) | |
1919 | { | |
c0bf952c | 1920 | if (zfs_btree_is_core(hdr)) { |
ca577779 | 1921 | zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr; |
c0bf952c | 1922 | for (uint32_t i = 0; i <= hdr->bth_count; i++) |
ca577779 | 1923 | zfs_btree_clear_helper(tree, btc->btc_children[i]); |
ca577779 PD |
1924 | } |
1925 | ||
1926 | zfs_btree_node_destroy(tree, hdr); | |
1927 | } | |
1928 | ||
1929 | void | |
1930 | zfs_btree_clear(zfs_btree_t *tree) | |
1931 | { | |
1932 | if (tree->bt_root == NULL) { | |
1933 | ASSERT0(tree->bt_num_elems); | |
1934 | return; | |
1935 | } | |
1936 | ||
1937 | zfs_btree_clear_helper(tree, tree->bt_root); | |
1938 | tree->bt_num_elems = 0; | |
1939 | tree->bt_root = NULL; | |
1940 | tree->bt_num_nodes = 0; | |
1941 | tree->bt_height = -1; | |
1942 | tree->bt_bulk = NULL; | |
1943 | } | |
1944 | ||
1945 | void | |
1946 | zfs_btree_destroy(zfs_btree_t *tree) | |
1947 | { | |
1948 | ASSERT0(tree->bt_num_elems); | |
1949 | ASSERT3P(tree->bt_root, ==, NULL); | |
1950 | } | |
1951 | ||
1952 | /* Verify that every child of this node has the correct parent pointer. */ | |
1953 | static void | |
1954 | zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) | |
1955 | { | |
c0bf952c | 1956 | if (!zfs_btree_is_core(hdr)) |
ca577779 PD |
1957 | return; |
1958 | ||
1959 | zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; | |
c0bf952c | 1960 | for (uint32_t i = 0; i <= hdr->bth_count; i++) { |
ca577779 PD |
1961 | VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr); |
1962 | zfs_btree_verify_pointers_helper(tree, node->btc_children[i]); | |
1963 | } | |
1964 | } | |
1965 | ||
1966 | /* Verify that every node has the correct parent pointer. */ | |
1967 | static void | |
1968 | zfs_btree_verify_pointers(zfs_btree_t *tree) | |
1969 | { | |
1970 | if (tree->bt_height == -1) { | |
1971 | VERIFY3P(tree->bt_root, ==, NULL); | |
1972 | return; | |
1973 | } | |
1974 | VERIFY3P(tree->bt_root->bth_parent, ==, NULL); | |
1975 | zfs_btree_verify_pointers_helper(tree, tree->bt_root); | |
1976 | } | |
1977 | ||
1978 | /* | |
1979 | * Verify that all the current node and its children satisfy the count | |
1980 | * invariants, and return the total count in the subtree rooted in this node. | |
1981 | */ | |
1982 | static uint64_t | |
1983 | zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) | |
1984 | { | |
c0bf952c | 1985 | if (!zfs_btree_is_core(hdr)) { |
63652e15 DS |
1986 | if (tree->bt_root != hdr && tree->bt_bulk && |
1987 | hdr != &tree->bt_bulk->btl_hdr) { | |
c0bf952c | 1988 | VERIFY3U(hdr->bth_count, >=, tree->bt_leaf_cap / 2 - 1); |
ca577779 PD |
1989 | } |
1990 | ||
1991 | return (hdr->bth_count); | |
1992 | } else { | |
1993 | ||
1994 | zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; | |
1995 | uint64_t ret = hdr->bth_count; | |
1996 | if (tree->bt_root != hdr && tree->bt_bulk == NULL) | |
1997 | VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1); | |
c0bf952c | 1998 | for (uint32_t i = 0; i <= hdr->bth_count; i++) { |
ca577779 PD |
1999 | ret += zfs_btree_verify_counts_helper(tree, |
2000 | node->btc_children[i]); | |
2001 | } | |
2002 | ||
2003 | return (ret); | |
2004 | } | |
2005 | } | |
2006 | ||
2007 | /* | |
2008 | * Verify that all nodes satisfy the invariants and that the total number of | |
2009 | * elements is correct. | |
2010 | */ | |
2011 | static void | |
2012 | zfs_btree_verify_counts(zfs_btree_t *tree) | |
2013 | { | |
2014 | EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1); | |
2015 | if (tree->bt_height == -1) { | |
2016 | return; | |
2017 | } | |
2018 | VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==, | |
2019 | tree->bt_num_elems); | |
2020 | } | |
2021 | ||
2022 | /* | |
2023 | * Check that the subtree rooted at this node has a uniform height. Returns | |
2024 | * the number of nodes under this node, to help verify bt_num_nodes. | |
2025 | */ | |
2026 | static uint64_t | |
2027 | zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, | |
9dcdee78 | 2028 | int32_t height) |
ca577779 | 2029 | { |
c0bf952c | 2030 | if (!zfs_btree_is_core(hdr)) { |
ca577779 PD |
2031 | VERIFY0(height); |
2032 | return (1); | |
2033 | } | |
2034 | ||
ca577779 PD |
2035 | zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; |
2036 | uint64_t ret = 1; | |
c0bf952c | 2037 | for (uint32_t i = 0; i <= hdr->bth_count; i++) { |
ca577779 PD |
2038 | ret += zfs_btree_verify_height_helper(tree, |
2039 | node->btc_children[i], height - 1); | |
2040 | } | |
2041 | return (ret); | |
2042 | } | |
2043 | ||
2044 | /* | |
2045 | * Check that the tree rooted at this node has a uniform height, and that the | |
2046 | * bt_height in the tree is correct. | |
2047 | */ | |
2048 | static void | |
2049 | zfs_btree_verify_height(zfs_btree_t *tree) | |
2050 | { | |
2051 | EQUIV(tree->bt_height == -1, tree->bt_root == NULL); | |
2052 | if (tree->bt_height == -1) { | |
2053 | return; | |
2054 | } | |
2055 | ||
2056 | VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root, | |
2057 | tree->bt_height), ==, tree->bt_num_nodes); | |
2058 | } | |
2059 | ||
2060 | /* | |
2061 | * Check that the elements in this node are sorted, and that if this is a core | |
2062 | * node, the separators are properly between the subtrees they separaate and | |
2063 | * that the children also satisfy this requirement. | |
2064 | */ | |
2065 | static void | |
2066 | zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) | |
2067 | { | |
2068 | size_t size = tree->bt_elem_size; | |
c0bf952c | 2069 | if (!zfs_btree_is_core(hdr)) { |
ca577779 | 2070 | zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; |
c0bf952c AM |
2071 | for (uint32_t i = 1; i < hdr->bth_count; i++) { |
2072 | VERIFY3S(tree->bt_compar(leaf->btl_elems + | |
2073 | (hdr->bth_first + i - 1) * size, | |
2074 | leaf->btl_elems + | |
2075 | (hdr->bth_first + i) * size), ==, -1); | |
ca577779 PD |
2076 | } |
2077 | return; | |
2078 | } | |
2079 | ||
2080 | zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; | |
c0bf952c | 2081 | for (uint32_t i = 1; i < hdr->bth_count; i++) { |
ca577779 PD |
2082 | VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size, |
2083 | node->btc_elems + i * size), ==, -1); | |
2084 | } | |
c0bf952c | 2085 | for (uint32_t i = 0; i < hdr->bth_count; i++) { |
ca577779 PD |
2086 | uint8_t *left_child_last = NULL; |
2087 | zfs_btree_hdr_t *left_child_hdr = node->btc_children[i]; | |
c0bf952c | 2088 | if (zfs_btree_is_core(left_child_hdr)) { |
ca577779 PD |
2089 | zfs_btree_core_t *left_child = |
2090 | (zfs_btree_core_t *)left_child_hdr; | |
2091 | left_child_last = left_child->btc_elems + | |
2092 | (left_child_hdr->bth_count - 1) * size; | |
2093 | } else { | |
2094 | zfs_btree_leaf_t *left_child = | |
2095 | (zfs_btree_leaf_t *)left_child_hdr; | |
2096 | left_child_last = left_child->btl_elems + | |
c0bf952c AM |
2097 | (left_child_hdr->bth_first + |
2098 | left_child_hdr->bth_count - 1) * size; | |
ca577779 | 2099 | } |
c0bf952c AM |
2100 | int comp = tree->bt_compar(node->btc_elems + i * size, |
2101 | left_child_last); | |
2102 | if (comp <= 0) { | |
ca577779 | 2103 | panic("btree: compar returned %d (expected 1) at " |
c0bf952c AM |
2104 | "%px %d: compar(%px, %px)", comp, node, i, |
2105 | node->btc_elems + i * size, left_child_last); | |
ca577779 PD |
2106 | } |
2107 | ||
2108 | uint8_t *right_child_first = NULL; | |
2109 | zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1]; | |
c0bf952c | 2110 | if (zfs_btree_is_core(right_child_hdr)) { |
ca577779 PD |
2111 | zfs_btree_core_t *right_child = |
2112 | (zfs_btree_core_t *)right_child_hdr; | |
2113 | right_child_first = right_child->btc_elems; | |
2114 | } else { | |
2115 | zfs_btree_leaf_t *right_child = | |
2116 | (zfs_btree_leaf_t *)right_child_hdr; | |
c0bf952c AM |
2117 | right_child_first = right_child->btl_elems + |
2118 | right_child_hdr->bth_first * size; | |
ca577779 | 2119 | } |
c0bf952c AM |
2120 | comp = tree->bt_compar(node->btc_elems + i * size, |
2121 | right_child_first); | |
2122 | if (comp >= 0) { | |
ca577779 | 2123 | panic("btree: compar returned %d (expected -1) at " |
c0bf952c AM |
2124 | "%px %d: compar(%px, %px)", comp, node, i, |
2125 | node->btc_elems + i * size, right_child_first); | |
ca577779 PD |
2126 | } |
2127 | } | |
c0bf952c | 2128 | for (uint32_t i = 0; i <= hdr->bth_count; i++) |
ca577779 | 2129 | zfs_btree_verify_order_helper(tree, node->btc_children[i]); |
ca577779 PD |
2130 | } |
2131 | ||
2132 | /* Check that all elements in the tree are in sorted order. */ | |
2133 | static void | |
2134 | zfs_btree_verify_order(zfs_btree_t *tree) | |
2135 | { | |
2136 | EQUIV(tree->bt_height == -1, tree->bt_root == NULL); | |
2137 | if (tree->bt_height == -1) { | |
2138 | return; | |
2139 | } | |
2140 | ||
2141 | zfs_btree_verify_order_helper(tree, tree->bt_root); | |
2142 | } | |
2143 | ||
2144 | #ifdef ZFS_DEBUG | |
2145 | /* Check that all unused memory is poisoned correctly. */ | |
2146 | static void | |
2147 | zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) | |
2148 | { | |
2149 | size_t size = tree->bt_elem_size; | |
c0bf952c | 2150 | if (!zfs_btree_is_core(hdr)) { |
ca577779 | 2151 | zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; |
c0bf952c AM |
2152 | for (size_t i = 0; i < hdr->bth_first * size; i++) |
2153 | VERIFY3U(leaf->btl_elems[i], ==, 0x0f); | |
9dcdee78 AM |
2154 | size_t esize = tree->bt_leaf_size - |
2155 | offsetof(zfs_btree_leaf_t, btl_elems); | |
c0bf952c | 2156 | for (size_t i = (hdr->bth_first + hdr->bth_count) * size; |
9dcdee78 | 2157 | i < esize; i++) |
c0bf952c | 2158 | VERIFY3U(leaf->btl_elems[i], ==, 0x0f); |
ca577779 PD |
2159 | } else { |
2160 | zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; | |
c0bf952c AM |
2161 | for (size_t i = hdr->bth_count * size; |
2162 | i < BTREE_CORE_ELEMS * size; i++) | |
2163 | VERIFY3U(node->btc_elems[i], ==, 0x0f); | |
ca577779 | 2164 | |
c0bf952c AM |
2165 | for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; |
2166 | i++) { | |
ca577779 PD |
2167 | VERIFY3P(node->btc_children[i], ==, |
2168 | (zfs_btree_hdr_t *)BTREE_POISON); | |
2169 | } | |
2170 | ||
c0bf952c | 2171 | for (uint32_t i = 0; i <= hdr->bth_count; i++) { |
ca577779 PD |
2172 | zfs_btree_verify_poison_helper(tree, |
2173 | node->btc_children[i]); | |
2174 | } | |
2175 | } | |
2176 | } | |
2177 | #endif | |
2178 | ||
2179 | /* Check that unused memory in the tree is still poisoned. */ | |
2180 | static void | |
2181 | zfs_btree_verify_poison(zfs_btree_t *tree) | |
2182 | { | |
2183 | #ifdef ZFS_DEBUG | |
2184 | if (tree->bt_height == -1) | |
2185 | return; | |
2186 | zfs_btree_verify_poison_helper(tree, tree->bt_root); | |
2187 | #endif | |
2188 | } | |
2189 | ||
2190 | void | |
2191 | zfs_btree_verify(zfs_btree_t *tree) | |
2192 | { | |
2193 | if (zfs_btree_verify_intensity == 0) | |
2194 | return; | |
2195 | zfs_btree_verify_height(tree); | |
2196 | if (zfs_btree_verify_intensity == 1) | |
2197 | return; | |
2198 | zfs_btree_verify_pointers(tree); | |
2199 | if (zfs_btree_verify_intensity == 2) | |
2200 | return; | |
2201 | zfs_btree_verify_counts(tree); | |
2202 | if (zfs_btree_verify_intensity == 3) | |
2203 | return; | |
2204 | zfs_btree_verify_order(tree); | |
2205 | ||
2206 | if (zfs_btree_verify_intensity == 4) | |
2207 | return; | |
2208 | zfs_btree_verify_poison(tree); | |
2209 | } | |
b24d1c77 RY |
2210 | |
2211 | /* BEGIN CSTYLED */ | |
2212 | ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW, | |
2213 | "Enable btree verification. Levels above 4 require ZFS be built " | |
2214 | "with debugging"); | |
2215 | /* END CSTYLED */ |