]>
Commit | Line | Data |
---|---|---|
93cf2076 GW |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
93cf2076 GW |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright 2009 Sun Microsystems, Inc. All rights reserved. | |
23 | * Use is subject to license terms. | |
24 | */ | |
25 | /* | |
93e28d66 | 26 | * Copyright (c) 2013, 2019 by Delphix. All rights reserved. |
dce63135 | 27 | * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. |
93cf2076 GW |
28 | */ |
29 | ||
30 | #include <sys/zfs_context.h> | |
31 | #include <sys/spa.h> | |
32 | #include <sys/dmu.h> | |
33 | #include <sys/dnode.h> | |
34 | #include <sys/zio.h> | |
35 | #include <sys/range_tree.h> | |
36 | ||
d4a72f23 TC |
37 | /* |
38 | * Range trees are tree-based data structures that can be used to | |
39 | * track free space or generally any space allocation information. | |
40 | * A range tree keeps track of individual segments and automatically | |
41 | * provides facilities such as adjacent extent merging and extent | |
42 | * splitting in response to range add/remove requests. | |
43 | * | |
44 | * A range tree starts out completely empty, with no segments in it. | |
45 | * Adding an allocation via range_tree_add to the range tree can either: | |
46 | * 1) create a new extent | |
47 | * 2) extend an adjacent extent | |
48 | * 3) merge two adjacent extents | |
49 | * Conversely, removing an allocation via range_tree_remove can: | |
50 | * 1) completely remove an extent | |
51 | * 2) shorten an extent (if the allocation was near one of its ends) | |
52 | * 3) split an extent into two extents, in effect punching a hole | |
53 | * | |
54 | * A range tree is also capable of 'bridging' gaps when adding | |
55 | * allocations. This is useful for cases when close proximity of | |
56 | * allocations is an important detail that needs to be represented | |
57 | * in the range tree. See range_tree_set_gap(). The default behavior | |
58 | * is not to bridge gaps (i.e. the maximum allowed gap size is 0). | |
59 | * | |
60 | * In order to traverse a range tree, use either the range_tree_walk() | |
61 | * or range_tree_vacate() functions. | |
62 | * | |
63 | * To obtain more accurate information on individual segment | |
64 | * operations that the range tree performs "under the hood", you can | |
65 | * specify a set of callbacks by passing a range_tree_ops_t structure | |
66 | * to the range_tree_create function. Any callbacks that are non-NULL | |
67 | * are then called at the appropriate times. | |
68 | * | |
69 | * The range tree code also supports a special variant of range trees | |
70 | * that can bridge small gaps between segments. This kind of tree is used | |
71 | * by the dsl scanning code to group I/Os into mostly sequential chunks to | |
72 | * optimize disk performance. The code here attempts to do this with as | |
73 | * little memory and computational overhead as possible. One limitation of | |
74 | * this implementation is that segments of range trees with gaps can only | |
75 | * support removing complete segments. | |
76 | */ | |
77 | ||
ca577779 PD |
78 | static inline void |
79 | rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt) | |
80 | { | |
861166b0 | 81 | ASSERT3U(rt->rt_type, <, RANGE_SEG_NUM_TYPES); |
ca577779 PD |
82 | size_t size = 0; |
83 | switch (rt->rt_type) { | |
84 | case RANGE_SEG32: | |
85 | size = sizeof (range_seg32_t); | |
86 | break; | |
87 | case RANGE_SEG64: | |
88 | size = sizeof (range_seg64_t); | |
89 | break; | |
90 | case RANGE_SEG_GAP: | |
91 | size = sizeof (range_seg_gap_t); | |
92 | break; | |
93 | default: | |
861166b0 | 94 | __builtin_unreachable(); |
ca577779 | 95 | } |
861166b0 | 96 | memcpy(dest, src, size); |
93cf2076 GW |
97 | } |
98 | ||
99 | void | |
100 | range_tree_stat_verify(range_tree_t *rt) | |
101 | { | |
102 | range_seg_t *rs; | |
ca577779 | 103 | zfs_btree_index_t where; |
93cf2076 GW |
104 | uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; |
105 | int i; | |
106 | ||
ca577779 PD |
107 | for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL; |
108 | rs = zfs_btree_next(&rt->rt_root, &where, &where)) { | |
109 | uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); | |
9bd274dd | 110 | int idx = highbit64(size) - 1; |
93cf2076 GW |
111 | |
112 | hist[idx]++; | |
113 | ASSERT3U(hist[idx], !=, 0); | |
114 | } | |
115 | ||
116 | for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { | |
117 | if (hist[i] != rt->rt_histogram[i]) { | |
a887d653 | 118 | zfs_dbgmsg("i=%d, hist=%px, hist=%llu, rt_hist=%llu", |
8e739b2c RE |
119 | i, hist, (u_longlong_t)hist[i], |
120 | (u_longlong_t)rt->rt_histogram[i]); | |
93cf2076 GW |
121 | } |
122 | VERIFY3U(hist[i], ==, rt->rt_histogram[i]); | |
123 | } | |
124 | } | |
125 | ||
126 | static void | |
127 | range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) | |
128 | { | |
ca577779 | 129 | uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); |
9bd274dd | 130 | int idx = highbit64(size) - 1; |
93cf2076 | 131 | |
f3a7f661 | 132 | ASSERT(size != 0); |
93cf2076 GW |
133 | ASSERT3U(idx, <, |
134 | sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); | |
135 | ||
93cf2076 GW |
136 | rt->rt_histogram[idx]++; |
137 | ASSERT3U(rt->rt_histogram[idx], !=, 0); | |
138 | } | |
139 | ||
140 | static void | |
141 | range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) | |
142 | { | |
ca577779 | 143 | uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); |
9bd274dd | 144 | int idx = highbit64(size) - 1; |
93cf2076 | 145 | |
f3a7f661 | 146 | ASSERT(size != 0); |
93cf2076 GW |
147 | ASSERT3U(idx, <, |
148 | sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); | |
149 | ||
93cf2076 GW |
150 | ASSERT3U(rt->rt_histogram[idx], !=, 0); |
151 | rt->rt_histogram[idx]--; | |
152 | } | |
153 | ||
677c6f84 | 154 | __attribute__((always_inline)) inline |
93cf2076 | 155 | static int |
ca577779 PD |
156 | range_tree_seg32_compare(const void *x1, const void *x2) |
157 | { | |
158 | const range_seg32_t *r1 = x1; | |
159 | const range_seg32_t *r2 = x2; | |
160 | ||
161 | ASSERT3U(r1->rs_start, <=, r1->rs_end); | |
162 | ASSERT3U(r2->rs_start, <=, r2->rs_end); | |
163 | ||
164 | return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); | |
165 | } | |
166 | ||
677c6f84 | 167 | __attribute__((always_inline)) inline |
ca577779 PD |
168 | static int |
169 | range_tree_seg64_compare(const void *x1, const void *x2) | |
93cf2076 | 170 | { |
ca577779 PD |
171 | const range_seg64_t *r1 = x1; |
172 | const range_seg64_t *r2 = x2; | |
173 | ||
174 | ASSERT3U(r1->rs_start, <=, r1->rs_end); | |
175 | ASSERT3U(r2->rs_start, <=, r2->rs_end); | |
176 | ||
177 | return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); | |
178 | } | |
179 | ||
677c6f84 | 180 | __attribute__((always_inline)) inline |
ca577779 PD |
181 | static int |
182 | range_tree_seg_gap_compare(const void *x1, const void *x2) | |
183 | { | |
184 | const range_seg_gap_t *r1 = x1; | |
185 | const range_seg_gap_t *r2 = x2; | |
93cf2076 | 186 | |
ee36c709 GN |
187 | ASSERT3U(r1->rs_start, <=, r1->rs_end); |
188 | ASSERT3U(r2->rs_start, <=, r2->rs_end); | |
189 | ||
190 | return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); | |
93cf2076 GW |
191 | } |
192 | ||
677c6f84 RY |
193 | ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg32_find_in_buf, range_seg32_t, |
194 | range_tree_seg32_compare) | |
195 | ||
196 | ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg64_find_in_buf, range_seg64_t, | |
197 | range_tree_seg64_compare) | |
198 | ||
199 | ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg_gap_find_in_buf, range_seg_gap_t, | |
200 | range_tree_seg_gap_compare) | |
201 | ||
93cf2076 | 202 | range_tree_t * |
1c0c729a AM |
203 | range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type, |
204 | void *arg, uint64_t start, uint64_t shift, uint64_t gap) | |
93cf2076 | 205 | { |
d4a72f23 | 206 | range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); |
93cf2076 | 207 | |
ca577779 PD |
208 | ASSERT3U(shift, <, 64); |
209 | ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES); | |
210 | size_t size; | |
211 | int (*compare) (const void *, const void *); | |
677c6f84 | 212 | bt_find_in_buf_f bt_find; |
ca577779 PD |
213 | switch (type) { |
214 | case RANGE_SEG32: | |
215 | size = sizeof (range_seg32_t); | |
216 | compare = range_tree_seg32_compare; | |
677c6f84 | 217 | bt_find = range_tree_seg32_find_in_buf; |
ca577779 PD |
218 | break; |
219 | case RANGE_SEG64: | |
220 | size = sizeof (range_seg64_t); | |
221 | compare = range_tree_seg64_compare; | |
677c6f84 | 222 | bt_find = range_tree_seg64_find_in_buf; |
ca577779 PD |
223 | break; |
224 | case RANGE_SEG_GAP: | |
225 | size = sizeof (range_seg_gap_t); | |
226 | compare = range_tree_seg_gap_compare; | |
677c6f84 | 227 | bt_find = range_tree_seg_gap_find_in_buf; |
ca577779 PD |
228 | break; |
229 | default: | |
230 | panic("Invalid range seg type %d", type); | |
231 | } | |
677c6f84 | 232 | zfs_btree_create(&rt->rt_root, compare, bt_find, size); |
93cf2076 | 233 | |
93cf2076 | 234 | rt->rt_ops = ops; |
d4a72f23 | 235 | rt->rt_gap = gap; |
93cf2076 | 236 | rt->rt_arg = arg; |
ca577779 PD |
237 | rt->rt_type = type; |
238 | rt->rt_start = start; | |
239 | rt->rt_shift = shift; | |
93cf2076 | 240 | |
d4a72f23 | 241 | if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL) |
93cf2076 GW |
242 | rt->rt_ops->rtop_create(rt, rt->rt_arg); |
243 | ||
244 | return (rt); | |
245 | } | |
246 | ||
d4a72f23 | 247 | range_tree_t * |
18168da7 | 248 | range_tree_create(const range_tree_ops_t *ops, range_seg_type_t type, |
ca577779 | 249 | void *arg, uint64_t start, uint64_t shift) |
d4a72f23 | 250 | { |
1c0c729a | 251 | return (range_tree_create_gap(ops, type, arg, start, shift, 0)); |
d4a72f23 TC |
252 | } |
253 | ||
93cf2076 GW |
254 | void |
255 | range_tree_destroy(range_tree_t *rt) | |
256 | { | |
257 | VERIFY0(rt->rt_space); | |
258 | ||
d4a72f23 | 259 | if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL) |
93cf2076 GW |
260 | rt->rt_ops->rtop_destroy(rt, rt->rt_arg); |
261 | ||
ca577779 | 262 | zfs_btree_destroy(&rt->rt_root); |
93cf2076 GW |
263 | kmem_free(rt, sizeof (*rt)); |
264 | } | |
265 | ||
266 | void | |
d4a72f23 TC |
267 | range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta) |
268 | { | |
12045d02 PD |
269 | if (delta < 0 && delta * -1 >= rs_get_fill(rs, rt)) { |
270 | zfs_panic_recover("zfs: attempting to decrease fill to or " | |
271 | "below 0; probable double remove in segment [%llx:%llx]", | |
272 | (longlong_t)rs_get_start(rs, rt), | |
273 | (longlong_t)rs_get_end(rs, rt)); | |
274 | } | |
275 | if (rs_get_fill(rs, rt) + delta > rs_get_end(rs, rt) - | |
276 | rs_get_start(rs, rt)) { | |
277 | zfs_panic_recover("zfs: attempting to increase fill beyond " | |
278 | "max; probable double add in segment [%llx:%llx]", | |
279 | (longlong_t)rs_get_start(rs, rt), | |
280 | (longlong_t)rs_get_end(rs, rt)); | |
281 | } | |
d4a72f23 TC |
282 | |
283 | if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) | |
284 | rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); | |
ca577779 | 285 | rs_set_fill(rs, rt, rs_get_fill(rs, rt) + delta); |
d4a72f23 TC |
286 | if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) |
287 | rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); | |
288 | } | |
289 | ||
290 | static void | |
291 | range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) | |
93cf2076 GW |
292 | { |
293 | range_tree_t *rt = arg; | |
ca577779 PD |
294 | zfs_btree_index_t where; |
295 | range_seg_t *rs_before, *rs_after, *rs; | |
296 | range_seg_max_t tmp, rsearch; | |
d4a72f23 TC |
297 | uint64_t end = start + size, gap = rt->rt_gap; |
298 | uint64_t bridge_size = 0; | |
93cf2076 GW |
299 | boolean_t merge_before, merge_after; |
300 | ||
d4a72f23 TC |
301 | ASSERT3U(size, !=, 0); |
302 | ASSERT3U(fill, <=, size); | |
ca577779 | 303 | ASSERT3U(start + size, >, start); |
93cf2076 | 304 | |
ca577779 PD |
305 | rs_set_start(&rsearch, rt, start); |
306 | rs_set_end(&rsearch, rt, end); | |
307 | rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); | |
d4a72f23 TC |
308 | |
309 | /* | |
310 | * If this is a gap-supporting range tree, it is possible that we | |
311 | * are inserting into an existing segment. In this case simply | |
312 | * bump the fill count and call the remove / add callbacks. If the | |
313 | * new range will extend an existing segment, we remove the | |
314 | * existing one, apply the new extent to it and re-insert it using | |
315 | * the normal code paths. | |
316 | */ | |
317 | if (rs != NULL) { | |
12045d02 PD |
318 | if (gap == 0) { |
319 | zfs_panic_recover("zfs: adding existent segment to " | |
320 | "range tree (offset=%llx size=%llx)", | |
321 | (longlong_t)start, (longlong_t)size); | |
322 | return; | |
323 | } | |
ca577779 PD |
324 | uint64_t rstart = rs_get_start(rs, rt); |
325 | uint64_t rend = rs_get_end(rs, rt); | |
ca577779 | 326 | if (rstart <= start && rend >= end) { |
d4a72f23 TC |
327 | range_tree_adjust_fill(rt, rs, fill); |
328 | return; | |
329 | } | |
330 | ||
d4a72f23 TC |
331 | if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) |
332 | rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); | |
333 | ||
334 | range_tree_stat_decr(rt, rs); | |
ca577779 | 335 | rt->rt_space -= rend - rstart; |
d4a72f23 | 336 | |
ca577779 PD |
337 | fill += rs_get_fill(rs, rt); |
338 | start = MIN(start, rstart); | |
339 | end = MAX(end, rend); | |
d4a72f23 TC |
340 | size = end - start; |
341 | ||
6a60ef80 | 342 | zfs_btree_remove(&rt->rt_root, rs); |
d4a72f23 | 343 | range_tree_add_impl(rt, start, size, fill); |
93cf2076 GW |
344 | return; |
345 | } | |
346 | ||
d4a72f23 | 347 | ASSERT3P(rs, ==, NULL); |
93cf2076 | 348 | |
d4a72f23 TC |
349 | /* |
350 | * Determine whether or not we will have to merge with our neighbors. | |
351 | * If gap != 0, we might need to merge with our neighbors even if we | |
352 | * aren't directly touching. | |
353 | */ | |
ca577779 PD |
354 | zfs_btree_index_t where_before, where_after; |
355 | rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before); | |
356 | rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after); | |
93cf2076 | 357 | |
ca577779 PD |
358 | merge_before = (rs_before != NULL && rs_get_end(rs_before, rt) >= |
359 | start - gap); | |
360 | merge_after = (rs_after != NULL && rs_get_start(rs_after, rt) <= end + | |
361 | gap); | |
d4a72f23 TC |
362 | |
363 | if (merge_before && gap != 0) | |
ca577779 | 364 | bridge_size += start - rs_get_end(rs_before, rt); |
d4a72f23 | 365 | if (merge_after && gap != 0) |
ca577779 | 366 | bridge_size += rs_get_start(rs_after, rt) - end; |
93cf2076 GW |
367 | |
368 | if (merge_before && merge_after) { | |
d4a72f23 | 369 | if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) { |
93cf2076 GW |
370 | rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); |
371 | rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); | |
372 | } | |
373 | ||
374 | range_tree_stat_decr(rt, rs_before); | |
375 | range_tree_stat_decr(rt, rs_after); | |
376 | ||
ca577779 PD |
377 | rs_copy(rs_after, &tmp, rt); |
378 | uint64_t before_start = rs_get_start_raw(rs_before, rt); | |
379 | uint64_t before_fill = rs_get_fill(rs_before, rt); | |
380 | uint64_t after_fill = rs_get_fill(rs_after, rt); | |
516a83f8 | 381 | zfs_btree_remove_idx(&rt->rt_root, &where_before); |
ca577779 PD |
382 | |
383 | /* | |
384 | * We have to re-find the node because our old reference is | |
385 | * invalid as soon as we do any mutating btree operations. | |
386 | */ | |
387 | rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after); | |
a6ccb36b | 388 | ASSERT3P(rs_after, !=, NULL); |
ca577779 PD |
389 | rs_set_start_raw(rs_after, rt, before_start); |
390 | rs_set_fill(rs_after, rt, after_fill + before_fill + fill); | |
93cf2076 GW |
391 | rs = rs_after; |
392 | } else if (merge_before) { | |
d4a72f23 | 393 | if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) |
93cf2076 GW |
394 | rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); |
395 | ||
396 | range_tree_stat_decr(rt, rs_before); | |
397 | ||
ca577779 PD |
398 | uint64_t before_fill = rs_get_fill(rs_before, rt); |
399 | rs_set_end(rs_before, rt, end); | |
400 | rs_set_fill(rs_before, rt, before_fill + fill); | |
93cf2076 GW |
401 | rs = rs_before; |
402 | } else if (merge_after) { | |
d4a72f23 | 403 | if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) |
93cf2076 GW |
404 | rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); |
405 | ||
406 | range_tree_stat_decr(rt, rs_after); | |
407 | ||
ca577779 PD |
408 | uint64_t after_fill = rs_get_fill(rs_after, rt); |
409 | rs_set_start(rs_after, rt, start); | |
410 | rs_set_fill(rs_after, rt, after_fill + fill); | |
93cf2076 GW |
411 | rs = rs_after; |
412 | } else { | |
ca577779 | 413 | rs = &tmp; |
d4a72f23 | 414 | |
ca577779 PD |
415 | rs_set_start(rs, rt, start); |
416 | rs_set_end(rs, rt, end); | |
417 | rs_set_fill(rs, rt, fill); | |
516a83f8 | 418 | zfs_btree_add_idx(&rt->rt_root, rs, &where); |
93cf2076 GW |
419 | } |
420 | ||
ca577779 PD |
421 | if (gap != 0) { |
422 | ASSERT3U(rs_get_fill(rs, rt), <=, rs_get_end(rs, rt) - | |
423 | rs_get_start(rs, rt)); | |
424 | } else { | |
425 | ASSERT3U(rs_get_fill(rs, rt), ==, rs_get_end(rs, rt) - | |
426 | rs_get_start(rs, rt)); | |
427 | } | |
d4a72f23 TC |
428 | |
429 | if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) | |
93cf2076 GW |
430 | rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); |
431 | ||
432 | range_tree_stat_incr(rt, rs); | |
d4a72f23 | 433 | rt->rt_space += size + bridge_size; |
93cf2076 GW |
434 | } |
435 | ||
436 | void | |
d4a72f23 TC |
437 | range_tree_add(void *arg, uint64_t start, uint64_t size) |
438 | { | |
439 | range_tree_add_impl(arg, start, size, size); | |
440 | } | |
441 | ||
442 | static void | |
443 | range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, | |
444 | boolean_t do_fill) | |
93cf2076 | 445 | { |
ca577779 PD |
446 | zfs_btree_index_t where; |
447 | range_seg_t *rs; | |
448 | range_seg_max_t rsearch, rs_tmp; | |
93cf2076 GW |
449 | uint64_t end = start + size; |
450 | boolean_t left_over, right_over; | |
451 | ||
93cf2076 GW |
452 | VERIFY3U(size, !=, 0); |
453 | VERIFY3U(size, <=, rt->rt_space); | |
ca577779 PD |
454 | if (rt->rt_type == RANGE_SEG64) |
455 | ASSERT3U(start + size, >, start); | |
93cf2076 | 456 | |
ca577779 PD |
457 | rs_set_start(&rsearch, rt, start); |
458 | rs_set_end(&rsearch, rt, end); | |
459 | rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); | |
93cf2076 GW |
460 | |
461 | /* Make sure we completely overlap with someone */ | |
462 | if (rs == NULL) { | |
ca577779 | 463 | zfs_panic_recover("zfs: removing nonexistent segment from " |
12045d02 | 464 | "range tree (offset=%llx size=%llx)", |
93cf2076 GW |
465 | (longlong_t)start, (longlong_t)size); |
466 | return; | |
467 | } | |
d4a72f23 TC |
468 | |
469 | /* | |
470 | * Range trees with gap support must only remove complete segments | |
471 | * from the tree. This allows us to maintain accurate fill accounting | |
472 | * and to ensure that bridged sections are not leaked. If we need to | |
473 | * remove less than the full segment, we can only adjust the fill count. | |
474 | */ | |
475 | if (rt->rt_gap != 0) { | |
476 | if (do_fill) { | |
ca577779 PD |
477 | if (rs_get_fill(rs, rt) == size) { |
478 | start = rs_get_start(rs, rt); | |
479 | end = rs_get_end(rs, rt); | |
d4a72f23 TC |
480 | size = end - start; |
481 | } else { | |
482 | range_tree_adjust_fill(rt, rs, -size); | |
483 | return; | |
484 | } | |
ca577779 PD |
485 | } else if (rs_get_start(rs, rt) != start || |
486 | rs_get_end(rs, rt) != end) { | |
d4a72f23 | 487 | zfs_panic_recover("zfs: freeing partial segment of " |
12045d02 PD |
488 | "gap tree (offset=%llx size=%llx) of " |
489 | "(offset=%llx size=%llx)", | |
d4a72f23 | 490 | (longlong_t)start, (longlong_t)size, |
ca577779 PD |
491 | (longlong_t)rs_get_start(rs, rt), |
492 | (longlong_t)rs_get_end(rs, rt) - rs_get_start(rs, | |
493 | rt)); | |
d4a72f23 TC |
494 | return; |
495 | } | |
496 | } | |
497 | ||
ca577779 PD |
498 | VERIFY3U(rs_get_start(rs, rt), <=, start); |
499 | VERIFY3U(rs_get_end(rs, rt), >=, end); | |
93cf2076 | 500 | |
ca577779 PD |
501 | left_over = (rs_get_start(rs, rt) != start); |
502 | right_over = (rs_get_end(rs, rt) != end); | |
93cf2076 GW |
503 | |
504 | range_tree_stat_decr(rt, rs); | |
505 | ||
d4a72f23 | 506 | if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) |
93cf2076 GW |
507 | rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); |
508 | ||
509 | if (left_over && right_over) { | |
ca577779 PD |
510 | range_seg_max_t newseg; |
511 | rs_set_start(&newseg, rt, end); | |
512 | rs_set_end_raw(&newseg, rt, rs_get_end_raw(rs, rt)); | |
513 | rs_set_fill(&newseg, rt, rs_get_end(rs, rt) - end); | |
514 | range_tree_stat_incr(rt, &newseg); | |
93cf2076 | 515 | |
ca577779 PD |
516 | // This modifies the buffer already inside the range tree |
517 | rs_set_end(rs, rt, start); | |
518 | ||
519 | rs_copy(rs, &rs_tmp, rt); | |
520 | if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL) | |
516a83f8 | 521 | zfs_btree_add_idx(&rt->rt_root, &newseg, &where); |
ca577779 PD |
522 | else |
523 | zfs_btree_add(&rt->rt_root, &newseg); | |
93cf2076 | 524 | |
d4a72f23 | 525 | if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) |
ca577779 | 526 | rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg); |
93cf2076 | 527 | } else if (left_over) { |
ca577779 PD |
528 | // This modifies the buffer already inside the range tree |
529 | rs_set_end(rs, rt, start); | |
530 | rs_copy(rs, &rs_tmp, rt); | |
93cf2076 | 531 | } else if (right_over) { |
ca577779 PD |
532 | // This modifies the buffer already inside the range tree |
533 | rs_set_start(rs, rt, end); | |
534 | rs_copy(rs, &rs_tmp, rt); | |
93cf2076 | 535 | } else { |
516a83f8 | 536 | zfs_btree_remove_idx(&rt->rt_root, &where); |
93cf2076 GW |
537 | rs = NULL; |
538 | } | |
539 | ||
540 | if (rs != NULL) { | |
d4a72f23 TC |
541 | /* |
542 | * The fill of the leftover segment will always be equal to | |
543 | * the size, since we do not support removing partial segments | |
544 | * of range trees with gaps. | |
545 | */ | |
ca577779 PD |
546 | rs_set_fill_raw(rs, rt, rs_get_end_raw(rs, rt) - |
547 | rs_get_start_raw(rs, rt)); | |
548 | range_tree_stat_incr(rt, &rs_tmp); | |
93cf2076 | 549 | |
d4a72f23 | 550 | if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) |
ca577779 | 551 | rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg); |
93cf2076 GW |
552 | } |
553 | ||
554 | rt->rt_space -= size; | |
555 | } | |
556 | ||
d4a72f23 TC |
557 | void |
558 | range_tree_remove(void *arg, uint64_t start, uint64_t size) | |
559 | { | |
560 | range_tree_remove_impl(arg, start, size, B_FALSE); | |
561 | } | |
562 | ||
563 | void | |
564 | range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size) | |
565 | { | |
566 | range_tree_remove_impl(rt, start, size, B_TRUE); | |
567 | } | |
568 | ||
569 | void | |
570 | range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, | |
571 | uint64_t newstart, uint64_t newsize) | |
572 | { | |
ca577779 | 573 | int64_t delta = newsize - (rs_get_end(rs, rt) - rs_get_start(rs, rt)); |
d4a72f23 | 574 | |
d4a72f23 TC |
575 | range_tree_stat_decr(rt, rs); |
576 | if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) | |
577 | rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); | |
578 | ||
ca577779 PD |
579 | rs_set_start(rs, rt, newstart); |
580 | rs_set_end(rs, rt, newstart + newsize); | |
d4a72f23 TC |
581 | |
582 | range_tree_stat_incr(rt, rs); | |
583 | if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) | |
584 | rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); | |
585 | ||
586 | rt->rt_space += delta; | |
587 | } | |
588 | ||
93cf2076 | 589 | static range_seg_t * |
9bd274dd | 590 | range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) |
93cf2076 | 591 | { |
ca577779 | 592 | range_seg_max_t rsearch; |
93cf2076 GW |
593 | uint64_t end = start + size; |
594 | ||
93cf2076 GW |
595 | VERIFY(size != 0); |
596 | ||
ca577779 PD |
597 | rs_set_start(&rsearch, rt, start); |
598 | rs_set_end(&rsearch, rt, end); | |
599 | return (zfs_btree_find(&rt->rt_root, &rsearch, NULL)); | |
9bd274dd | 600 | } |
93cf2076 | 601 | |
d4a72f23 | 602 | range_seg_t * |
9bd274dd MA |
603 | range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) |
604 | { | |
ca577779 PD |
605 | if (rt->rt_type == RANGE_SEG64) |
606 | ASSERT3U(start + size, >, start); | |
607 | ||
9bd274dd | 608 | range_seg_t *rs = range_tree_find_impl(rt, start, size); |
ca577779 PD |
609 | if (rs != NULL && rs_get_start(rs, rt) <= start && |
610 | rs_get_end(rs, rt) >= start + size) { | |
93cf2076 | 611 | return (rs); |
ca577779 | 612 | } |
93cf2076 GW |
613 | return (NULL); |
614 | } | |
615 | ||
616 | void | |
df72b8be | 617 | range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size) |
93cf2076 | 618 | { |
df72b8be | 619 | range_seg_t *rs = range_tree_find(rt, off, size); |
93cf2076 | 620 | if (rs != NULL) |
df72b8be | 621 | panic("segment already in tree; rs=%p", (void *)rs); |
93cf2076 GW |
622 | } |
623 | ||
624 | boolean_t | |
625 | range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size) | |
626 | { | |
9bd274dd MA |
627 | return (range_tree_find(rt, start, size) != NULL); |
628 | } | |
93cf2076 | 629 | |
c81f1790 PD |
630 | /* |
631 | * Returns the first subset of the given range which overlaps with the range | |
632 | * tree. Returns true if there is a segment in the range, and false if there | |
633 | * isn't. | |
634 | */ | |
635 | boolean_t | |
636 | range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, | |
637 | uint64_t *ostart, uint64_t *osize) | |
638 | { | |
ca577779 PD |
639 | if (rt->rt_type == RANGE_SEG64) |
640 | ASSERT3U(start + size, >, start); | |
c81f1790 | 641 | |
ca577779 PD |
642 | range_seg_max_t rsearch; |
643 | rs_set_start(&rsearch, rt, start); | |
644 | rs_set_end_raw(&rsearch, rt, rs_get_start_raw(&rsearch, rt) + 1); | |
645 | ||
646 | zfs_btree_index_t where; | |
647 | range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); | |
c81f1790 PD |
648 | if (rs != NULL) { |
649 | *ostart = start; | |
ca577779 | 650 | *osize = MIN(size, rs_get_end(rs, rt) - start); |
c81f1790 PD |
651 | return (B_TRUE); |
652 | } | |
653 | ||
ca577779 PD |
654 | rs = zfs_btree_next(&rt->rt_root, &where, &where); |
655 | if (rs == NULL || rs_get_start(rs, rt) > start + size) | |
c81f1790 PD |
656 | return (B_FALSE); |
657 | ||
ca577779 PD |
658 | *ostart = rs_get_start(rs, rt); |
659 | *osize = MIN(start + size, rs_get_end(rs, rt)) - | |
660 | rs_get_start(rs, rt); | |
c81f1790 PD |
661 | return (B_TRUE); |
662 | } | |
663 | ||
9bd274dd MA |
664 | /* |
665 | * Ensure that this range is not in the tree, regardless of whether | |
666 | * it is currently in the tree. | |
667 | */ | |
668 | void | |
669 | range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size) | |
670 | { | |
671 | range_seg_t *rs; | |
672 | ||
a1d477c2 MA |
673 | if (size == 0) |
674 | return; | |
675 | ||
ca577779 PD |
676 | if (rt->rt_type == RANGE_SEG64) |
677 | ASSERT3U(start + size, >, start); | |
678 | ||
9bd274dd | 679 | while ((rs = range_tree_find_impl(rt, start, size)) != NULL) { |
ca577779 PD |
680 | uint64_t free_start = MAX(rs_get_start(rs, rt), start); |
681 | uint64_t free_end = MIN(rs_get_end(rs, rt), start + size); | |
9bd274dd MA |
682 | range_tree_remove(rt, free_start, free_end - free_start); |
683 | } | |
93cf2076 GW |
684 | } |
685 | ||
686 | void | |
687 | range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) | |
688 | { | |
689 | range_tree_t *rt; | |
690 | ||
93cf2076 | 691 | ASSERT0(range_tree_space(*rtdst)); |
ca577779 | 692 | ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root)); |
93cf2076 GW |
693 | |
694 | rt = *rtsrc; | |
695 | *rtsrc = *rtdst; | |
696 | *rtdst = rt; | |
697 | } | |
698 | ||
699 | void | |
700 | range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) | |
701 | { | |
d4a72f23 | 702 | if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL) |
93cf2076 GW |
703 | rt->rt_ops->rtop_vacate(rt, rt->rt_arg); |
704 | ||
ca577779 PD |
705 | if (func != NULL) { |
706 | range_seg_t *rs; | |
707 | zfs_btree_index_t *cookie = NULL; | |
708 | ||
709 | while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) != | |
710 | NULL) { | |
711 | func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) - | |
712 | rs_get_start(rs, rt)); | |
713 | } | |
714 | } else { | |
715 | zfs_btree_clear(&rt->rt_root); | |
93cf2076 GW |
716 | } |
717 | ||
861166b0 | 718 | memset(rt->rt_histogram, 0, sizeof (rt->rt_histogram)); |
93cf2076 GW |
719 | rt->rt_space = 0; |
720 | } | |
721 | ||
722 | void | |
723 | range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) | |
724 | { | |
ca577779 PD |
725 | zfs_btree_index_t where; |
726 | for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); | |
727 | rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) { | |
728 | func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) - | |
729 | rs_get_start(rs, rt)); | |
93e28d66 | 730 | } |
93cf2076 GW |
731 | } |
732 | ||
d4a72f23 TC |
733 | range_seg_t * |
734 | range_tree_first(range_tree_t *rt) | |
735 | { | |
ca577779 | 736 | return (zfs_btree_first(&rt->rt_root, NULL)); |
d4a72f23 TC |
737 | } |
738 | ||
93cf2076 GW |
739 | uint64_t |
740 | range_tree_space(range_tree_t *rt) | |
741 | { | |
742 | return (rt->rt_space); | |
743 | } | |
d4a72f23 | 744 | |
93e28d66 SD |
745 | uint64_t |
746 | range_tree_numsegs(range_tree_t *rt) | |
747 | { | |
ca577779 | 748 | return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root)); |
93e28d66 SD |
749 | } |
750 | ||
0dc2f70c MA |
751 | boolean_t |
752 | range_tree_is_empty(range_tree_t *rt) | |
753 | { | |
754 | ASSERT(rt != NULL); | |
755 | return (range_tree_space(rt) == 0); | |
756 | } | |
757 | ||
93e28d66 SD |
758 | /* |
759 | * Remove any overlapping ranges between the given segment [start, end) | |
760 | * from removefrom. Add non-overlapping leftovers to addto. | |
761 | */ | |
762 | void | |
763 | range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, | |
764 | range_tree_t *removefrom, range_tree_t *addto) | |
765 | { | |
ca577779 PD |
766 | zfs_btree_index_t where; |
767 | range_seg_max_t starting_rs; | |
768 | rs_set_start(&starting_rs, removefrom, start); | |
769 | rs_set_end_raw(&starting_rs, removefrom, rs_get_start_raw(&starting_rs, | |
770 | removefrom) + 1); | |
93e28d66 | 771 | |
ca577779 | 772 | range_seg_t *curr = zfs_btree_find(&removefrom->rt_root, |
93e28d66 SD |
773 | &starting_rs, &where); |
774 | ||
775 | if (curr == NULL) | |
ca577779 | 776 | curr = zfs_btree_next(&removefrom->rt_root, &where, &where); |
93e28d66 SD |
777 | |
778 | range_seg_t *next; | |
779 | for (; curr != NULL; curr = next) { | |
93e28d66 SD |
780 | if (start == end) |
781 | return; | |
782 | VERIFY3U(start, <, end); | |
783 | ||
784 | /* there is no overlap */ | |
ca577779 | 785 | if (end <= rs_get_start(curr, removefrom)) { |
93e28d66 SD |
786 | range_tree_add(addto, start, end - start); |
787 | return; | |
788 | } | |
789 | ||
ca577779 PD |
790 | uint64_t overlap_start = MAX(rs_get_start(curr, removefrom), |
791 | start); | |
792 | uint64_t overlap_end = MIN(rs_get_end(curr, removefrom), | |
793 | end); | |
93e28d66 SD |
794 | uint64_t overlap_size = overlap_end - overlap_start; |
795 | ASSERT3S(overlap_size, >, 0); | |
ca577779 PD |
796 | range_seg_max_t rs; |
797 | rs_copy(curr, &rs, removefrom); | |
798 | ||
93e28d66 SD |
799 | range_tree_remove(removefrom, overlap_start, overlap_size); |
800 | ||
801 | if (start < overlap_start) | |
802 | range_tree_add(addto, start, overlap_start - start); | |
803 | ||
804 | start = overlap_end; | |
ca577779 PD |
805 | next = zfs_btree_find(&removefrom->rt_root, &rs, &where); |
806 | /* | |
807 | * If we find something here, we only removed part of the | |
808 | * curr segment. Either there's some left at the end | |
809 | * because we've reached the end of the range we're removing, | |
810 | * or there's some left at the start because we started | |
811 | * partway through the range. Either way, we continue with | |
812 | * the loop. If it's the former, we'll return at the start of | |
813 | * the loop, and if it's the latter we'll see if there is more | |
814 | * area to process. | |
815 | */ | |
816 | if (next != NULL) { | |
817 | ASSERT(start == end || start == rs_get_end(&rs, | |
818 | removefrom)); | |
819 | } | |
820 | ||
821 | next = zfs_btree_next(&removefrom->rt_root, &where, &where); | |
93e28d66 SD |
822 | } |
823 | VERIFY3P(curr, ==, NULL); | |
824 | ||
825 | if (start != end) { | |
826 | VERIFY3U(start, <, end); | |
827 | range_tree_add(addto, start, end - start); | |
828 | } else { | |
829 | VERIFY3U(start, ==, end); | |
830 | } | |
831 | } | |
832 | ||
833 | /* | |
834 | * For each entry in rt, if it exists in removefrom, remove it | |
835 | * from removefrom. Otherwise, add it to addto. | |
836 | */ | |
837 | void | |
838 | range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, | |
839 | range_tree_t *addto) | |
840 | { | |
ca577779 PD |
841 | zfs_btree_index_t where; |
842 | for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; | |
843 | rs = zfs_btree_next(&rt->rt_root, &where, &where)) { | |
844 | range_tree_remove_xor_add_segment(rs_get_start(rs, rt), | |
845 | rs_get_end(rs, rt), removefrom, addto); | |
93e28d66 SD |
846 | } |
847 | } | |
ca577779 PD |
848 | |
849 | uint64_t | |
850 | range_tree_min(range_tree_t *rt) | |
851 | { | |
852 | range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL); | |
853 | return (rs != NULL ? rs_get_start(rs, rt) : 0); | |
854 | } | |
855 | ||
856 | uint64_t | |
857 | range_tree_max(range_tree_t *rt) | |
858 | { | |
859 | range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL); | |
860 | return (rs != NULL ? rs_get_end(rs, rt) : 0); | |
861 | } | |
862 | ||
863 | uint64_t | |
864 | range_tree_span(range_tree_t *rt) | |
865 | { | |
866 | return (range_tree_max(rt) - range_tree_min(rt)); | |
867 | } |