/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/range_tree.h>

/*
 * Range trees are tree-based data structures that can be used to
 * track free space or generally any space allocation information.
 * A range tree keeps track of individual segments and automatically
 * provides facilities such as adjacent extent merging and extent
 * splitting in response to range add/remove requests.
 *
 * A range tree starts out completely empty, with no segments in it.
 * Adding an allocation via range_tree_add to the range tree can either:
 * 1) create a new extent
 * 2) extend an adjacent extent
 * 3) merge two adjacent extents
 * Conversely, removing an allocation via range_tree_remove can:
 * 1) completely remove an extent
 * 2) shorten an extent (if the allocation was near one of its ends)
 * 3) split an extent into two extents, in effect punching a hole
 *
 * A range tree is also capable of 'bridging' gaps when adding
 * allocations. This is useful for cases when close proximity of
 * allocations is an important detail that needs to be represented
 * in the range tree. See range_tree_set_gap(). The default behavior
 * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
 *
 * In order to traverse a range tree, use either the range_tree_walk()
 * or range_tree_vacate() functions.
 *
 * To obtain more accurate information on individual segment
 * operations that the range tree performs "under the hood", you can
 * specify a set of callbacks by passing a range_tree_ops_t structure
 * to the range_tree_create function. Any callbacks that are non-NULL
 * are then called at the appropriate times.
 *
 * The range tree code also supports a special variant of range trees
 * that can bridge small gaps between segments. This kind of tree is used
 * by the dsl scanning code to group I/Os into mostly sequential chunks to
 * optimize disk performance. The code here attempts to do this with as
 * little memory and computational overhead as possible. One limitation of
 * this implementation is that range trees with gaps only support removing
 * complete segments.
 */

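/*
 * Illustrative sketch of the add/remove semantics described above; the
 * offsets and sizes are made up and this snippet is not part of the build:
 *
 *	range_tree_t *rt = range_tree_create(NULL, NULL);
 *
 *	range_tree_add(rt, 0, 100);	  tree holds [0, 100)
 *	range_tree_add(rt, 100, 50);	  merged into [0, 150)
 *	range_tree_remove(rt, 50, 20);	  hole punched: [0, 50), [70, 150)
 *	ASSERT3U(range_tree_space(rt), ==, 130);
 *
 *	range_tree_vacate(rt, NULL, NULL);
 *	range_tree_destroy(rt);
 */
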
kmem_cache_t *range_seg_cache;

/* Generic ops for managing an AVL tree alongside a range tree */
struct range_tree_ops rt_avl_ops = {
	.rtop_create = rt_avl_create,
	.rtop_destroy = rt_avl_destroy,
	.rtop_add = rt_avl_add,
	.rtop_remove = rt_avl_remove,
	.rtop_vacate = rt_avl_vacate,
};

void
range_tree_init(void)
{
	ASSERT(range_seg_cache == NULL);
	range_seg_cache = kmem_cache_create("range_seg_cache",
	    sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
range_tree_fini(void)
{
	kmem_cache_destroy(range_seg_cache);
	range_seg_cache = NULL;
}

void
range_tree_stat_verify(range_tree_t *rt)
{
	range_seg_t *rs;
	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
	int i;

	for (rs = avl_first(&rt->rt_root); rs != NULL;
	    rs = AVL_NEXT(&rt->rt_root, rs)) {
		uint64_t size = rs->rs_end - rs->rs_start;
		int idx = highbit64(size) - 1;

		hist[idx]++;
		ASSERT3U(hist[idx], !=, 0);
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
		if (hist[i] != rt->rt_histogram[i]) {
			zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
			    i, hist, hist[i], rt->rt_histogram[i]);
		}
		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
	}
}

static void
range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
{
	uint64_t size = rs->rs_end - rs->rs_start;
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	rt->rt_histogram[idx]++;
	ASSERT3U(rt->rt_histogram[idx], !=, 0);
}

static void
range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
{
	uint64_t size = rs->rs_end - rs->rs_start;
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	ASSERT3U(rt->rt_histogram[idx], !=, 0);
	rt->rt_histogram[idx]--;
}

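/*
 * Worked example of the histogram indexing above: a segment of size
 * 0x2000 (8K) has highbit64(0x2000) == 14, so it is counted in
 * rt_histogram[13], together with every other segment whose size falls
 * in [2^13, 2^14).
 */
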
/*
 * NOTE: caller is responsible for all locking.
 */
static int
range_tree_seg_compare(const void *x1, const void *x2)
{
	const range_seg_t *r1 = (const range_seg_t *)x1;
	const range_seg_t *r2 = (const range_seg_t *)x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

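/*
 * Worked example of the comparator above: for r1 = [10, 20) and
 * r2 = [15, 25), neither r1->rs_start >= r2->rs_end nor
 * r1->rs_end <= r2->rs_start holds, so the result is 0 - 0 == 0;
 * overlapping segments compare as equal, which is what allows
 * avl_find() to locate any segment overlapping a search range.
 * Disjoint segments yield 1 - 0 == 1 or 0 - 1 == -1 as appropriate.
 */
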
range_tree_t *
range_tree_create_impl(range_tree_ops_t *ops, void *arg,
    int (*avl_compare) (const void *, const void *), uint64_t gap)
{
	range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);

	avl_create(&rt->rt_root, range_tree_seg_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));

	rt->rt_ops = ops;
	rt->rt_gap = gap;
	rt->rt_arg = arg;
	rt->rt_avl_compare = avl_compare;

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
		rt->rt_ops->rtop_create(rt, rt->rt_arg);

	return (rt);
}

range_tree_t *
range_tree_create(range_tree_ops_t *ops, void *arg)
{
	return (range_tree_create_impl(ops, arg, NULL, 0));
}

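/*
 * Minimal usage sketch: a gap-bridging tree that mirrors its segments
 * into a secondary AVL tree through rt_avl_ops. The comparator
 * (my_size_compare) and the 512-byte gap are hypothetical values chosen
 * for illustration, not taken from any caller in this file:
 *
 *	avl_tree_t by_size;
 *	range_tree_t *rt = range_tree_create_impl(&rt_avl_ops, &by_size,
 *	    my_size_compare, 512);
 *
 * Segments added to rt are inserted into by_size via rtop_add, and any
 * add landing within 512 bytes of an existing segment is bridged into it.
 */
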
void
range_tree_destroy(range_tree_t *rt)
{
	VERIFY0(rt->rt_space);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

	avl_destroy(&rt->rt_root);
	kmem_free(rt, sizeof (*rt));
}

void
range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
{
	ASSERT3U(rs->rs_fill + delta, !=, 0);
	ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
	rs->rs_fill += delta;
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
}

static void
range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
{
	range_tree_t *rt = arg;
	avl_index_t where;
	range_seg_t rsearch, *rs_before, *rs_after, *rs;
	uint64_t end = start + size, gap = rt->rt_gap;
	uint64_t bridge_size = 0;
	boolean_t merge_before, merge_after;

	ASSERT3U(size, !=, 0);
	ASSERT3U(fill, <=, size);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	rs = avl_find(&rt->rt_root, &rsearch, &where);

	if (gap == 0 && rs != NULL &&
	    rs->rs_start <= start && rs->rs_end >= end) {
		zfs_panic_recover("zfs: allocating allocated segment "
		    "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n",
		    (longlong_t)start, (longlong_t)size,
		    (longlong_t)rs->rs_start,
		    (longlong_t)rs->rs_end - rs->rs_start);
		return;
	}

	/*
	 * If this is a gap-supporting range tree, it is possible that we
	 * are inserting into an existing segment. In this case simply
	 * bump the fill count and call the remove / add callbacks. If the
	 * new range will extend an existing segment, we remove the
	 * existing one, apply the new extent to it and re-insert it using
	 * the normal code paths.
	 */
	if (rs != NULL) {
		ASSERT3U(gap, !=, 0);
		if (rs->rs_start <= start && rs->rs_end >= end) {
			range_tree_adjust_fill(rt, rs, fill);
			return;
		}

		avl_remove(&rt->rt_root, rs);
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

		range_tree_stat_decr(rt, rs);
		rt->rt_space -= rs->rs_end - rs->rs_start;

		fill += rs->rs_fill;
		start = MIN(start, rs->rs_start);
		end = MAX(end, rs->rs_end);
		size = end - start;

		range_tree_add_impl(rt, start, size, fill);

		kmem_cache_free(range_seg_cache, rs);
		return;
	}

	ASSERT3P(rs, ==, NULL);

	/*
	 * Determine whether or not we will have to merge with our neighbors.
	 * If gap != 0, we might need to merge with our neighbors even if we
	 * aren't directly touching.
	 */
	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);

	merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap);
	merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap);

	if (merge_before && gap != 0)
		bridge_size += start - rs_before->rs_end;
	if (merge_after && gap != 0)
		bridge_size += rs_after->rs_start - end;

	if (merge_before && merge_after) {
		avl_remove(&rt->rt_root, rs_before);
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
		}

		range_tree_stat_decr(rt, rs_before);
		range_tree_stat_decr(rt, rs_after);

		rs_after->rs_fill += rs_before->rs_fill + fill;
		rs_after->rs_start = rs_before->rs_start;
		kmem_cache_free(range_seg_cache, rs_before);
		rs = rs_after;
	} else if (merge_before) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);

		range_tree_stat_decr(rt, rs_before);

		rs_before->rs_fill += fill;
		rs_before->rs_end = end;
		rs = rs_before;
	} else if (merge_after) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);

		range_tree_stat_decr(rt, rs_after);

		rs_after->rs_fill += fill;
		rs_after->rs_start = start;
		rs = rs_after;
	} else {
		rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);

		rs->rs_fill = fill;
		rs->rs_start = start;
		rs->rs_end = end;
		avl_insert(&rt->rt_root, rs, where);
	}

	if (gap != 0)
		ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start);
	else
		ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	range_tree_stat_incr(rt, rs);
	rt->rt_space += size + bridge_size;
}

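/*
 * Worked example of the bridging logic above, assuming gap == 3: with
 * the tree holding [0, 10) (fill 10), range_tree_add_impl(rt, 12, 8, 8)
 * finds no overlapping segment, but rs_before->rs_end (10) >=
 * start - gap (9), so merge_before fires with bridge_size = 12 - 10 = 2.
 * The result is a single segment [0, 20) with rs_fill == 18, and
 * rt_space grows by size + bridge_size == 10.
 */
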
void
range_tree_add(void *arg, uint64_t start, uint64_t size)
{
	range_tree_add_impl(arg, start, size, size);
}

static void
range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
    boolean_t do_fill)
{
	avl_index_t where;
	range_seg_t rsearch, *rs, *newseg;
	uint64_t end = start + size;
	boolean_t left_over, right_over;

	VERIFY3U(size, !=, 0);
	VERIFY3U(size, <=, rt->rt_space);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	rs = avl_find(&rt->rt_root, &rsearch, &where);

	/* Make sure we completely overlap with someone */
	if (rs == NULL) {
		zfs_panic_recover("zfs: freeing free segment "
		    "(offset=%llu size=%llu)",
		    (longlong_t)start, (longlong_t)size);
		return;
	}

	/*
	 * Range trees with gap support must only remove complete segments
	 * from the tree. This allows us to maintain accurate fill accounting
	 * and to ensure that bridged sections are not leaked. If we need to
	 * remove less than the full segment, we can only adjust the fill
	 * count.
	 */
	if (rt->rt_gap != 0) {
		if (do_fill) {
			if (rs->rs_fill == size) {
				start = rs->rs_start;
				end = rs->rs_end;
				size = end - start;
			} else {
				range_tree_adjust_fill(rt, rs, -size);
				return;
			}
		} else if (rs->rs_start != start || rs->rs_end != end) {
			zfs_panic_recover("zfs: freeing partial segment of "
			    "gap tree (offset=%llu size=%llu) of "
			    "(offset=%llu size=%llu)",
			    (longlong_t)start, (longlong_t)size,
			    (longlong_t)rs->rs_start,
			    (longlong_t)rs->rs_end - rs->rs_start);
			return;
		}
	}

	VERIFY3U(rs->rs_start, <=, start);
	VERIFY3U(rs->rs_end, >=, end);

	left_over = (rs->rs_start != start);
	right_over = (rs->rs_end != end);

	range_tree_stat_decr(rt, rs);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	if (left_over && right_over) {
		newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
		newseg->rs_start = end;
		newseg->rs_end = rs->rs_end;
		newseg->rs_fill = newseg->rs_end - newseg->rs_start;
		range_tree_stat_incr(rt, newseg);

		rs->rs_end = start;

		avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
			rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
	} else if (left_over) {
		rs->rs_end = start;
	} else if (right_over) {
		rs->rs_start = end;
	} else {
		avl_remove(&rt->rt_root, rs);
		kmem_cache_free(range_seg_cache, rs);
		rs = NULL;
	}

	if (rs != NULL) {
		/*
		 * The fill of the leftover segment will always be equal to
		 * the size, since we do not support removing partial segments
		 * of range trees with gaps.
		 */
		rs->rs_fill = rs->rs_end - rs->rs_start;
		range_tree_stat_incr(rt, rs);

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
			rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
	}

	rt->rt_space -= size;
}

void
range_tree_remove(void *arg, uint64_t start, uint64_t size)
{
	range_tree_remove_impl(arg, start, size, B_FALSE);
}

void
range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_tree_remove_impl(rt, start, size, B_TRUE);
}

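/*
 * Sketch of the two removal flavors on a gap tree (illustrative values):
 * given a bridged segment [0, 20) with rs_fill == 18,
 * range_tree_remove_fill(rt, 12, 8) merely drops the fill to 10 and the
 * segment stays in the tree, while range_tree_remove(rt, 0, 20) removes
 * the whole segment. Removing a strict subrange of a gap-tree segment
 * trips zfs_panic_recover(), per the comment in range_tree_remove_impl().
 */
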
void
range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
    uint64_t newstart, uint64_t newsize)
{
	int64_t delta = newsize - (rs->rs_end - rs->rs_start);

	range_tree_stat_decr(rt, rs);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	rs->rs_start = newstart;
	rs->rs_end = newstart + newsize;

	range_tree_stat_incr(rt, rs);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	rt->rt_space += delta;
}

static range_seg_t *
range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_seg_t rsearch;
	uint64_t end = start + size;

	VERIFY(size != 0);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	return (avl_find(&rt->rt_root, &rsearch, NULL));
}

range_seg_t *
range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_seg_t *rs = range_tree_find_impl(rt, start, size);
	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
		return (rs);
	return (NULL);
}

void
range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
{
	range_seg_t *rs;

	rs = range_tree_find(rt, off, size);
	if (rs != NULL)
		panic("freeing free block; rs=%p", (void *)rs);
}

boolean_t
range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
{
	return (range_tree_find(rt, start, size) != NULL);
}

/*
 * Ensure that this range is not in the tree, regardless of how much
 * of it is currently present.
 */
void
range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_seg_t *rs;

	if (size == 0)
		return;

	while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
		uint64_t free_start = MAX(rs->rs_start, start);
		uint64_t free_end = MIN(rs->rs_end, start + size);
		range_tree_remove(rt, free_start, free_end - free_start);
	}
}

void
range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
{
	range_tree_t *rt;

	ASSERT0(range_tree_space(*rtdst));
	ASSERT0(avl_numnodes(&(*rtdst)->rt_root));

	rt = *rtsrc;
	*rtsrc = *rtdst;
	*rtdst = rt;
}

void
range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
	range_seg_t *rs;
	void *cookie = NULL;

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);

	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
		if (func != NULL)
			func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
		kmem_cache_free(range_seg_cache, rs);
	}

	bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
	rt->rt_space = 0;
}

void
range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
	range_seg_t *rs;

	for (rs = avl_first(&rt->rt_root); rs != NULL;
	    rs = AVL_NEXT(&rt->rt_root, rs))
		func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
}

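/*
 * Sketch of a range_tree_func_t callback suitable for range_tree_walk()
 * or range_tree_vacate(); dump_seg is a hypothetical helper, not part of
 * this file:
 *
 *	static void
 *	dump_seg(void *arg, uint64_t start, uint64_t size)
 *	{
 *		zfs_dbgmsg("seg [%llu, %llu)", (longlong_t)start,
 *		    (longlong_t)(start + size));
 *	}
 *
 *	range_tree_walk(rt, dump_seg, NULL);
 */
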
range_seg_t *
range_tree_first(range_tree_t *rt)
{
	return (avl_first(&rt->rt_root));
}

uint64_t
range_tree_space(range_tree_t *rt)
{
	return (rt->rt_space);
}

boolean_t
range_tree_is_empty(range_tree_t *rt)
{
	ASSERT(rt != NULL);
	return (range_tree_space(rt) == 0);
}

/* Generic range tree functions for maintaining segments in an AVL tree. */
void
rt_avl_create(range_tree_t *rt, void *arg)
{
	avl_tree_t *tree = arg;

	avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t),
	    offsetof(range_seg_t, rs_pp_node));
}

void
rt_avl_destroy(range_tree_t *rt, void *arg)
{
	avl_tree_t *tree = arg;

	ASSERT0(avl_numnodes(tree));
	avl_destroy(tree);
}

void
rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	avl_tree_t *tree = arg;
	avl_add(tree, rs);
}

void
rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	avl_tree_t *tree = arg;
	avl_remove(tree, rs);
}

void
rt_avl_vacate(range_tree_t *rt, void *arg)
{
	/*
	 * Normally one would walk the tree freeing nodes along the way.
	 * Since the nodes are shared with the range trees we can avoid
	 * walking all nodes and just reinitialize the avl tree. The nodes
	 * will be freed by the range tree, so we don't want to free them
	 * here.
	 */
	rt_avl_create(rt, arg);
}

uint64_t
range_tree_min(range_tree_t *rt)
{
	range_seg_t *rs = avl_first(&rt->rt_root);
	return (rs != NULL ? rs->rs_start : 0);
}

uint64_t
range_tree_max(range_tree_t *rt)
{
	range_seg_t *rs = avl_last(&rt->rt_root);
	return (rs != NULL ? rs->rs_end : 0);
}

uint64_t
range_tree_span(range_tree_t *rt)
{
	return (range_tree_max(rt) - range_tree_min(rt));
}
));