]>
git.proxmox.com Git - mirror_zfs.git/blob - module/zfs/space_map.c
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 * Copyright (c) 2012 by Delphix. All rights reserved.
29 #include <sys/zfs_context.h>
33 #include <sys/space_map.h>
35 static kmem_cache_t
*space_seg_cache
;
40 ASSERT(space_seg_cache
== NULL
);
41 space_seg_cache
= kmem_cache_create("space_seg_cache",
42 sizeof (space_seg_t
), 0, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
48 kmem_cache_destroy(space_seg_cache
);
49 space_seg_cache
= NULL
;
54 * NOTE: caller is responsible for all locking.
57 space_map_seg_compare(const void *x1
, const void *x2
)
59 const space_seg_t
*s1
= x1
;
60 const space_seg_t
*s2
= x2
;
62 if (s1
->ss_start
< s2
->ss_start
) {
63 if (s1
->ss_end
> s2
->ss_start
)
67 if (s1
->ss_start
> s2
->ss_start
) {
68 if (s1
->ss_start
< s2
->ss_end
)
76 space_map_create(space_map_t
*sm
, uint64_t start
, uint64_t size
, uint8_t shift
,
79 bzero(sm
, sizeof (*sm
));
81 cv_init(&sm
->sm_load_cv
, NULL
, CV_DEFAULT
, NULL
);
83 avl_create(&sm
->sm_root
, space_map_seg_compare
,
84 sizeof (space_seg_t
), offsetof(struct space_seg
, ss_node
));
93 space_map_destroy(space_map_t
*sm
)
95 ASSERT(!sm
->sm_loaded
&& !sm
->sm_loading
);
96 VERIFY0(sm
->sm_space
);
97 avl_destroy(&sm
->sm_root
);
98 cv_destroy(&sm
->sm_load_cv
);
102 space_map_add(space_map_t
*sm
, uint64_t start
, uint64_t size
)
105 space_seg_t
*ss_before
, *ss_after
, *ss
;
106 uint64_t end
= start
+ size
;
107 int merge_before
, merge_after
;
109 ASSERT(MUTEX_HELD(sm
->sm_lock
));
110 VERIFY(!sm
->sm_condensing
);
112 VERIFY3U(start
, >=, sm
->sm_start
);
113 VERIFY3U(end
, <=, sm
->sm_start
+ sm
->sm_size
);
114 VERIFY(sm
->sm_space
+ size
<= sm
->sm_size
);
115 VERIFY(P2PHASE(start
, 1ULL << sm
->sm_shift
) == 0);
116 VERIFY(P2PHASE(size
, 1ULL << sm
->sm_shift
) == 0);
118 ss
= space_map_find(sm
, start
, size
, &where
);
120 zfs_panic_recover("zfs: allocating allocated segment"
121 "(offset=%llu size=%llu)\n",
122 (longlong_t
)start
, (longlong_t
)size
);
126 /* Make sure we don't overlap with either of our neighbors */
129 ss_before
= avl_nearest(&sm
->sm_root
, where
, AVL_BEFORE
);
130 ss_after
= avl_nearest(&sm
->sm_root
, where
, AVL_AFTER
);
132 merge_before
= (ss_before
!= NULL
&& ss_before
->ss_end
== start
);
133 merge_after
= (ss_after
!= NULL
&& ss_after
->ss_start
== end
);
135 if (merge_before
&& merge_after
) {
136 avl_remove(&sm
->sm_root
, ss_before
);
137 if (sm
->sm_pp_root
) {
138 avl_remove(sm
->sm_pp_root
, ss_before
);
139 avl_remove(sm
->sm_pp_root
, ss_after
);
141 ss_after
->ss_start
= ss_before
->ss_start
;
142 kmem_cache_free(space_seg_cache
, ss_before
);
144 } else if (merge_before
) {
145 ss_before
->ss_end
= end
;
147 avl_remove(sm
->sm_pp_root
, ss_before
);
149 } else if (merge_after
) {
150 ss_after
->ss_start
= start
;
152 avl_remove(sm
->sm_pp_root
, ss_after
);
155 ss
= kmem_cache_alloc(space_seg_cache
, KM_PUSHPAGE
);
156 ss
->ss_start
= start
;
158 avl_insert(&sm
->sm_root
, ss
, where
);
162 avl_add(sm
->sm_pp_root
, ss
);
164 sm
->sm_space
+= size
;
168 space_map_remove(space_map_t
*sm
, uint64_t start
, uint64_t size
)
171 space_seg_t
*ss
, *newseg
;
172 uint64_t end
= start
+ size
;
173 int left_over
, right_over
;
175 VERIFY(!sm
->sm_condensing
);
176 ss
= space_map_find(sm
, start
, size
, &where
);
178 /* Make sure we completely overlap with someone */
180 zfs_panic_recover("zfs: freeing free segment "
181 "(offset=%llu size=%llu)",
182 (longlong_t
)start
, (longlong_t
)size
);
185 VERIFY3U(ss
->ss_start
, <=, start
);
186 VERIFY3U(ss
->ss_end
, >=, end
);
187 VERIFY(sm
->sm_space
- size
<= sm
->sm_size
);
189 left_over
= (ss
->ss_start
!= start
);
190 right_over
= (ss
->ss_end
!= end
);
193 avl_remove(sm
->sm_pp_root
, ss
);
195 if (left_over
&& right_over
) {
196 newseg
= kmem_cache_alloc(space_seg_cache
, KM_PUSHPAGE
);
197 newseg
->ss_start
= end
;
198 newseg
->ss_end
= ss
->ss_end
;
200 avl_insert_here(&sm
->sm_root
, newseg
, ss
, AVL_AFTER
);
202 avl_add(sm
->sm_pp_root
, newseg
);
203 } else if (left_over
) {
205 } else if (right_over
) {
208 avl_remove(&sm
->sm_root
, ss
);
209 kmem_cache_free(space_seg_cache
, ss
);
213 if (sm
->sm_pp_root
&& ss
!= NULL
)
214 avl_add(sm
->sm_pp_root
, ss
);
216 sm
->sm_space
-= size
;
220 space_map_find(space_map_t
*sm
, uint64_t start
, uint64_t size
,
223 space_seg_t ssearch
, *ss
;
225 ASSERT(MUTEX_HELD(sm
->sm_lock
));
227 VERIFY(P2PHASE(start
, 1ULL << sm
->sm_shift
) == 0);
228 VERIFY(P2PHASE(size
, 1ULL << sm
->sm_shift
) == 0);
230 ssearch
.ss_start
= start
;
231 ssearch
.ss_end
= start
+ size
;
232 ss
= avl_find(&sm
->sm_root
, &ssearch
, wherep
);
234 if (ss
!= NULL
&& ss
->ss_start
<= start
&& ss
->ss_end
>= start
+ size
)
240 space_map_contains(space_map_t
*sm
, uint64_t start
, uint64_t size
)
244 return (space_map_find(sm
, start
, size
, &where
) != 0);
248 space_map_swap(space_map_t
**msrc
, space_map_t
**mdst
)
252 ASSERT(MUTEX_HELD((*msrc
)->sm_lock
));
253 ASSERT0((*mdst
)->sm_space
);
254 ASSERT0(avl_numnodes(&(*mdst
)->sm_root
));
262 space_map_vacate(space_map_t
*sm
, space_map_func_t
*func
, space_map_t
*mdest
)
267 ASSERT(MUTEX_HELD(sm
->sm_lock
));
269 while ((ss
= avl_destroy_nodes(&sm
->sm_root
, &cookie
)) != NULL
) {
271 func(mdest
, ss
->ss_start
, ss
->ss_end
- ss
->ss_start
);
272 kmem_cache_free(space_seg_cache
, ss
);
278 space_map_walk(space_map_t
*sm
, space_map_func_t
*func
, space_map_t
*mdest
)
282 ASSERT(MUTEX_HELD(sm
->sm_lock
));
284 for (ss
= avl_first(&sm
->sm_root
); ss
; ss
= AVL_NEXT(&sm
->sm_root
, ss
))
285 func(mdest
, ss
->ss_start
, ss
->ss_end
- ss
->ss_start
);
289 * Wait for any in-progress space_map_load() to complete.
292 space_map_load_wait(space_map_t
*sm
)
294 ASSERT(MUTEX_HELD(sm
->sm_lock
));
296 while (sm
->sm_loading
) {
297 ASSERT(!sm
->sm_loaded
);
298 cv_wait(&sm
->sm_load_cv
, sm
->sm_lock
);
303 * Note: space_map_load() will drop sm_lock across dmu_read() calls.
304 * The caller must be OK with this.
307 space_map_load(space_map_t
*sm
, space_map_ops_t
*ops
, uint8_t maptype
,
308 space_map_obj_t
*smo
, objset_t
*os
)
310 uint64_t *entry
, *entry_map
, *entry_map_end
;
311 uint64_t bufsize
, size
, offset
, end
, space
;
312 uint64_t mapstart
= sm
->sm_start
;
315 ASSERT(MUTEX_HELD(sm
->sm_lock
));
316 ASSERT(!sm
->sm_loaded
);
317 ASSERT(!sm
->sm_loading
);
319 sm
->sm_loading
= B_TRUE
;
320 end
= smo
->smo_objsize
;
321 space
= smo
->smo_alloc
;
323 ASSERT(sm
->sm_ops
== NULL
);
324 VERIFY0(sm
->sm_space
);
326 if (maptype
== SM_FREE
) {
327 space_map_add(sm
, sm
->sm_start
, sm
->sm_size
);
328 space
= sm
->sm_size
- space
;
331 bufsize
= 1ULL << SPACE_MAP_BLOCKSHIFT
;
332 entry_map
= zio_buf_alloc(bufsize
);
334 mutex_exit(sm
->sm_lock
);
336 dmu_prefetch(os
, smo
->smo_object
, bufsize
, end
- bufsize
);
337 mutex_enter(sm
->sm_lock
);
339 for (offset
= 0; offset
< end
; offset
+= bufsize
) {
340 size
= MIN(end
- offset
, bufsize
);
341 VERIFY(P2PHASE(size
, sizeof (uint64_t)) == 0);
344 dprintf("object=%llu offset=%llx size=%llx\n",
345 smo
->smo_object
, offset
, size
);
347 mutex_exit(sm
->sm_lock
);
348 error
= dmu_read(os
, smo
->smo_object
, offset
, size
, entry_map
,
350 mutex_enter(sm
->sm_lock
);
354 entry_map_end
= entry_map
+ (size
/ sizeof (uint64_t));
355 for (entry
= entry_map
; entry
< entry_map_end
; entry
++) {
358 if (SM_DEBUG_DECODE(e
)) /* Skip debug entries */
361 (SM_TYPE_DECODE(e
) == maptype
?
362 space_map_add
: space_map_remove
)(sm
,
363 (SM_OFFSET_DECODE(e
) << sm
->sm_shift
) + mapstart
,
364 SM_RUN_DECODE(e
) << sm
->sm_shift
);
369 VERIFY3U(sm
->sm_space
, ==, space
);
371 sm
->sm_loaded
= B_TRUE
;
376 space_map_vacate(sm
, NULL
, NULL
);
379 zio_buf_free(entry_map
, bufsize
);
381 sm
->sm_loading
= B_FALSE
;
383 cv_broadcast(&sm
->sm_load_cv
);
389 space_map_unload(space_map_t
*sm
)
391 ASSERT(MUTEX_HELD(sm
->sm_lock
));
393 if (sm
->sm_loaded
&& sm
->sm_ops
!= NULL
)
394 sm
->sm_ops
->smop_unload(sm
);
396 sm
->sm_loaded
= B_FALSE
;
399 space_map_vacate(sm
, NULL
, NULL
);
403 space_map_maxsize(space_map_t
*sm
)
405 ASSERT(sm
->sm_ops
!= NULL
);
406 return (sm
->sm_ops
->smop_max(sm
));
410 space_map_alloc(space_map_t
*sm
, uint64_t size
)
414 start
= sm
->sm_ops
->smop_alloc(sm
, size
);
416 space_map_remove(sm
, start
, size
);
421 space_map_claim(space_map_t
*sm
, uint64_t start
, uint64_t size
)
423 sm
->sm_ops
->smop_claim(sm
, start
, size
);
424 space_map_remove(sm
, start
, size
);
428 space_map_free(space_map_t
*sm
, uint64_t start
, uint64_t size
)
430 space_map_add(sm
, start
, size
);
431 sm
->sm_ops
->smop_free(sm
, start
, size
);
435 * Note: space_map_sync() will drop sm_lock across dmu_write() calls.
438 space_map_sync(space_map_t
*sm
, uint8_t maptype
,
439 space_map_obj_t
*smo
, objset_t
*os
, dmu_tx_t
*tx
)
441 spa_t
*spa
= dmu_objset_spa(os
);
442 avl_tree_t
*t
= &sm
->sm_root
;
444 uint64_t bufsize
, start
, size
, run_len
, total
, sm_space
, nodes
;
445 uint64_t *entry
, *entry_map
, *entry_map_end
;
447 ASSERT(MUTEX_HELD(sm
->sm_lock
));
449 if (sm
->sm_space
== 0)
452 dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n",
453 smo
->smo_object
, dmu_tx_get_txg(tx
), spa_sync_pass(spa
),
454 maptype
== SM_ALLOC
? 'A' : 'F', avl_numnodes(&sm
->sm_root
),
457 if (maptype
== SM_ALLOC
)
458 smo
->smo_alloc
+= sm
->sm_space
;
460 smo
->smo_alloc
-= sm
->sm_space
;
462 bufsize
= (8 + avl_numnodes(&sm
->sm_root
)) * sizeof (uint64_t);
463 bufsize
= MIN(bufsize
, 1ULL << SPACE_MAP_BLOCKSHIFT
);
464 entry_map
= zio_buf_alloc(bufsize
);
465 entry_map_end
= entry_map
+ (bufsize
/ sizeof (uint64_t));
468 *entry
++ = SM_DEBUG_ENCODE(1) |
469 SM_DEBUG_ACTION_ENCODE(maptype
) |
470 SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa
)) |
471 SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx
));
474 nodes
= avl_numnodes(&sm
->sm_root
);
475 sm_space
= sm
->sm_space
;
476 for (ss
= avl_first(t
); ss
!= NULL
; ss
= AVL_NEXT(t
, ss
)) {
477 size
= ss
->ss_end
- ss
->ss_start
;
478 start
= (ss
->ss_start
- sm
->sm_start
) >> sm
->sm_shift
;
481 size
>>= sm
->sm_shift
;
484 run_len
= MIN(size
, SM_RUN_MAX
);
486 if (entry
== entry_map_end
) {
487 mutex_exit(sm
->sm_lock
);
488 dmu_write(os
, smo
->smo_object
, smo
->smo_objsize
,
489 bufsize
, entry_map
, tx
);
490 mutex_enter(sm
->sm_lock
);
491 smo
->smo_objsize
+= bufsize
;
495 *entry
++ = SM_OFFSET_ENCODE(start
) |
496 SM_TYPE_ENCODE(maptype
) |
497 SM_RUN_ENCODE(run_len
);
504 if (entry
!= entry_map
) {
505 size
= (entry
- entry_map
) * sizeof (uint64_t);
506 mutex_exit(sm
->sm_lock
);
507 dmu_write(os
, smo
->smo_object
, smo
->smo_objsize
,
508 size
, entry_map
, tx
);
509 mutex_enter(sm
->sm_lock
);
510 smo
->smo_objsize
+= size
;
514 * Ensure that the space_map's accounting wasn't changed
515 * while we were in the middle of writing it out.
517 VERIFY3U(nodes
, ==, avl_numnodes(&sm
->sm_root
));
518 VERIFY3U(sm
->sm_space
, ==, sm_space
);
519 VERIFY3U(sm
->sm_space
, ==, total
);
521 zio_buf_free(entry_map
, bufsize
);
525 space_map_truncate(space_map_obj_t
*smo
, objset_t
*os
, dmu_tx_t
*tx
)
527 VERIFY(dmu_free_range(os
, smo
->smo_object
, 0, -1ULL, tx
) == 0);
529 smo
->smo_objsize
= 0;
534 * Space map reference trees.
536 * A space map is a collection of integers. Every integer is either
537 * in the map, or it's not. A space map reference tree generalizes
538 * the idea: it allows its members to have arbitrary reference counts,
539 * as opposed to the implicit reference count of 0 or 1 in a space map.
540 * This representation comes in handy when computing the union or
541 * intersection of multiple space maps. For example, the union of
542 * N space maps is the subset of the reference tree with refcnt >= 1.
543 * The intersection of N space maps is the subset with refcnt >= N.
545 * [It's very much like a Fourier transform. Unions and intersections
546 * are hard to perform in the 'space map domain', so we convert the maps
547 * into the 'reference count domain', where it's trivial, then invert.]
549 * vdev_dtl_reassess() uses computations of this form to determine
550 * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
551 * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
552 * has an outage wherever refcnt >= vdev_children.
555 space_map_ref_compare(const void *x1
, const void *x2
)
557 const space_ref_t
*sr1
= x1
;
558 const space_ref_t
*sr2
= x2
;
560 if (sr1
->sr_offset
< sr2
->sr_offset
)
562 if (sr1
->sr_offset
> sr2
->sr_offset
)
574 space_map_ref_create(avl_tree_t
*t
)
576 avl_create(t
, space_map_ref_compare
,
577 sizeof (space_ref_t
), offsetof(space_ref_t
, sr_node
));
581 space_map_ref_destroy(avl_tree_t
*t
)
586 while ((sr
= avl_destroy_nodes(t
, &cookie
)) != NULL
)
587 kmem_free(sr
, sizeof (*sr
));
593 space_map_ref_add_node(avl_tree_t
*t
, uint64_t offset
, int64_t refcnt
)
597 sr
= kmem_alloc(sizeof (*sr
), KM_PUSHPAGE
);
598 sr
->sr_offset
= offset
;
599 sr
->sr_refcnt
= refcnt
;
605 space_map_ref_add_seg(avl_tree_t
*t
, uint64_t start
, uint64_t end
,
608 space_map_ref_add_node(t
, start
, refcnt
);
609 space_map_ref_add_node(t
, end
, -refcnt
);
613 * Convert (or add) a space map into a reference tree.
616 space_map_ref_add_map(avl_tree_t
*t
, space_map_t
*sm
, int64_t refcnt
)
620 ASSERT(MUTEX_HELD(sm
->sm_lock
));
622 for (ss
= avl_first(&sm
->sm_root
); ss
; ss
= AVL_NEXT(&sm
->sm_root
, ss
))
623 space_map_ref_add_seg(t
, ss
->ss_start
, ss
->ss_end
, refcnt
);
627 * Convert a reference tree into a space map. The space map will contain
628 * all members of the reference tree for which refcnt >= minref.
631 space_map_ref_generate_map(avl_tree_t
*t
, space_map_t
*sm
, int64_t minref
)
633 uint64_t start
= -1ULL;
637 ASSERT(MUTEX_HELD(sm
->sm_lock
));
639 space_map_vacate(sm
, NULL
, NULL
);
641 for (sr
= avl_first(t
); sr
!= NULL
; sr
= AVL_NEXT(t
, sr
)) {
642 refcnt
+= sr
->sr_refcnt
;
643 if (refcnt
>= minref
) {
644 if (start
== -1ULL) {
645 start
= sr
->sr_offset
;
648 if (start
!= -1ULL) {
649 uint64_t end
= sr
->sr_offset
;
650 ASSERT(start
<= end
);
652 space_map_add(sm
, start
, end
- start
);
658 ASSERT(start
== -1ULL);