]>
git.proxmox.com Git - mirror_zfs-debian.git/blob - zfs/lib/libzpool/dmu_traverse.c
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #pragma ident "@(#)dmu_traverse.c 1.7 08/04/01 SMI"
28 #include <sys/zfs_context.h>
29 #include <sys/dmu_objset.h>
30 #include <sys/dmu_traverse.h>
31 #include <sys/dsl_dataset.h>
32 #include <sys/dsl_dir.h>
33 #include <sys/dsl_pool.h>
34 #include <sys/dnode.h>
37 #include <sys/dmu_impl.h>
40 #define BP_SPAN_SHIFT(level, width) ((level) * (width))
42 #define BP_EQUAL(b1, b2) \
43 (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \
44 (b1)->blk_birth == (b2)->blk_birth)
47 * Compare two bookmarks.
49 * For ADVANCE_PRE, the visitation order is:
51 * objset 0, 1, 2, ..., ZB_MAXOBJSET.
52 * object 0, 1, 2, ..., ZB_MAXOBJECT.
54 * level ZB_MAXLEVEL, ..., 2, 1, 0.
56 * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
59 * < objset, object, blkoff, -level >
61 * For ADVANCE_POST, the starting offsets aren't sequential but ending
62 * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
63 * The visitation order is:
65 * objset 1, 2, ..., ZB_MAXOBJSET, 0.
66 * object 1, 2, ..., ZB_MAXOBJECT, 0.
68 * level 0, 1, 2, ..., ZB_MAXLEVEL.
70 * and thus a valid ordering vector is:
72 * < objset - 1, object - 1, blkoff, level >
74 * Both orderings can be expressed as:
76 * < objset + bias, object + bias, blkoff, level ^ bias >
78 * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
79 * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
81 * Special case: an objset's osphys is represented as level -1 of object 0.
82 * It is always either the very first or very last block we visit in an objset.
83 * Therefore, if either bookmark's level is -1, level alone determines order.
86 compare_bookmark(zbookmark_t
*szb
, zbookmark_t
*ezb
, dnode_phys_t
*dnp
,
89 int bias
= (advance
& ADVANCE_PRE
) ? 0 : -1;
90 uint64_t sblkoff
, eblkoff
;
91 int slevel
, elevel
, wshift
;
93 if (szb
->zb_objset
+ bias
< ezb
->zb_objset
+ bias
)
96 if (szb
->zb_objset
+ bias
> ezb
->zb_objset
+ bias
)
99 slevel
= szb
->zb_level
;
100 elevel
= ezb
->zb_level
;
102 if ((slevel
| elevel
) < 0)
103 return ((slevel
^ bias
) - (elevel
^ bias
));
105 if (szb
->zb_object
+ bias
< ezb
->zb_object
+ bias
)
108 if (szb
->zb_object
+ bias
> ezb
->zb_object
+ bias
)
114 wshift
= dnp
->dn_indblkshift
- SPA_BLKPTRSHIFT
;
116 sblkoff
= (szb
->zb_blkid
- bias
) << BP_SPAN_SHIFT(slevel
, wshift
);
117 eblkoff
= (ezb
->zb_blkid
- bias
) << BP_SPAN_SHIFT(elevel
, wshift
);
119 if (sblkoff
< eblkoff
)
122 if (sblkoff
> eblkoff
)
125 return ((elevel
^ bias
) - (slevel
^ bias
));
128 #define SET_BOOKMARK(zb, objset, object, level, blkid) \
130 (zb)->zb_objset = objset; \
131 (zb)->zb_object = object; \
132 (zb)->zb_level = level; \
133 (zb)->zb_blkid = blkid; \
136 #define SET_BOOKMARK_LB(zb, level, blkid) \
138 (zb)->zb_level = level; \
139 (zb)->zb_blkid = blkid; \
143 advance_objset(zseg_t
*zseg
, uint64_t objset
, int advance
)
145 zbookmark_t
*zb
= &zseg
->seg_start
;
147 if (advance
& ADVANCE_PRE
) {
148 if (objset
>= ZB_MAXOBJSET
)
150 SET_BOOKMARK(zb
, objset
, 0, -1, 0);
152 if (objset
>= ZB_MAXOBJSET
)
154 SET_BOOKMARK(zb
, objset
, 1, 0, 0);
157 if (compare_bookmark(zb
, &zseg
->seg_end
, NULL
, advance
) > 0)
164 advance_object(zseg_t
*zseg
, uint64_t object
, int advance
)
166 zbookmark_t
*zb
= &zseg
->seg_start
;
168 if (advance
& ADVANCE_PRE
) {
169 if (object
>= ZB_MAXOBJECT
) {
170 SET_BOOKMARK(zb
, zb
->zb_objset
+ 1, 0, -1, 0);
172 SET_BOOKMARK(zb
, zb
->zb_objset
, object
, ZB_MAXLEVEL
, 0);
175 if (zb
->zb_object
== 0) {
176 SET_BOOKMARK(zb
, zb
->zb_objset
, 0, -1, 0);
178 if (object
>= ZB_MAXOBJECT
)
180 SET_BOOKMARK(zb
, zb
->zb_objset
, object
, 0, 0);
184 if (compare_bookmark(zb
, &zseg
->seg_end
, NULL
, advance
) > 0)
191 advance_from_osphys(zseg_t
*zseg
, int advance
)
193 zbookmark_t
*zb
= &zseg
->seg_start
;
195 ASSERT(zb
->zb_object
== 0);
196 ASSERT(zb
->zb_level
== -1);
197 ASSERT(zb
->zb_blkid
== 0);
199 if (advance
& ADVANCE_PRE
) {
200 SET_BOOKMARK_LB(zb
, ZB_MAXLEVEL
, 0);
202 if (zb
->zb_objset
== 0)
204 SET_BOOKMARK(zb
, zb
->zb_objset
+ 1, 1, 0, 0);
207 if (compare_bookmark(zb
, &zseg
->seg_end
, NULL
, advance
) > 0)
214 advance_block(zseg_t
*zseg
, dnode_phys_t
*dnp
, int rc
, int advance
)
216 zbookmark_t
*zb
= &zseg
->seg_start
;
217 int wshift
= dnp
->dn_indblkshift
- SPA_BLKPTRSHIFT
;
218 int maxlevel
= dnp
->dn_nlevels
- 1;
219 int level
= zb
->zb_level
;
220 uint64_t blkid
= zb
->zb_blkid
;
222 if (advance
& ADVANCE_PRE
) {
223 if (level
> 0 && rc
== 0) {
229 if ((blkid
<< BP_SPAN_SHIFT(level
, wshift
)) >
233 while (level
< maxlevel
) {
234 if (P2PHASE(blkid
, 1ULL << wshift
))
241 if (level
>= maxlevel
|| P2PHASE(blkid
+ 1, 1ULL << wshift
)) {
242 blkid
= (blkid
+ 1) << BP_SPAN_SHIFT(level
, wshift
);
249 while ((blkid
<< BP_SPAN_SHIFT(level
, wshift
)) >
251 if (level
== maxlevel
)
257 SET_BOOKMARK_LB(zb
, level
, blkid
);
259 if (compare_bookmark(zb
, &zseg
->seg_end
, dnp
, advance
) > 0)
266 * The traverse_callback function will call the function specified in th_func.
267 * In the event of an error the callee, specified by th_func, must return
268 * one of the following errors:
270 * EINTR - Indicates that the callee wants the traversal to
272 * ERESTART - The callee has acknowledged the error and would
276 traverse_callback(traverse_handle_t
*th
, zseg_t
*zseg
, traverse_blk_cache_t
*bc
)
279 * Before we issue the callback, prune against maxtxg.
281 * We prune against mintxg before we get here because it's a big win.
282 * If a given block was born in txg 37, then we know that the entire
283 * subtree below that block must have been born in txg 37 or earlier.
284 * We can therefore lop off huge branches of the tree as we go.
286 * There's no corresponding optimization for maxtxg because knowing
287 * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
288 * children. In fact, the copy-on-write design of ZFS ensures that
289 * top-level blocks will pretty much always be new.
291 * Therefore, in the name of simplicity we don't prune against
292 * maxtxg until the last possible moment -- that being right now.
294 if (bc
->bc_errno
== 0 && bc
->bc_blkptr
.blk_birth
>= zseg
->seg_maxtxg
)
298 * Debugging: verify that the order we visit things agrees with the
299 * order defined by compare_bookmark(). We don't check this for
300 * log blocks because there's no defined ordering for them; they're
301 * always visited (or not) as part of visiting the objset_phys_t.
303 if (bc
->bc_errno
== 0 && bc
!= &th
->th_zil_cache
) {
304 zbookmark_t
*zb
= &bc
->bc_bookmark
;
305 zbookmark_t
*szb
= &zseg
->seg_start
;
306 zbookmark_t
*ezb
= &zseg
->seg_end
;
307 zbookmark_t
*lzb
= &th
->th_lastcb
;
308 dnode_phys_t
*dnp
= bc
->bc_dnode
;
310 ASSERT(compare_bookmark(zb
, ezb
, dnp
, th
->th_advance
) <= 0);
311 ASSERT(compare_bookmark(zb
, szb
, dnp
, th
->th_advance
) == 0);
312 ASSERT(compare_bookmark(lzb
, zb
, dnp
, th
->th_advance
) < 0 ||
313 lzb
->zb_level
== ZB_NO_LEVEL
);
318 return (th
->th_func(bc
, th
->th_spa
, th
->th_arg
));
322 traverse_read(traverse_handle_t
*th
, traverse_blk_cache_t
*bc
, blkptr_t
*bp
,
325 zbookmark_t
*zb
= &bc
->bc_bookmark
;
333 if (BP_EQUAL(&bc
->bc_blkptr
, bp
))
338 if (bc
->bc_data
== NULL
)
341 if (BP_IS_HOLE(bp
)) {
342 ASSERT(th
->th_advance
& ADVANCE_HOLES
);
346 if (compare_bookmark(zb
, &th
->th_noread
, dnp
, 0) == 0) {
348 } else if (arc_tryread(th
->th_spa
, bp
, bc
->bc_data
) == 0) {
352 error
= zio_wait(zio_read(NULL
, th
->th_spa
, bp
, bc
->bc_data
,
353 BP_GET_LSIZE(bp
), NULL
, NULL
, ZIO_PRIORITY_SYNC_READ
,
354 th
->th_zio_flags
| ZIO_FLAG_DONT_CACHE
, zb
));
356 if (BP_SHOULD_BYTESWAP(bp
) && error
== 0)
357 (zb
->zb_level
> 0 ? byteswap_uint64_array
:
358 dmu_ot
[BP_GET_TYPE(bp
)].ot_byteswap
)(bc
->bc_data
,
364 bc
->bc_errno
= error
;
365 error
= traverse_callback(th
, NULL
, bc
);
366 ASSERT(error
== EAGAIN
|| error
== EINTR
|| error
== ERESTART
);
367 bc
->bc_blkptr
.blk_birth
= -1ULL;
370 dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
371 bc
- &th
->th_cache
[0][0], error
,
372 zb
->zb_objset
, zb
->zb_object
, zb
->zb_level
, zb
->zb_blkid
);
378 find_block(traverse_handle_t
*th
, zseg_t
*zseg
, dnode_phys_t
*dnp
, int depth
)
380 zbookmark_t
*zb
= &zseg
->seg_start
;
381 traverse_blk_cache_t
*bc
;
382 blkptr_t
*bp
= dnp
->dn_blkptr
;
384 int nbp
= dnp
->dn_nblkptr
;
385 int minlevel
= zb
->zb_level
;
386 int maxlevel
= dnp
->dn_nlevels
- 1;
387 int wshift
= dnp
->dn_indblkshift
- SPA_BLKPTRSHIFT
;
388 int bp_shift
= BP_SPAN_SHIFT(maxlevel
- minlevel
, wshift
);
389 uint64_t blkid
= zb
->zb_blkid
>> bp_shift
;
390 int do_holes
= (th
->th_advance
& ADVANCE_HOLES
) && depth
== ZB_DN_CACHE
;
393 if (minlevel
> maxlevel
|| blkid
>= nbp
)
396 for (level
= maxlevel
; level
>= minlevel
; level
--) {
397 first
= P2PHASE(blkid
, 1ULL << wshift
);
399 for (i
= first
; i
< nbp
; i
++)
400 if (bp
[i
].blk_birth
> zseg
->seg_mintxg
||
401 BP_IS_HOLE(&bp
[i
]) && do_holes
)
406 SET_BOOKMARK_LB(zb
, level
, blkid
+ (i
- first
));
410 bc
= &th
->th_cache
[depth
][level
];
412 SET_BOOKMARK(&bc
->bc_bookmark
, zb
->zb_objset
, zb
->zb_object
,
415 if (rc
= traverse_read(th
, bc
, bp
+ i
, dnp
)) {
417 SET_BOOKMARK_LB(zb
, level
, blkid
);
422 if (BP_IS_HOLE(&bp
[i
])) {
423 SET_BOOKMARK_LB(zb
, level
, blkid
);
424 th
->th_lastcb
.zb_level
= ZB_NO_LEVEL
;
431 blkid
= zb
->zb_blkid
>> bp_shift
;
438 get_dnode(traverse_handle_t
*th
, uint64_t objset
, dnode_phys_t
*mdn
,
439 uint64_t *objectp
, dnode_phys_t
**dnpp
, uint64_t txg
, int type
, int depth
)
442 zbookmark_t
*zb
= &zseg
.seg_start
;
443 uint64_t object
= *objectp
;
446 SET_BOOKMARK(zb
, objset
, 0, 0, object
/ DNODES_PER_BLOCK
);
447 SET_BOOKMARK(&zseg
.seg_end
, objset
, 0, 0, ZB_MAXBLKID
);
449 zseg
.seg_mintxg
= txg
;
450 zseg
.seg_maxtxg
= -1ULL;
453 rc
= find_block(th
, &zseg
, mdn
, depth
);
455 if (rc
== EAGAIN
|| rc
== EINTR
|| rc
== ERANGE
)
458 if (rc
== 0 && zb
->zb_level
== 0) {
459 dnode_phys_t
*dnp
= th
->th_cache
[depth
][0].bc_data
;
460 for (i
= 0; i
< DNODES_PER_BLOCK
; i
++) {
461 object
= (zb
->zb_blkid
* DNODES_PER_BLOCK
) + i
;
462 if (object
>= *objectp
&&
463 dnp
[i
].dn_type
!= DMU_OT_NONE
&&
464 (type
== -1 || dnp
[i
].dn_type
== type
)) {
472 rc
= advance_block(&zseg
, mdn
, rc
, ADVANCE_PRE
);
479 *objectp
= ZB_MAXOBJECT
;
486 traverse_zil_block(zilog_t
*zilog
, blkptr_t
*bp
, void *arg
, uint64_t claim_txg
)
488 traverse_handle_t
*th
= arg
;
489 traverse_blk_cache_t
*bc
= &th
->th_zil_cache
;
490 zbookmark_t
*zb
= &bc
->bc_bookmark
;
491 zseg_t
*zseg
= list_head(&th
->th_seglist
);
493 if (bp
->blk_birth
<= zseg
->seg_mintxg
)
496 if (claim_txg
!= 0 || bp
->blk_birth
< spa_first_txg(th
->th_spa
)) {
498 zb
->zb_blkid
= bp
->blk_cksum
.zc_word
[ZIL_ZC_SEQ
];
500 (void) traverse_callback(th
, zseg
, bc
);
506 traverse_zil_record(zilog_t
*zilog
, lr_t
*lrc
, void *arg
, uint64_t claim_txg
)
508 traverse_handle_t
*th
= arg
;
509 traverse_blk_cache_t
*bc
= &th
->th_zil_cache
;
510 zbookmark_t
*zb
= &bc
->bc_bookmark
;
511 zseg_t
*zseg
= list_head(&th
->th_seglist
);
513 if (lrc
->lrc_txtype
== TX_WRITE
) {
514 lr_write_t
*lr
= (lr_write_t
*)lrc
;
515 blkptr_t
*bp
= &lr
->lr_blkptr
;
517 if (bp
->blk_birth
<= zseg
->seg_mintxg
)
520 if (claim_txg
!= 0 && bp
->blk_birth
>= claim_txg
) {
521 zb
->zb_object
= lr
->lr_foid
;
522 zb
->zb_blkid
= lr
->lr_offset
/ BP_GET_LSIZE(bp
);
524 (void) traverse_callback(th
, zseg
, bc
);
530 traverse_zil(traverse_handle_t
*th
, traverse_blk_cache_t
*bc
)
532 spa_t
*spa
= th
->th_spa
;
533 dsl_pool_t
*dp
= spa_get_dsl(spa
);
534 objset_phys_t
*osphys
= bc
->bc_data
;
535 zil_header_t
*zh
= &osphys
->os_zil_header
;
536 uint64_t claim_txg
= zh
->zh_claim_txg
;
539 ASSERT(bc
== &th
->th_cache
[ZB_MDN_CACHE
][ZB_MAXLEVEL
- 1]);
540 ASSERT(bc
->bc_bookmark
.zb_level
== -1);
543 * We only want to visit blocks that have been claimed but not yet
544 * replayed (or, in read-only mode, blocks that *would* be claimed).
546 if (claim_txg
== 0 && (spa_mode
& FWRITE
))
549 th
->th_zil_cache
.bc_bookmark
= bc
->bc_bookmark
;
551 zilog
= zil_alloc(dp
->dp_meta_objset
, zh
);
553 (void) zil_parse(zilog
, traverse_zil_block
, traverse_zil_record
, th
,
560 traverse_segment(traverse_handle_t
*th
, zseg_t
*zseg
, blkptr_t
*mosbp
)
562 zbookmark_t
*zb
= &zseg
->seg_start
;
563 traverse_blk_cache_t
*bc
;
564 dnode_phys_t
*dn
, *dn_tmp
;
568 dprintf("<%llu, %llu, %d, %llx>\n",
569 zb
->zb_objset
, zb
->zb_object
, zb
->zb_level
, zb
->zb_blkid
);
571 bc
= &th
->th_cache
[ZB_MOS_CACHE
][ZB_MAXLEVEL
- 1];
572 dn
= &((objset_phys_t
*)bc
->bc_data
)->os_meta_dnode
;
574 SET_BOOKMARK(&bc
->bc_bookmark
, 0, 0, -1, 0);
576 rc
= traverse_read(th
, bc
, mosbp
, dn
);
578 if (rc
) /* If we get ERESTART, we've got nowhere left to go */
579 return (rc
== ERESTART
? EINTR
: rc
);
581 ASSERT(dn
->dn_nlevels
< ZB_MAXLEVEL
);
583 if (zb
->zb_objset
!= 0) {
584 uint64_t objset
= zb
->zb_objset
;
585 dsl_dataset_phys_t
*dsp
;
587 rc
= get_dnode(th
, 0, dn
, &objset
, &dn_tmp
, 0,
588 DMU_OT_DSL_DATASET
, ZB_MOS_CACHE
);
590 if (objset
!= zb
->zb_objset
)
591 rc
= advance_objset(zseg
, objset
, th
->th_advance
);
596 dsp
= DN_BONUS(dn_tmp
);
598 bc
= &th
->th_cache
[ZB_MDN_CACHE
][ZB_MAXLEVEL
- 1];
599 dn
= &((objset_phys_t
*)bc
->bc_data
)->os_meta_dnode
;
601 SET_BOOKMARK(&bc
->bc_bookmark
, objset
, 0, -1, 0);
604 * If we're traversing an open snapshot, we know that it
605 * can't be deleted (because it's open) and it can't change
606 * (because it's a snapshot). Therefore, once we've gotten
607 * from the uberblock down to the snapshot's objset_phys_t,
608 * we no longer need to synchronize with spa_sync(); we're
609 * traversing a completely static block tree from here on.
611 if (th
->th_advance
& ADVANCE_NOLOCK
) {
612 ASSERT(th
->th_locked
);
613 rw_exit(spa_traverse_rwlock(th
->th_spa
));
617 rc
= traverse_read(th
, bc
, &dsp
->ds_bp
, dn
);
621 rc
= advance_objset(zseg
, zb
->zb_objset
+ 1,
626 if (th
->th_advance
& ADVANCE_PRUNE
)
628 MAX(zseg
->seg_mintxg
, dsp
->ds_prev_snap_txg
);
631 if (zb
->zb_level
== -1) {
632 ASSERT(zb
->zb_object
== 0);
633 ASSERT(zb
->zb_blkid
== 0);
634 ASSERT(BP_GET_TYPE(&bc
->bc_blkptr
) == DMU_OT_OBJSET
);
636 if (bc
->bc_blkptr
.blk_birth
> zseg
->seg_mintxg
) {
637 rc
= traverse_callback(th
, zseg
, bc
);
642 if ((th
->th_advance
& ADVANCE_ZIL
) &&
644 traverse_zil(th
, bc
);
647 return (advance_from_osphys(zseg
, th
->th_advance
));
650 if (zb
->zb_object
!= 0) {
651 uint64_t object
= zb
->zb_object
;
653 rc
= get_dnode(th
, zb
->zb_objset
, dn
, &object
, &dn_tmp
,
654 zseg
->seg_mintxg
, -1, ZB_MDN_CACHE
);
656 if (object
!= zb
->zb_object
)
657 rc
= advance_object(zseg
, object
, th
->th_advance
);
665 if (zb
->zb_level
== ZB_MAXLEVEL
)
666 zb
->zb_level
= dn
->dn_nlevels
- 1;
669 rc
= find_block(th
, zseg
, dn
, ZB_DN_CACHE
);
671 if (rc
== EAGAIN
|| rc
== EINTR
|| rc
== ERANGE
)
675 bc
= &th
->th_cache
[ZB_DN_CACHE
][zb
->zb_level
];
676 ASSERT(bc
->bc_dnode
== dn
);
677 ASSERT(bc
->bc_blkptr
.blk_birth
<= mosbp
->blk_birth
);
678 rc
= traverse_callback(th
, zseg
, bc
);
683 if (BP_IS_HOLE(&bc
->bc_blkptr
)) {
684 ASSERT(th
->th_advance
& ADVANCE_HOLES
);
689 rc
= advance_block(zseg
, dn
, rc
, th
->th_advance
);
695 * Give spa_sync() a chance to run.
697 if (th
->th_locked
&& spa_traverse_wanted(th
->th_spa
)) {
702 if (--worklimit
== 0)
707 rc
= advance_object(zseg
, zb
->zb_object
+ 1, th
->th_advance
);
713 * It is the caller's responsibility to ensure that the dsl_dataset_t
714 * doesn't go away during traversal.
717 traverse_dsl_dataset(dsl_dataset_t
*ds
, uint64_t txg_start
, int advance
,
718 blkptr_cb_t func
, void *arg
)
720 spa_t
*spa
= ds
->ds_dir
->dd_pool
->dp_spa
;
721 traverse_handle_t
*th
;
724 th
= traverse_init(spa
, func
, arg
, advance
, ZIO_FLAG_MUSTSUCCEED
);
726 traverse_add_objset(th
, txg_start
, -1ULL, ds
->ds_object
);
728 while ((err
= traverse_more(th
)) == EAGAIN
)
736 traverse_zvol(objset_t
*os
, int advance
, blkptr_cb_t func
, void *arg
)
738 spa_t
*spa
= dmu_objset_spa(os
);
739 traverse_handle_t
*th
;
742 th
= traverse_init(spa
, func
, arg
, advance
, ZIO_FLAG_CANFAIL
);
744 traverse_add_dnode(th
, 0, -1ULL, dmu_objset_id(os
), ZVOL_OBJ
);
746 while ((err
= traverse_more(th
)) == EAGAIN
)
754 traverse_more(traverse_handle_t
*th
)
756 zseg_t
*zseg
= list_head(&th
->th_seglist
);
757 uint64_t save_txg
; /* XXX won't be necessary with real itinerary */
758 krwlock_t
*rw
= spa_traverse_rwlock(th
->th_spa
);
759 blkptr_t
*mosbp
= spa_get_rootblkptr(th
->th_spa
);
767 save_txg
= zseg
->seg_mintxg
;
769 rw_enter(rw
, RW_READER
);
772 rc
= traverse_segment(th
, zseg
, mosbp
);
773 ASSERT(rc
== ERANGE
|| rc
== EAGAIN
|| rc
== EINTR
);
779 zseg
->seg_mintxg
= save_txg
;
782 list_remove(&th
->th_seglist
, zseg
);
783 kmem_free(zseg
, sizeof (*zseg
));
791 * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
792 * are not included. The blocks covered by this segment will all have
793 * mintxg < birth < maxtxg.
796 traverse_add_segment(traverse_handle_t
*th
, uint64_t mintxg
, uint64_t maxtxg
,
797 uint64_t sobjset
, uint64_t sobject
, int slevel
, uint64_t sblkid
,
798 uint64_t eobjset
, uint64_t eobject
, int elevel
, uint64_t eblkid
)
802 zseg
= kmem_alloc(sizeof (zseg_t
), KM_SLEEP
);
804 zseg
->seg_mintxg
= mintxg
;
805 zseg
->seg_maxtxg
= maxtxg
;
807 zseg
->seg_start
.zb_objset
= sobjset
;
808 zseg
->seg_start
.zb_object
= sobject
;
809 zseg
->seg_start
.zb_level
= slevel
;
810 zseg
->seg_start
.zb_blkid
= sblkid
;
812 zseg
->seg_end
.zb_objset
= eobjset
;
813 zseg
->seg_end
.zb_object
= eobject
;
814 zseg
->seg_end
.zb_level
= elevel
;
815 zseg
->seg_end
.zb_blkid
= eblkid
;
817 list_insert_tail(&th
->th_seglist
, zseg
);
821 traverse_add_dnode(traverse_handle_t
*th
, uint64_t mintxg
, uint64_t maxtxg
,
822 uint64_t objset
, uint64_t object
)
824 if (th
->th_advance
& ADVANCE_PRE
)
825 traverse_add_segment(th
, mintxg
, maxtxg
,
826 objset
, object
, ZB_MAXLEVEL
, 0,
827 objset
, object
, 0, ZB_MAXBLKID
);
829 traverse_add_segment(th
, mintxg
, maxtxg
,
830 objset
, object
, 0, 0,
831 objset
, object
, 0, ZB_MAXBLKID
);
835 traverse_add_objset(traverse_handle_t
*th
, uint64_t mintxg
, uint64_t maxtxg
,
838 if (th
->th_advance
& ADVANCE_PRE
)
839 traverse_add_segment(th
, mintxg
, maxtxg
,
841 objset
, ZB_MAXOBJECT
, 0, ZB_MAXBLKID
);
843 traverse_add_segment(th
, mintxg
, maxtxg
,
849 traverse_add_pool(traverse_handle_t
*th
, uint64_t mintxg
, uint64_t maxtxg
)
851 if (th
->th_advance
& ADVANCE_PRE
)
852 traverse_add_segment(th
, mintxg
, maxtxg
,
854 ZB_MAXOBJSET
, ZB_MAXOBJECT
, 0, ZB_MAXBLKID
);
856 traverse_add_segment(th
, mintxg
, maxtxg
,
862 traverse_init(spa_t
*spa
, blkptr_cb_t func
, void *arg
, int advance
,
865 traverse_handle_t
*th
;
868 th
= kmem_zalloc(sizeof (*th
), KM_SLEEP
);
873 th
->th_advance
= advance
;
874 th
->th_lastcb
.zb_level
= ZB_NO_LEVEL
;
875 th
->th_noread
.zb_level
= ZB_NO_LEVEL
;
876 th
->th_zio_flags
= zio_flags
;
878 list_create(&th
->th_seglist
, sizeof (zseg_t
),
879 offsetof(zseg_t
, seg_node
));
881 for (d
= 0; d
< ZB_DEPTH
; d
++) {
882 for (l
= 0; l
< ZB_MAXLEVEL
; l
++) {
883 if ((advance
& ADVANCE_DATA
) ||
884 l
!= 0 || d
!= ZB_DN_CACHE
)
885 th
->th_cache
[d
][l
].bc_data
=
886 zio_buf_alloc(SPA_MAXBLOCKSIZE
);
894 traverse_fini(traverse_handle_t
*th
)
899 for (d
= 0; d
< ZB_DEPTH
; d
++)
900 for (l
= 0; l
< ZB_MAXLEVEL
; l
++)
901 if (th
->th_cache
[d
][l
].bc_data
!= NULL
)
902 zio_buf_free(th
->th_cache
[d
][l
].bc_data
,
905 while ((zseg
= list_head(&th
->th_seglist
)) != NULL
) {
906 list_remove(&th
->th_seglist
, zseg
);
907 kmem_free(zseg
, sizeof (*zseg
));
910 list_destroy(&th
->th_seglist
);
912 dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
913 th
->th_hits
, th
->th_arc_hits
, th
->th_reads
, th
->th_callbacks
,
914 th
->th_syncs
, th
->th_restarts
);
916 kmem_free(th
, sizeof (*th
));