4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
26 #include <sys/zfs_context.h>
27 #include <sys/dmu_objset.h>
28 #include <sys/dmu_traverse.h>
29 #include <sys/dsl_dataset.h>
30 #include <sys/dsl_dir.h>
31 #include <sys/dsl_pool.h>
32 #include <sys/dnode.h>
35 #include <sys/dmu_impl.h>
37 #include <sys/sa_impl.h>
38 #include <sys/callb.h>
39 #include <sys/zfeature.h>
/*
 * Maximum number of blocks the data-prefetch thread is allowed to have
 * in flight ahead of the main traversal.  Tunable at module load time
 * (see module_param at the bottom of this file).
 */
int zfs_pd_blks_max = 100;
43 typedef struct prefetch_data
{
53 typedef struct traverse_data
{
58 zbookmark_phys_t
*td_resume
;
60 prefetch_data_t
*td_pfd
;
66 static int traverse_dnode(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
67 uint64_t objset
, uint64_t object
);
68 static void prefetch_dnode_metadata(traverse_data_t
*td
, const dnode_phys_t
*,
69 uint64_t objset
, uint64_t object
);
72 traverse_zil_block(zilog_t
*zilog
, blkptr_t
*bp
, void *arg
, uint64_t claim_txg
)
74 traverse_data_t
*td
= arg
;
80 if (claim_txg
== 0 && bp
->blk_birth
>= spa_first_txg(td
->td_spa
))
83 SET_BOOKMARK(&zb
, td
->td_objset
, ZB_ZIL_OBJECT
, ZB_ZIL_LEVEL
,
84 bp
->blk_cksum
.zc_word
[ZIL_ZC_SEQ
]);
86 (void) td
->td_func(td
->td_spa
, zilog
, bp
, &zb
, NULL
, td
->td_arg
);
92 traverse_zil_record(zilog_t
*zilog
, lr_t
*lrc
, void *arg
, uint64_t claim_txg
)
94 traverse_data_t
*td
= arg
;
96 if (lrc
->lrc_txtype
== TX_WRITE
) {
97 lr_write_t
*lr
= (lr_write_t
*)lrc
;
98 blkptr_t
*bp
= &lr
->lr_blkptr
;
104 if (claim_txg
== 0 || bp
->blk_birth
< claim_txg
)
107 SET_BOOKMARK(&zb
, td
->td_objset
, lr
->lr_foid
,
108 ZB_ZIL_LEVEL
, lr
->lr_offset
/ BP_GET_LSIZE(bp
));
110 (void) td
->td_func(td
->td_spa
, zilog
, bp
, &zb
, NULL
,
117 traverse_zil(traverse_data_t
*td
, zil_header_t
*zh
)
119 uint64_t claim_txg
= zh
->zh_claim_txg
;
123 * We only want to visit blocks that have been claimed but not yet
124 * replayed; plus, in read-only mode, blocks that are already stable.
126 if (claim_txg
== 0 && spa_writeable(td
->td_spa
))
129 zilog
= zil_alloc(spa_get_dsl(td
->td_spa
)->dp_meta_objset
, zh
);
131 (void) zil_parse(zilog
, traverse_zil_block
, traverse_zil_record
, td
,
/*
 * Verdicts from resume_skip_check(); see the comment above that function.
 * NOTE(review): enumerator lines were dropped by the extraction; names
 * reconstructed from their uses below.
 */
typedef enum resume_skip {
	RESUME_SKIP_ALL,
	RESUME_SKIP_NONE,
	RESUME_SKIP_CHILDREN
} resume_skip_t;
144 * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
145 * the block indicated by zb does not need to be visited at all. Returns
146 * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
147 * resume point. This indicates that this block should be visited but not its
148 * children (since they must have been visited in a previous traversal).
149 * Otherwise returns RESUME_SKIP_NONE.
152 resume_skip_check(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
153 const zbookmark_phys_t
*zb
)
155 if (td
->td_resume
!= NULL
&& !ZB_IS_ZERO(td
->td_resume
)) {
157 * If we already visited this bp & everything below,
158 * don't bother doing it again.
160 if (zbookmark_is_before(dnp
, zb
, td
->td_resume
))
161 return (RESUME_SKIP_ALL
);
164 * If we found the block we're trying to resume from, zero
165 * the bookmark out to indicate that we have resumed.
167 if (bcmp(zb
, td
->td_resume
, sizeof (*zb
)) == 0) {
168 bzero(td
->td_resume
, sizeof (*zb
));
169 if (td
->td_flags
& TRAVERSE_POST
)
170 return (RESUME_SKIP_CHILDREN
);
173 return (RESUME_SKIP_NONE
);
177 traverse_prefetch_metadata(traverse_data_t
*td
,
178 const blkptr_t
*bp
, const zbookmark_phys_t
*zb
)
180 uint32_t flags
= ARC_NOWAIT
| ARC_PREFETCH
;
182 if (!(td
->td_flags
& TRAVERSE_PREFETCH_METADATA
))
185 * If we are in the process of resuming, don't prefetch, because
186 * some children will not be needed (and in fact may have already
189 if (td
->td_resume
!= NULL
&& !ZB_IS_ZERO(td
->td_resume
))
191 if (BP_IS_HOLE(bp
) || bp
->blk_birth
<= td
->td_min_txg
)
193 if (BP_GET_LEVEL(bp
) == 0 && BP_GET_TYPE(bp
) != DMU_OT_DNODE
)
196 (void) arc_read(NULL
, td
->td_spa
, bp
, NULL
, NULL
,
197 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, zb
);
201 prefetch_needed(prefetch_data_t
*pfd
, const blkptr_t
*bp
)
203 ASSERT(pfd
->pd_flags
& TRAVERSE_PREFETCH_DATA
);
204 if (BP_IS_HOLE(bp
) || BP_IS_EMBEDDED(bp
) ||
205 BP_GET_TYPE(bp
) == DMU_OT_INTENT_LOG
)
211 traverse_visitbp(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
212 const blkptr_t
*bp
, const zbookmark_phys_t
*zb
)
215 arc_buf_t
*buf
= NULL
;
217 switch (resume_skip_check(td
, dnp
, zb
)) {
218 case RESUME_SKIP_ALL
:
220 case RESUME_SKIP_CHILDREN
:
222 case RESUME_SKIP_NONE
:
228 if (bp
->blk_birth
== 0) {
229 if (spa_feature_is_active(td
->td_spa
, SPA_FEATURE_HOLE_BIRTH
)) {
231 * Since this block has a birth time of 0 it must be a
232 * hole created before the SPA_FEATURE_HOLE_BIRTH
233 * feature was enabled. If SPA_FEATURE_HOLE_BIRTH
234 * was enabled before the min_txg for this traveral we
235 * know the hole must have been created before the
236 * min_txg for this traveral, so we can skip it. If
237 * SPA_FEATURE_HOLE_BIRTH was enabled after the min_txg
238 * for this traveral we cannot tell if the hole was
239 * created before or after the min_txg for this
240 * traversal, so we cannot skip it.
242 uint64_t hole_birth_enabled_txg
;
243 VERIFY(spa_feature_enabled_txg(td
->td_spa
,
244 SPA_FEATURE_HOLE_BIRTH
, &hole_birth_enabled_txg
));
245 if (hole_birth_enabled_txg
< td
->td_min_txg
)
248 } else if (bp
->blk_birth
<= td
->td_min_txg
) {
252 if (td
->td_pfd
!= NULL
&& !td
->td_pfd
->pd_exited
&&
253 prefetch_needed(td
->td_pfd
, bp
)) {
254 mutex_enter(&td
->td_pfd
->pd_mtx
);
255 ASSERT(td
->td_pfd
->pd_blks_fetched
>= 0);
256 while (td
->td_pfd
->pd_blks_fetched
== 0 &&
257 !td
->td_pfd
->pd_exited
)
258 cv_wait(&td
->td_pfd
->pd_cv
, &td
->td_pfd
->pd_mtx
);
259 td
->td_pfd
->pd_blks_fetched
--;
260 cv_broadcast(&td
->td_pfd
->pd_cv
);
261 mutex_exit(&td
->td_pfd
->pd_mtx
);
264 if (BP_IS_HOLE(bp
)) {
265 err
= td
->td_func(td
->td_spa
, NULL
, bp
, zb
, dnp
, td
->td_arg
);
271 if (td
->td_flags
& TRAVERSE_PRE
) {
272 err
= td
->td_func(td
->td_spa
, NULL
, bp
, zb
, dnp
,
274 if (err
== TRAVERSE_VISIT_NO_CHILDREN
)
280 if (BP_GET_LEVEL(bp
) > 0) {
281 uint32_t flags
= ARC_WAIT
;
283 int32_t epb
= BP_GET_LSIZE(bp
) >> SPA_BLKPTRSHIFT
;
284 zbookmark_phys_t
*czb
;
286 err
= arc_read(NULL
, td
->td_spa
, bp
, arc_getbuf_func
, &buf
,
287 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, zb
);
291 czb
= kmem_alloc(sizeof (zbookmark_phys_t
), KM_SLEEP
);
293 for (i
= 0; i
< epb
; i
++) {
294 SET_BOOKMARK(czb
, zb
->zb_objset
, zb
->zb_object
,
296 zb
->zb_blkid
* epb
+ i
);
297 traverse_prefetch_metadata(td
,
298 &((blkptr_t
*)buf
->b_data
)[i
], czb
);
301 /* recursively visitbp() blocks below this */
302 for (i
= 0; i
< epb
; i
++) {
303 SET_BOOKMARK(czb
, zb
->zb_objset
, zb
->zb_object
,
305 zb
->zb_blkid
* epb
+ i
);
306 err
= traverse_visitbp(td
, dnp
,
307 &((blkptr_t
*)buf
->b_data
)[i
], czb
);
312 kmem_free(czb
, sizeof (zbookmark_phys_t
));
314 } else if (BP_GET_TYPE(bp
) == DMU_OT_DNODE
) {
315 uint32_t flags
= ARC_WAIT
;
317 int32_t epb
= BP_GET_LSIZE(bp
) >> DNODE_SHIFT
;
319 err
= arc_read(NULL
, td
->td_spa
, bp
, arc_getbuf_func
, &buf
,
320 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, zb
);
325 for (i
= 0; i
< epb
; i
++) {
326 prefetch_dnode_metadata(td
, &dnp
[i
], zb
->zb_objset
,
327 zb
->zb_blkid
* epb
+ i
);
330 /* recursively visitbp() blocks below this */
331 for (i
= 0; i
< epb
; i
++) {
332 err
= traverse_dnode(td
, &dnp
[i
], zb
->zb_objset
,
333 zb
->zb_blkid
* epb
+ i
);
337 } else if (BP_GET_TYPE(bp
) == DMU_OT_OBJSET
) {
338 uint32_t flags
= ARC_WAIT
;
342 err
= arc_read(NULL
, td
->td_spa
, bp
, arc_getbuf_func
, &buf
,
343 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, zb
);
348 dnp
= &osp
->os_meta_dnode
;
349 prefetch_dnode_metadata(td
, dnp
, zb
->zb_objset
,
350 DMU_META_DNODE_OBJECT
);
351 if (arc_buf_size(buf
) >= sizeof (objset_phys_t
)) {
352 prefetch_dnode_metadata(td
, &osp
->os_groupused_dnode
,
353 zb
->zb_objset
, DMU_GROUPUSED_OBJECT
);
354 prefetch_dnode_metadata(td
, &osp
->os_userused_dnode
,
355 zb
->zb_objset
, DMU_USERUSED_OBJECT
);
358 err
= traverse_dnode(td
, dnp
, zb
->zb_objset
,
359 DMU_META_DNODE_OBJECT
);
360 if (err
== 0 && arc_buf_size(buf
) >= sizeof (objset_phys_t
)) {
361 dnp
= &osp
->os_groupused_dnode
;
362 err
= traverse_dnode(td
, dnp
, zb
->zb_objset
,
363 DMU_GROUPUSED_OBJECT
);
365 if (err
== 0 && arc_buf_size(buf
) >= sizeof (objset_phys_t
)) {
366 dnp
= &osp
->os_userused_dnode
;
367 err
= traverse_dnode(td
, dnp
, zb
->zb_objset
,
368 DMU_USERUSED_OBJECT
);
373 (void) arc_buf_remove_ref(buf
, &buf
);
376 if (err
== 0 && (td
->td_flags
& TRAVERSE_POST
))
377 err
= td
->td_func(td
->td_spa
, NULL
, bp
, zb
, dnp
, td
->td_arg
);
379 if ((td
->td_flags
& TRAVERSE_HARD
) && (err
== EIO
|| err
== ECKSUM
)) {
381 * Ignore this disk error as requested by the HARD flag,
382 * and continue traversal.
388 * If we are stopping here, set td_resume.
390 if (td
->td_resume
!= NULL
&& err
!= 0 && !td
->td_paused
) {
391 td
->td_resume
->zb_objset
= zb
->zb_objset
;
392 td
->td_resume
->zb_object
= zb
->zb_object
;
393 td
->td_resume
->zb_level
= 0;
395 * If we have stopped on an indirect block (e.g. due to
396 * i/o error), we have not visited anything below it.
397 * Set the bookmark to the first level-0 block that we need
398 * to visit. This way, the resuming code does not need to
399 * deal with resuming from indirect blocks.
401 td
->td_resume
->zb_blkid
= zb
->zb_blkid
<<
402 (zb
->zb_level
* (dnp
->dn_indblkshift
- SPA_BLKPTRSHIFT
));
403 td
->td_paused
= B_TRUE
;
410 prefetch_dnode_metadata(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
411 uint64_t objset
, uint64_t object
)
414 zbookmark_phys_t czb
;
416 for (j
= 0; j
< dnp
->dn_nblkptr
; j
++) {
417 SET_BOOKMARK(&czb
, objset
, object
, dnp
->dn_nlevels
- 1, j
);
418 traverse_prefetch_metadata(td
, &dnp
->dn_blkptr
[j
], &czb
);
421 if (dnp
->dn_flags
& DNODE_FLAG_SPILL_BLKPTR
) {
422 SET_BOOKMARK(&czb
, objset
, object
, 0, DMU_SPILL_BLKID
);
423 traverse_prefetch_metadata(td
, &dnp
->dn_spill
, &czb
);
428 traverse_dnode(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
429 uint64_t objset
, uint64_t object
)
432 zbookmark_phys_t czb
;
434 for (j
= 0; j
< dnp
->dn_nblkptr
; j
++) {
435 SET_BOOKMARK(&czb
, objset
, object
, dnp
->dn_nlevels
- 1, j
);
436 err
= traverse_visitbp(td
, dnp
, &dnp
->dn_blkptr
[j
], &czb
);
441 if (dnp
->dn_flags
& DNODE_FLAG_SPILL_BLKPTR
) {
442 SET_BOOKMARK(&czb
, objset
, object
, 0, DMU_SPILL_BLKID
);
443 err
= traverse_visitbp(td
, dnp
, &dnp
->dn_spill
, &czb
);
450 traverse_prefetcher(spa_t
*spa
, zilog_t
*zilog
, const blkptr_t
*bp
,
451 const zbookmark_phys_t
*zb
, const dnode_phys_t
*dnp
, void *arg
)
453 prefetch_data_t
*pfd
= arg
;
454 uint32_t aflags
= ARC_NOWAIT
| ARC_PREFETCH
;
456 ASSERT(pfd
->pd_blks_fetched
>= 0);
458 return (SET_ERROR(EINTR
));
460 if (!prefetch_needed(pfd
, bp
))
463 mutex_enter(&pfd
->pd_mtx
);
464 while (!pfd
->pd_cancel
&& pfd
->pd_blks_fetched
>= pfd
->pd_blks_max
)
465 cv_wait(&pfd
->pd_cv
, &pfd
->pd_mtx
);
466 pfd
->pd_blks_fetched
++;
467 cv_broadcast(&pfd
->pd_cv
);
468 mutex_exit(&pfd
->pd_mtx
);
470 (void) arc_read(NULL
, spa
, bp
, NULL
, NULL
, ZIO_PRIORITY_ASYNC_READ
,
471 ZIO_FLAG_CANFAIL
| ZIO_FLAG_SPECULATIVE
, &aflags
, zb
);
477 traverse_prefetch_thread(void *arg
)
479 traverse_data_t
*td_main
= arg
;
480 traverse_data_t td
= *td_main
;
481 zbookmark_phys_t czb
;
483 td
.td_func
= traverse_prefetcher
;
484 td
.td_arg
= td_main
->td_pfd
;
487 SET_BOOKMARK(&czb
, td
.td_objset
,
488 ZB_ROOT_OBJECT
, ZB_ROOT_LEVEL
, ZB_ROOT_BLKID
);
489 (void) traverse_visitbp(&td
, NULL
, td
.td_rootbp
, &czb
);
491 mutex_enter(&td_main
->td_pfd
->pd_mtx
);
492 td_main
->td_pfd
->pd_exited
= B_TRUE
;
493 cv_broadcast(&td_main
->td_pfd
->pd_cv
);
494 mutex_exit(&td_main
->td_pfd
->pd_mtx
);
498 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
499 * in syncing context).
502 traverse_impl(spa_t
*spa
, dsl_dataset_t
*ds
, uint64_t objset
, blkptr_t
*rootbp
,
503 uint64_t txg_start
, zbookmark_phys_t
*resume
, int flags
,
504 blkptr_cb_t func
, void *arg
)
508 zbookmark_phys_t
*czb
;
511 ASSERT(ds
== NULL
|| objset
== ds
->ds_object
);
512 ASSERT(!(flags
& TRAVERSE_PRE
) || !(flags
& TRAVERSE_POST
));
515 * The data prefetching mechanism (the prefetch thread) is incompatible
516 * with resuming from a bookmark.
518 ASSERT(resume
== NULL
|| !(flags
& TRAVERSE_PREFETCH_DATA
));
520 td
= kmem_alloc(sizeof (traverse_data_t
), KM_SLEEP
);
521 pd
= kmem_zalloc(sizeof (prefetch_data_t
), KM_SLEEP
);
522 czb
= kmem_alloc(sizeof (zbookmark_phys_t
), KM_SLEEP
);
525 td
->td_objset
= objset
;
526 td
->td_rootbp
= rootbp
;
527 td
->td_min_txg
= txg_start
;
528 td
->td_resume
= resume
;
532 td
->td_flags
= flags
;
533 td
->td_paused
= B_FALSE
;
535 pd
->pd_blks_max
= zfs_pd_blks_max
;
536 pd
->pd_flags
= flags
;
537 mutex_init(&pd
->pd_mtx
, NULL
, MUTEX_DEFAULT
, NULL
);
538 cv_init(&pd
->pd_cv
, NULL
, CV_DEFAULT
, NULL
);
540 SET_BOOKMARK(czb
, td
->td_objset
,
541 ZB_ROOT_OBJECT
, ZB_ROOT_LEVEL
, ZB_ROOT_BLKID
);
543 /* See comment on ZIL traversal in dsl_scan_visitds. */
544 if (ds
!= NULL
&& !dsl_dataset_is_snapshot(ds
) && !BP_IS_HOLE(rootbp
)) {
545 uint32_t flags
= ARC_WAIT
;
549 err
= arc_read(NULL
, td
->td_spa
, rootbp
,
550 arc_getbuf_func
, &buf
,
551 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, czb
);
556 traverse_zil(td
, &osp
->os_zil_header
);
557 (void) arc_buf_remove_ref(buf
, &buf
);
560 if (!(flags
& TRAVERSE_PREFETCH_DATA
) ||
561 0 == taskq_dispatch(system_taskq
, traverse_prefetch_thread
,
563 pd
->pd_exited
= B_TRUE
;
565 err
= traverse_visitbp(td
, NULL
, rootbp
, czb
);
567 mutex_enter(&pd
->pd_mtx
);
568 pd
->pd_cancel
= B_TRUE
;
569 cv_broadcast(&pd
->pd_cv
);
570 while (!pd
->pd_exited
)
571 cv_wait(&pd
->pd_cv
, &pd
->pd_mtx
);
572 mutex_exit(&pd
->pd_mtx
);
574 mutex_destroy(&pd
->pd_mtx
);
575 cv_destroy(&pd
->pd_cv
);
577 kmem_free(czb
, sizeof (zbookmark_phys_t
));
578 kmem_free(pd
, sizeof (struct prefetch_data
));
579 kmem_free(td
, sizeof (struct traverse_data
));
585 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
586 * in syncing context).
589 traverse_dataset(dsl_dataset_t
*ds
, uint64_t txg_start
, int flags
,
590 blkptr_cb_t func
, void *arg
)
592 return (traverse_impl(ds
->ds_dir
->dd_pool
->dp_spa
, ds
, ds
->ds_object
,
593 &ds
->ds_phys
->ds_bp
, txg_start
, NULL
, flags
, func
, arg
));
597 traverse_dataset_destroyed(spa_t
*spa
, blkptr_t
*blkptr
,
598 uint64_t txg_start
, zbookmark_phys_t
*resume
, int flags
,
599 blkptr_cb_t func
, void *arg
)
601 return (traverse_impl(spa
, NULL
, ZB_DESTROYED_OBJSET
,
602 blkptr
, txg_start
, resume
, flags
, func
, arg
));
606 * NB: pool must not be changing on-disk (eg, from zdb or sync context).
609 traverse_pool(spa_t
*spa
, uint64_t txg_start
, int flags
,
610 blkptr_cb_t func
, void *arg
)
614 dsl_pool_t
*dp
= spa_get_dsl(spa
);
615 objset_t
*mos
= dp
->dp_meta_objset
;
616 boolean_t hard
= (flags
& TRAVERSE_HARD
);
619 err
= traverse_impl(spa
, NULL
, 0, spa_get_rootblkptr(spa
),
620 txg_start
, NULL
, flags
, func
, arg
);
624 /* visit each dataset */
625 for (obj
= 1; err
== 0;
626 err
= dmu_object_next(mos
, &obj
, FALSE
, txg_start
)) {
627 dmu_object_info_t doi
;
629 err
= dmu_object_info(mos
, obj
, &doi
);
636 if (doi
.doi_bonus_type
== DMU_OT_DSL_DATASET
) {
638 uint64_t txg
= txg_start
;
640 dsl_pool_config_enter(dp
, FTAG
);
641 err
= dsl_dataset_hold_obj(dp
, obj
, FTAG
, &ds
);
642 dsl_pool_config_exit(dp
, FTAG
);
648 if (ds
->ds_phys
->ds_prev_snap_txg
> txg
)
649 txg
= ds
->ds_phys
->ds_prev_snap_txg
;
650 err
= traverse_dataset(ds
, txg
, flags
, func
, arg
);
651 dsl_dataset_rele(ds
, FTAG
);
/* Linux kernel-module exports and tunables (no-ops in userland builds). */
#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(traverse_dataset);
EXPORT_SYMBOL(traverse_pool);

module_param(zfs_pd_blks_max, int, 0644);
MODULE_PARM_DESC(zfs_pd_blks_max, "Max number of blocks to prefetch");
#endif