4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012 by Delphix. All rights reserved.
26 #include <sys/zfs_context.h>
27 #include <sys/dmu_objset.h>
28 #include <sys/dmu_traverse.h>
29 #include <sys/dsl_dataset.h>
30 #include <sys/dsl_dir.h>
31 #include <sys/dsl_pool.h>
32 #include <sys/dnode.h>
35 #include <sys/dmu_impl.h>
37 #include <sys/sa_impl.h>
38 #include <sys/callb.h>
/*
 * Tunable: limit on how many blocks the prefetch callback may have issued
 * ahead of the main traversal.  Copied into prefetch_data_t.pd_blks_max at
 * the start of every traversal.
 */
int zfs_pd_blks_max = 100;
42 typedef struct prefetch_data
{
52 typedef struct traverse_data
{
57 zbookmark_t
*td_resume
;
59 prefetch_data_t
*td_pfd
;
64 static int traverse_dnode(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
65 uint64_t objset
, uint64_t object
);
66 static void prefetch_dnode_metadata(traverse_data_t
*td
, const dnode_phys_t
*,
67 uint64_t objset
, uint64_t object
);
70 traverse_zil_block(zilog_t
*zilog
, blkptr_t
*bp
, void *arg
, uint64_t claim_txg
)
72 traverse_data_t
*td
= arg
;
75 if (bp
->blk_birth
== 0)
78 if (claim_txg
== 0 && bp
->blk_birth
>= spa_first_txg(td
->td_spa
))
81 SET_BOOKMARK(&zb
, td
->td_objset
, ZB_ZIL_OBJECT
, ZB_ZIL_LEVEL
,
82 bp
->blk_cksum
.zc_word
[ZIL_ZC_SEQ
]);
84 (void) td
->td_func(td
->td_spa
, zilog
, bp
, &zb
, NULL
, td
->td_arg
);
90 traverse_zil_record(zilog_t
*zilog
, lr_t
*lrc
, void *arg
, uint64_t claim_txg
)
92 traverse_data_t
*td
= arg
;
94 if (lrc
->lrc_txtype
== TX_WRITE
) {
95 lr_write_t
*lr
= (lr_write_t
*)lrc
;
96 blkptr_t
*bp
= &lr
->lr_blkptr
;
99 if (bp
->blk_birth
== 0)
102 if (claim_txg
== 0 || bp
->blk_birth
< claim_txg
)
105 SET_BOOKMARK(&zb
, td
->td_objset
, lr
->lr_foid
,
106 ZB_ZIL_LEVEL
, lr
->lr_offset
/ BP_GET_LSIZE(bp
));
108 (void) td
->td_func(td
->td_spa
, zilog
, bp
, &zb
, NULL
,
115 traverse_zil(traverse_data_t
*td
, zil_header_t
*zh
)
117 uint64_t claim_txg
= zh
->zh_claim_txg
;
121 * We only want to visit blocks that have been claimed but not yet
122 * replayed; plus, in read-only mode, blocks that are already stable.
124 if (claim_txg
== 0 && spa_writeable(td
->td_spa
))
127 zilog
= zil_alloc(spa_get_dsl(td
->td_spa
)->dp_meta_objset
, zh
);
129 (void) zil_parse(zilog
, traverse_zil_block
, traverse_zil_record
, td
,
/*
 * Verdict of resume_skip_check() for a block while resuming a traversal.
 *
 * NOTE(review): the enumerator list is not visible in this view of the
 * file; the names are taken from their uses below -- confirm the order
 * against the original declaration.
 */
typedef enum resume_skip {
	RESUME_SKIP_ALL,	/* block and children already visited */
	RESUME_SKIP_NONE,	/* visit block and children normally */
	RESUME_SKIP_CHILDREN	/* visit block only (post-order resume) */
} resume_skip_t;
142 * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
143 * the block indicated by zb does not need to be visited at all. Returns
144 * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
145 * resume point. This indicates that this block should be visited but not its
146 * children (since they must have been visited in a previous traversal).
147 * Otherwise returns RESUME_SKIP_NONE.
150 resume_skip_check(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
151 const zbookmark_t
*zb
)
153 if (td
->td_resume
!= NULL
&& !ZB_IS_ZERO(td
->td_resume
)) {
155 * If we already visited this bp & everything below,
156 * don't bother doing it again.
158 if (zbookmark_is_before(dnp
, zb
, td
->td_resume
))
159 return (RESUME_SKIP_ALL
);
162 * If we found the block we're trying to resume from, zero
163 * the bookmark out to indicate that we have resumed.
165 ASSERT3U(zb
->zb_object
, <=, td
->td_resume
->zb_object
);
166 if (bcmp(zb
, td
->td_resume
, sizeof (*zb
)) == 0) {
167 bzero(td
->td_resume
, sizeof (*zb
));
168 if (td
->td_flags
& TRAVERSE_POST
)
169 return (RESUME_SKIP_CHILDREN
);
172 return (RESUME_SKIP_NONE
);
176 traverse_pause(traverse_data_t
*td
, const zbookmark_t
*zb
)
178 ASSERT(td
->td_resume
!= NULL
);
179 ASSERT0(zb
->zb_level
);
180 bcopy(zb
, td
->td_resume
, sizeof (*td
->td_resume
));
184 traverse_prefetch_metadata(traverse_data_t
*td
,
185 const blkptr_t
*bp
, const zbookmark_t
*zb
)
187 uint32_t flags
= ARC_NOWAIT
| ARC_PREFETCH
;
189 if (!(td
->td_flags
& TRAVERSE_PREFETCH_METADATA
))
192 * If we are in the process of resuming, don't prefetch, because
193 * some children will not be needed (and in fact may have already
196 if (td
->td_resume
!= NULL
&& !ZB_IS_ZERO(td
->td_resume
))
198 if (BP_IS_HOLE(bp
) || bp
->blk_birth
<= td
->td_min_txg
)
200 if (BP_GET_LEVEL(bp
) == 0 && BP_GET_TYPE(bp
) != DMU_OT_DNODE
)
203 (void) arc_read(NULL
, td
->td_spa
, bp
, NULL
, NULL
,
204 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, zb
);
208 traverse_visitbp(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
209 const blkptr_t
*bp
, const zbookmark_t
*zb
)
212 int err
= 0, lasterr
= 0;
213 arc_buf_t
*buf
= NULL
;
214 prefetch_data_t
*pd
= td
->td_pfd
;
215 boolean_t hard
= td
->td_flags
& TRAVERSE_HARD
;
216 boolean_t pause
= B_FALSE
;
218 switch (resume_skip_check(td
, dnp
, zb
)) {
219 case RESUME_SKIP_ALL
:
221 case RESUME_SKIP_CHILDREN
:
223 case RESUME_SKIP_NONE
:
229 if (BP_IS_HOLE(bp
)) {
230 err
= td
->td_func(td
->td_spa
, NULL
, NULL
, zb
, dnp
, td
->td_arg
);
234 if (bp
->blk_birth
<= td
->td_min_txg
)
237 if (pd
&& !pd
->pd_exited
&&
238 ((pd
->pd_flags
& TRAVERSE_PREFETCH_DATA
) ||
239 BP_GET_TYPE(bp
) == DMU_OT_DNODE
|| BP_GET_LEVEL(bp
) > 0)) {
240 mutex_enter(&pd
->pd_mtx
);
241 ASSERT(pd
->pd_blks_fetched
>= 0);
242 while (pd
->pd_blks_fetched
== 0 && !pd
->pd_exited
)
243 cv_wait(&pd
->pd_cv
, &pd
->pd_mtx
);
244 pd
->pd_blks_fetched
--;
245 cv_broadcast(&pd
->pd_cv
);
246 mutex_exit(&pd
->pd_mtx
);
249 if (td
->td_flags
& TRAVERSE_PRE
) {
250 err
= td
->td_func(td
->td_spa
, NULL
, bp
, zb
, dnp
,
252 if (err
== TRAVERSE_VISIT_NO_CHILDREN
)
255 pause
= B_TRUE
; /* handle pausing at a common point */
260 if (BP_GET_LEVEL(bp
) > 0) {
261 uint32_t flags
= ARC_WAIT
;
264 int epb
= BP_GET_LSIZE(bp
) >> SPA_BLKPTRSHIFT
;
266 err
= arc_read(NULL
, td
->td_spa
, bp
, arc_getbuf_func
, &buf
,
267 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, zb
);
272 for (i
= 0; i
< epb
; i
++) {
273 SET_BOOKMARK(&czb
, zb
->zb_objset
, zb
->zb_object
,
275 zb
->zb_blkid
* epb
+ i
);
276 traverse_prefetch_metadata(td
, &cbp
[i
], &czb
);
279 /* recursively visitbp() blocks below this */
280 for (i
= 0; i
< epb
; i
++) {
281 SET_BOOKMARK(&czb
, zb
->zb_objset
, zb
->zb_object
,
283 zb
->zb_blkid
* epb
+ i
);
284 err
= traverse_visitbp(td
, dnp
, &cbp
[i
], &czb
);
291 } else if (BP_GET_TYPE(bp
) == DMU_OT_DNODE
) {
292 uint32_t flags
= ARC_WAIT
;
294 int epb
= BP_GET_LSIZE(bp
) >> DNODE_SHIFT
;
296 err
= arc_read(NULL
, td
->td_spa
, bp
, arc_getbuf_func
, &buf
,
297 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, zb
);
302 for (i
= 0; i
< epb
; i
++) {
303 prefetch_dnode_metadata(td
, &dnp
[i
], zb
->zb_objset
,
304 zb
->zb_blkid
* epb
+ i
);
307 /* recursively visitbp() blocks below this */
308 for (i
= 0; i
< epb
; i
++) {
309 err
= traverse_dnode(td
, &dnp
[i
], zb
->zb_objset
,
310 zb
->zb_blkid
* epb
+ i
);
317 } else if (BP_GET_TYPE(bp
) == DMU_OT_OBJSET
) {
318 uint32_t flags
= ARC_WAIT
;
322 err
= arc_read(NULL
, td
->td_spa
, bp
, arc_getbuf_func
, &buf
,
323 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, zb
);
328 dnp
= &osp
->os_meta_dnode
;
329 prefetch_dnode_metadata(td
, dnp
, zb
->zb_objset
,
330 DMU_META_DNODE_OBJECT
);
331 if (arc_buf_size(buf
) >= sizeof (objset_phys_t
)) {
332 prefetch_dnode_metadata(td
, &osp
->os_userused_dnode
,
333 zb
->zb_objset
, DMU_USERUSED_OBJECT
);
334 prefetch_dnode_metadata(td
, &osp
->os_groupused_dnode
,
335 zb
->zb_objset
, DMU_USERUSED_OBJECT
);
338 err
= traverse_dnode(td
, dnp
, zb
->zb_objset
,
339 DMU_META_DNODE_OBJECT
);
344 if (err
== 0 && arc_buf_size(buf
) >= sizeof (objset_phys_t
)) {
345 dnp
= &osp
->os_userused_dnode
;
346 err
= traverse_dnode(td
, dnp
, zb
->zb_objset
,
347 DMU_USERUSED_OBJECT
);
353 if (err
== 0 && arc_buf_size(buf
) >= sizeof (objset_phys_t
)) {
354 dnp
= &osp
->os_groupused_dnode
;
355 err
= traverse_dnode(td
, dnp
, zb
->zb_objset
,
356 DMU_GROUPUSED_OBJECT
);
361 (void) arc_buf_remove_ref(buf
, &buf
);
364 if (err
== 0 && lasterr
== 0 && (td
->td_flags
& TRAVERSE_POST
)) {
365 err
= td
->td_func(td
->td_spa
, NULL
, bp
, zb
, dnp
, td
->td_arg
);
370 if (pause
&& td
->td_resume
!= NULL
) {
371 ASSERT3U(err
, ==, ERESTART
);
373 traverse_pause(td
, zb
);
376 return (err
!= 0 ? err
: lasterr
);
380 prefetch_dnode_metadata(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
381 uint64_t objset
, uint64_t object
)
386 for (j
= 0; j
< dnp
->dn_nblkptr
; j
++) {
387 SET_BOOKMARK(&czb
, objset
, object
, dnp
->dn_nlevels
- 1, j
);
388 traverse_prefetch_metadata(td
, &dnp
->dn_blkptr
[j
], &czb
);
391 if (dnp
->dn_flags
& DNODE_FLAG_SPILL_BLKPTR
) {
392 SET_BOOKMARK(&czb
, objset
, object
, 0, DMU_SPILL_BLKID
);
393 traverse_prefetch_metadata(td
, &dnp
->dn_spill
, &czb
);
398 traverse_dnode(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
399 uint64_t objset
, uint64_t object
)
401 int j
, err
= 0, lasterr
= 0;
403 boolean_t hard
= (td
->td_flags
& TRAVERSE_HARD
);
405 for (j
= 0; j
< dnp
->dn_nblkptr
; j
++) {
406 SET_BOOKMARK(&czb
, objset
, object
, dnp
->dn_nlevels
- 1, j
);
407 err
= traverse_visitbp(td
, dnp
, &dnp
->dn_blkptr
[j
], &czb
);
415 if (dnp
->dn_flags
& DNODE_FLAG_SPILL_BLKPTR
) {
416 SET_BOOKMARK(&czb
, objset
, object
, 0, DMU_SPILL_BLKID
);
417 err
= traverse_visitbp(td
, dnp
, &dnp
->dn_spill
, &czb
);
424 return (err
!= 0 ? err
: lasterr
);
429 traverse_prefetcher(spa_t
*spa
, zilog_t
*zilog
, const blkptr_t
*bp
,
430 const zbookmark_t
*zb
, const dnode_phys_t
*dnp
, void *arg
)
432 prefetch_data_t
*pfd
= arg
;
433 uint32_t aflags
= ARC_NOWAIT
| ARC_PREFETCH
;
435 ASSERT(pfd
->pd_blks_fetched
>= 0);
439 if (bp
== NULL
|| !((pfd
->pd_flags
& TRAVERSE_PREFETCH_DATA
) ||
440 BP_GET_TYPE(bp
) == DMU_OT_DNODE
|| BP_GET_LEVEL(bp
) > 0) ||
441 BP_GET_TYPE(bp
) == DMU_OT_INTENT_LOG
)
444 mutex_enter(&pfd
->pd_mtx
);
445 while (!pfd
->pd_cancel
&& pfd
->pd_blks_fetched
>= pfd
->pd_blks_max
)
446 cv_wait(&pfd
->pd_cv
, &pfd
->pd_mtx
);
447 pfd
->pd_blks_fetched
++;
448 cv_broadcast(&pfd
->pd_cv
);
449 mutex_exit(&pfd
->pd_mtx
);
451 (void) arc_read(NULL
, spa
, bp
, NULL
, NULL
, ZIO_PRIORITY_ASYNC_READ
,
452 ZIO_FLAG_CANFAIL
| ZIO_FLAG_SPECULATIVE
, &aflags
, zb
);
458 traverse_prefetch_thread(void *arg
)
460 traverse_data_t
*td_main
= arg
;
461 traverse_data_t td
= *td_main
;
464 td
.td_func
= traverse_prefetcher
;
465 td
.td_arg
= td_main
->td_pfd
;
468 SET_BOOKMARK(&czb
, td
.td_objset
,
469 ZB_ROOT_OBJECT
, ZB_ROOT_LEVEL
, ZB_ROOT_BLKID
);
470 (void) traverse_visitbp(&td
, NULL
, td
.td_rootbp
, &czb
);
472 mutex_enter(&td_main
->td_pfd
->pd_mtx
);
473 td_main
->td_pfd
->pd_exited
= B_TRUE
;
474 cv_broadcast(&td_main
->td_pfd
->pd_cv
);
475 mutex_exit(&td_main
->td_pfd
->pd_mtx
);
479 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
480 * in syncing context).
483 traverse_impl(spa_t
*spa
, dsl_dataset_t
*ds
, uint64_t objset
, blkptr_t
*rootbp
,
484 uint64_t txg_start
, zbookmark_t
*resume
, int flags
,
485 blkptr_cb_t func
, void *arg
)
492 ASSERT(ds
== NULL
|| objset
== ds
->ds_object
);
493 ASSERT(!(flags
& TRAVERSE_PRE
) || !(flags
& TRAVERSE_POST
));
496 * The data prefetching mechanism (the prefetch thread) is incompatible
497 * with resuming from a bookmark.
499 ASSERT(resume
== NULL
|| !(flags
& TRAVERSE_PREFETCH_DATA
));
501 td
= kmem_alloc(sizeof(traverse_data_t
), KM_PUSHPAGE
);
502 pd
= kmem_zalloc(sizeof(prefetch_data_t
), KM_PUSHPAGE
);
503 czb
= kmem_alloc(sizeof(zbookmark_t
), KM_PUSHPAGE
);
506 td
->td_objset
= objset
;
507 td
->td_rootbp
= rootbp
;
508 td
->td_min_txg
= txg_start
;
509 td
->td_resume
= resume
;
513 td
->td_flags
= flags
;
515 pd
->pd_blks_max
= zfs_pd_blks_max
;
516 pd
->pd_flags
= flags
;
517 mutex_init(&pd
->pd_mtx
, NULL
, MUTEX_DEFAULT
, NULL
);
518 cv_init(&pd
->pd_cv
, NULL
, CV_DEFAULT
, NULL
);
520 /* See comment on ZIL traversal in dsl_scan_visitds. */
521 if (ds
!= NULL
&& !dsl_dataset_is_snapshot(ds
) && !BP_IS_HOLE(rootbp
)) {
522 uint32_t flags
= ARC_WAIT
;
526 err
= arc_read(NULL
, td
->td_spa
, rootbp
,
527 arc_getbuf_func
, &buf
,
528 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, NULL
);
533 traverse_zil(td
, &osp
->os_zil_header
);
534 (void) arc_buf_remove_ref(buf
, &buf
);
537 if (!(flags
& TRAVERSE_PREFETCH_DATA
) ||
538 0 == taskq_dispatch(system_taskq
, traverse_prefetch_thread
,
540 pd
->pd_exited
= B_TRUE
;
542 SET_BOOKMARK(czb
, td
->td_objset
,
543 ZB_ROOT_OBJECT
, ZB_ROOT_LEVEL
, ZB_ROOT_BLKID
);
544 err
= traverse_visitbp(td
, NULL
, rootbp
, czb
);
546 mutex_enter(&pd
->pd_mtx
);
547 pd
->pd_cancel
= B_TRUE
;
548 cv_broadcast(&pd
->pd_cv
);
549 while (!pd
->pd_exited
)
550 cv_wait(&pd
->pd_cv
, &pd
->pd_mtx
);
551 mutex_exit(&pd
->pd_mtx
);
553 mutex_destroy(&pd
->pd_mtx
);
554 cv_destroy(&pd
->pd_cv
);
556 kmem_free(czb
, sizeof(zbookmark_t
));
557 kmem_free(pd
, sizeof(struct prefetch_data
));
558 kmem_free(td
, sizeof(struct traverse_data
));
564 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
565 * in syncing context).
568 traverse_dataset(dsl_dataset_t
*ds
, uint64_t txg_start
, int flags
,
569 blkptr_cb_t func
, void *arg
)
571 return (traverse_impl(ds
->ds_dir
->dd_pool
->dp_spa
, ds
, ds
->ds_object
,
572 &ds
->ds_phys
->ds_bp
, txg_start
, NULL
, flags
, func
, arg
));
576 traverse_dataset_destroyed(spa_t
*spa
, blkptr_t
*blkptr
,
577 uint64_t txg_start
, zbookmark_t
*resume
, int flags
,
578 blkptr_cb_t func
, void *arg
)
580 return (traverse_impl(spa
, NULL
, ZB_DESTROYED_OBJSET
,
581 blkptr
, txg_start
, resume
, flags
, func
, arg
));
585 * NB: pool must not be changing on-disk (eg, from zdb or sync context).
588 traverse_pool(spa_t
*spa
, uint64_t txg_start
, int flags
,
589 blkptr_cb_t func
, void *arg
)
591 int err
, lasterr
= 0;
593 dsl_pool_t
*dp
= spa_get_dsl(spa
);
594 objset_t
*mos
= dp
->dp_meta_objset
;
595 boolean_t hard
= (flags
& TRAVERSE_HARD
);
598 err
= traverse_impl(spa
, NULL
, 0, spa_get_rootblkptr(spa
),
599 txg_start
, NULL
, flags
, func
, arg
);
603 /* visit each dataset */
604 for (obj
= 1; err
== 0 || (err
!= ESRCH
&& hard
);
605 err
= dmu_object_next(mos
, &obj
, FALSE
, txg_start
)) {
606 dmu_object_info_t doi
;
608 err
= dmu_object_info(mos
, obj
, &doi
);
616 if (doi
.doi_type
== DMU_OT_DSL_DATASET
) {
618 uint64_t txg
= txg_start
;
620 dsl_pool_config_enter(dp
, FTAG
);
621 err
= dsl_dataset_hold_obj(dp
, obj
, FTAG
, &ds
);
622 dsl_pool_config_exit(dp
, FTAG
);
629 if (ds
->ds_phys
->ds_prev_snap_txg
> txg
)
630 txg
= ds
->ds_phys
->ds_prev_snap_txg
;
631 err
= traverse_dataset(ds
, txg
, flags
, func
, arg
);
632 dsl_dataset_rele(ds
, FTAG
);
642 return (err
!= 0 ? err
: lasterr
);
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Entry points made available to other kernel modules. */
EXPORT_SYMBOL(traverse_dataset);
EXPORT_SYMBOL(traverse_pool);

/* Expose the prefetch window as a module parameter (mode 0644). */
module_param(zfs_pd_blks_max, int, 0644);
MODULE_PARM_DESC(zfs_pd_blks_max, "Max number of blocks to prefetch");
#endif