4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012 by Delphix. All rights reserved.
26 #include <sys/zfs_context.h>
27 #include <sys/dmu_objset.h>
28 #include <sys/dmu_traverse.h>
29 #include <sys/dsl_dataset.h>
30 #include <sys/dsl_dir.h>
31 #include <sys/dsl_pool.h>
32 #include <sys/dnode.h>
35 #include <sys/dmu_impl.h>
37 #include <sys/sa_impl.h>
38 #include <sys/callb.h>
/*
 * Tunable limit on prefetch depth.  traverse_impl() copies this value into
 * pd_blks_max, and traverse_prefetcher() stops issuing reads while
 * pd_blks_fetched has reached that limit.
 */
40 int zfs_pd_blks_max
= 100;
42 typedef struct prefetch_data
{
52 typedef struct traverse_data
{
57 zbookmark_t
*td_resume
;
59 prefetch_data_t
*td_pfd
;
64 static int traverse_dnode(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
65 arc_buf_t
*buf
, uint64_t objset
, uint64_t object
);
68 traverse_zil_block(zilog_t
*zilog
, blkptr_t
*bp
, void *arg
, uint64_t claim_txg
)
70 traverse_data_t
*td
= arg
;
73 if (bp
->blk_birth
== 0)
76 if (claim_txg
== 0 && bp
->blk_birth
>= spa_first_txg(td
->td_spa
))
79 SET_BOOKMARK(&zb
, td
->td_objset
, ZB_ZIL_OBJECT
, ZB_ZIL_LEVEL
,
80 bp
->blk_cksum
.zc_word
[ZIL_ZC_SEQ
]);
82 (void) td
->td_func(td
->td_spa
, zilog
, bp
, NULL
, &zb
, NULL
, td
->td_arg
);
88 traverse_zil_record(zilog_t
*zilog
, lr_t
*lrc
, void *arg
, uint64_t claim_txg
)
90 traverse_data_t
*td
= arg
;
92 if (lrc
->lrc_txtype
== TX_WRITE
) {
93 lr_write_t
*lr
= (lr_write_t
*)lrc
;
94 blkptr_t
*bp
= &lr
->lr_blkptr
;
97 if (bp
->blk_birth
== 0)
100 if (claim_txg
== 0 || bp
->blk_birth
< claim_txg
)
103 SET_BOOKMARK(&zb
, td
->td_objset
, lr
->lr_foid
,
104 ZB_ZIL_LEVEL
, lr
->lr_offset
/ BP_GET_LSIZE(bp
));
106 (void) td
->td_func(td
->td_spa
, zilog
, bp
, NULL
, &zb
, NULL
,
113 traverse_zil(traverse_data_t
*td
, zil_header_t
*zh
)
115 uint64_t claim_txg
= zh
->zh_claim_txg
;
119 * We only want to visit blocks that have been claimed but not yet
120 * replayed; plus, in read-only mode, blocks that are already stable.
122 if (claim_txg
== 0 && spa_writeable(td
->td_spa
))
125 zilog
= zil_alloc(spa_get_dsl(td
->td_spa
)->dp_meta_objset
, zh
);
127 (void) zil_parse(zilog
, traverse_zil_block
, traverse_zil_record
, td
,
133 typedef enum resume_skip
{
140 * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
141 * the block indicated by zb does not need to be visited at all. Returns
142 * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
143 * resume point. This indicates that this block should be visited but not its
144 * children (since they must have been visited in a previous traversal).
145 * Otherwise returns RESUME_SKIP_NONE.
148 resume_skip_check(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
149 const zbookmark_t
*zb
)
151 if (td
->td_resume
!= NULL
&& !ZB_IS_ZERO(td
->td_resume
)) {
153 * If we already visited this bp & everything below,
154 * don't bother doing it again.
156 if (zbookmark_is_before(dnp
, zb
, td
->td_resume
))
157 return (RESUME_SKIP_ALL
);
160 * If we found the block we're trying to resume from, zero
161 * the bookmark out to indicate that we have resumed.
163 ASSERT3U(zb
->zb_object
, <=, td
->td_resume
->zb_object
);
164 if (bcmp(zb
, td
->td_resume
, sizeof (*zb
)) == 0) {
165 bzero(td
->td_resume
, sizeof (*zb
));
166 if (td
->td_flags
& TRAVERSE_POST
)
167 return (RESUME_SKIP_CHILDREN
);
170 return (RESUME_SKIP_NONE
);
174 traverse_pause(traverse_data_t
*td
, const zbookmark_t
*zb
)
176 ASSERT(td
->td_resume
!= NULL
);
177 ASSERT3U(zb
->zb_level
, ==, 0);
178 bcopy(zb
, td
->td_resume
, sizeof (*td
->td_resume
));
182 traverse_visitbp(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
183 arc_buf_t
*pbuf
, blkptr_t
*bp
, const zbookmark_t
*zb
)
186 int err
= 0, lasterr
= 0;
187 arc_buf_t
*buf
= NULL
;
188 prefetch_data_t
*pd
= td
->td_pfd
;
189 boolean_t hard
= td
->td_flags
& TRAVERSE_HARD
;
190 boolean_t pause
= B_FALSE
;
192 switch (resume_skip_check(td
, dnp
, zb
)) {
193 case RESUME_SKIP_ALL
:
195 case RESUME_SKIP_CHILDREN
:
197 case RESUME_SKIP_NONE
:
203 if (BP_IS_HOLE(bp
)) {
204 err
= td
->td_func(td
->td_spa
, NULL
, NULL
, pbuf
, zb
, dnp
,
209 if (bp
->blk_birth
<= td
->td_min_txg
)
212 if (pd
&& !pd
->pd_exited
&&
213 ((pd
->pd_flags
& TRAVERSE_PREFETCH_DATA
) ||
214 BP_GET_TYPE(bp
) == DMU_OT_DNODE
|| BP_GET_LEVEL(bp
) > 0)) {
215 mutex_enter(&pd
->pd_mtx
);
216 ASSERT(pd
->pd_blks_fetched
>= 0);
217 while (pd
->pd_blks_fetched
== 0 && !pd
->pd_exited
)
218 cv_wait(&pd
->pd_cv
, &pd
->pd_mtx
);
219 pd
->pd_blks_fetched
--;
220 cv_broadcast(&pd
->pd_cv
);
221 mutex_exit(&pd
->pd_mtx
);
224 if (td
->td_flags
& TRAVERSE_PRE
) {
225 err
= td
->td_func(td
->td_spa
, NULL
, bp
, pbuf
, zb
, dnp
,
227 if (err
== TRAVERSE_VISIT_NO_CHILDREN
)
230 pause
= B_TRUE
; /* handle pausing at a common point */
235 if (BP_GET_LEVEL(bp
) > 0) {
236 uint32_t flags
= ARC_WAIT
;
239 int epb
= BP_GET_LSIZE(bp
) >> SPA_BLKPTRSHIFT
;
241 err
= dsl_read(NULL
, td
->td_spa
, bp
, pbuf
,
242 arc_getbuf_func
, &buf
,
243 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, zb
);
247 /* recursively visitbp() blocks below this */
249 for (i
= 0; i
< epb
; i
++, cbp
++) {
250 SET_BOOKMARK(&czb
, zb
->zb_objset
, zb
->zb_object
,
252 zb
->zb_blkid
* epb
+ i
);
253 err
= traverse_visitbp(td
, dnp
, buf
, cbp
, &czb
);
260 } else if (BP_GET_TYPE(bp
) == DMU_OT_DNODE
) {
261 uint32_t flags
= ARC_WAIT
;
263 int epb
= BP_GET_LSIZE(bp
) >> DNODE_SHIFT
;
265 err
= dsl_read(NULL
, td
->td_spa
, bp
, pbuf
,
266 arc_getbuf_func
, &buf
,
267 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, zb
);
271 /* recursively visitbp() blocks below this */
273 for (i
= 0; i
< epb
; i
++, dnp
++) {
274 err
= traverse_dnode(td
, dnp
, buf
, zb
->zb_objset
,
275 zb
->zb_blkid
* epb
+ i
);
282 } else if (BP_GET_TYPE(bp
) == DMU_OT_OBJSET
) {
283 uint32_t flags
= ARC_WAIT
;
287 err
= dsl_read_nolock(NULL
, td
->td_spa
, bp
,
288 arc_getbuf_func
, &buf
,
289 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, zb
);
294 dnp
= &osp
->os_meta_dnode
;
295 err
= traverse_dnode(td
, dnp
, buf
, zb
->zb_objset
,
296 DMU_META_DNODE_OBJECT
);
301 if (err
== 0 && arc_buf_size(buf
) >= sizeof (objset_phys_t
)) {
302 dnp
= &osp
->os_userused_dnode
;
303 err
= traverse_dnode(td
, dnp
, buf
, zb
->zb_objset
,
304 DMU_USERUSED_OBJECT
);
310 if (err
== 0 && arc_buf_size(buf
) >= sizeof (objset_phys_t
)) {
311 dnp
= &osp
->os_groupused_dnode
;
312 err
= traverse_dnode(td
, dnp
, buf
, zb
->zb_objset
,
313 DMU_GROUPUSED_OBJECT
);
318 (void) arc_buf_remove_ref(buf
, &buf
);
321 if (err
== 0 && lasterr
== 0 && (td
->td_flags
& TRAVERSE_POST
)) {
322 err
= td
->td_func(td
->td_spa
, NULL
, bp
, pbuf
, zb
, dnp
,
328 if (pause
&& td
->td_resume
!= NULL
) {
329 ASSERT3U(err
, ==, ERESTART
);
331 traverse_pause(td
, zb
);
334 return (err
!= 0 ? err
: lasterr
);
338 traverse_dnode(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
339 arc_buf_t
*buf
, uint64_t objset
, uint64_t object
)
341 int j
, err
= 0, lasterr
= 0;
343 boolean_t hard
= (td
->td_flags
& TRAVERSE_HARD
);
345 for (j
= 0; j
< dnp
->dn_nblkptr
; j
++) {
346 SET_BOOKMARK(&czb
, objset
, object
, dnp
->dn_nlevels
- 1, j
);
347 err
= traverse_visitbp(td
, dnp
, buf
,
348 (blkptr_t
*)&dnp
->dn_blkptr
[j
], &czb
);
356 if (dnp
->dn_flags
& DNODE_FLAG_SPILL_BLKPTR
) {
357 SET_BOOKMARK(&czb
, objset
,
358 object
, 0, DMU_SPILL_BLKID
);
359 err
= traverse_visitbp(td
, dnp
, buf
,
360 (blkptr_t
*)&dnp
->dn_spill
, &czb
);
367 return (err
!= 0 ? err
: lasterr
);
372 traverse_prefetcher(spa_t
*spa
, zilog_t
*zilog
, const blkptr_t
*bp
,
373 arc_buf_t
*pbuf
, const zbookmark_t
*zb
, const dnode_phys_t
*dnp
,
376 prefetch_data_t
*pfd
= arg
;
377 uint32_t aflags
= ARC_NOWAIT
| ARC_PREFETCH
;
379 ASSERT(pfd
->pd_blks_fetched
>= 0);
383 if (bp
== NULL
|| !((pfd
->pd_flags
& TRAVERSE_PREFETCH_DATA
) ||
384 BP_GET_TYPE(bp
) == DMU_OT_DNODE
|| BP_GET_LEVEL(bp
) > 0) ||
385 BP_GET_TYPE(bp
) == DMU_OT_INTENT_LOG
)
388 mutex_enter(&pfd
->pd_mtx
);
389 while (!pfd
->pd_cancel
&& pfd
->pd_blks_fetched
>= pfd
->pd_blks_max
)
390 cv_wait(&pfd
->pd_cv
, &pfd
->pd_mtx
);
391 pfd
->pd_blks_fetched
++;
392 cv_broadcast(&pfd
->pd_cv
);
393 mutex_exit(&pfd
->pd_mtx
);
395 (void) dsl_read(NULL
, spa
, bp
, pbuf
, NULL
, NULL
,
396 ZIO_PRIORITY_ASYNC_READ
,
397 ZIO_FLAG_CANFAIL
| ZIO_FLAG_SPECULATIVE
,
404 traverse_prefetch_thread(void *arg
)
406 traverse_data_t
*td_main
= arg
;
407 traverse_data_t td
= *td_main
;
410 td
.td_func
= traverse_prefetcher
;
411 td
.td_arg
= td_main
->td_pfd
;
414 SET_BOOKMARK(&czb
, td
.td_objset
,
415 ZB_ROOT_OBJECT
, ZB_ROOT_LEVEL
, ZB_ROOT_BLKID
);
416 (void) traverse_visitbp(&td
, NULL
, NULL
, td
.td_rootbp
, &czb
);
418 mutex_enter(&td_main
->td_pfd
->pd_mtx
);
419 td_main
->td_pfd
->pd_exited
= B_TRUE
;
420 cv_broadcast(&td_main
->td_pfd
->pd_cv
);
421 mutex_exit(&td_main
->td_pfd
->pd_mtx
);
425 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
426 * in syncing context).
429 traverse_impl(spa_t
*spa
, dsl_dataset_t
*ds
, uint64_t objset
, blkptr_t
*rootbp
,
430 uint64_t txg_start
, zbookmark_t
*resume
, int flags
,
431 blkptr_cb_t func
, void *arg
)
438 ASSERT(ds
== NULL
|| objset
== ds
->ds_object
);
439 ASSERT(!(flags
& TRAVERSE_PRE
) || !(flags
& TRAVERSE_POST
));
441 td
= kmem_alloc(sizeof(traverse_data_t
), KM_PUSHPAGE
);
442 pd
= kmem_zalloc(sizeof(prefetch_data_t
), KM_PUSHPAGE
);
443 czb
= kmem_alloc(sizeof(zbookmark_t
), KM_PUSHPAGE
);
446 td
->td_objset
= objset
;
447 td
->td_rootbp
= rootbp
;
448 td
->td_min_txg
= txg_start
;
449 td
->td_resume
= resume
;
453 td
->td_flags
= flags
;
455 pd
->pd_blks_max
= zfs_pd_blks_max
;
456 pd
->pd_flags
= flags
;
457 mutex_init(&pd
->pd_mtx
, NULL
, MUTEX_DEFAULT
, NULL
);
458 cv_init(&pd
->pd_cv
, NULL
, CV_DEFAULT
, NULL
);
460 /* See comment on ZIL traversal in dsl_scan_visitds. */
461 if (ds
!= NULL
&& !dsl_dataset_is_snapshot(ds
)) {
464 err
= dmu_objset_from_ds(ds
, &os
);
468 traverse_zil(td
, &os
->os_zil_header
);
471 if (!(flags
& TRAVERSE_PREFETCH
) ||
472 0 == taskq_dispatch(system_taskq
, traverse_prefetch_thread
,
474 pd
->pd_exited
= B_TRUE
;
476 SET_BOOKMARK(czb
, td
->td_objset
,
477 ZB_ROOT_OBJECT
, ZB_ROOT_LEVEL
, ZB_ROOT_BLKID
);
478 err
= traverse_visitbp(td
, NULL
, NULL
, rootbp
, czb
);
480 mutex_enter(&pd
->pd_mtx
);
481 pd
->pd_cancel
= B_TRUE
;
482 cv_broadcast(&pd
->pd_cv
);
483 while (!pd
->pd_exited
)
484 cv_wait(&pd
->pd_cv
, &pd
->pd_mtx
);
485 mutex_exit(&pd
->pd_mtx
);
487 mutex_destroy(&pd
->pd_mtx
);
488 cv_destroy(&pd
->pd_cv
);
490 kmem_free(czb
, sizeof(zbookmark_t
));
491 kmem_free(pd
, sizeof(struct prefetch_data
));
492 kmem_free(td
, sizeof(struct traverse_data
));
498 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
499 * in syncing context).
502 traverse_dataset(dsl_dataset_t
*ds
, uint64_t txg_start
, int flags
,
503 blkptr_cb_t func
, void *arg
)
505 return (traverse_impl(ds
->ds_dir
->dd_pool
->dp_spa
, ds
, ds
->ds_object
,
506 &ds
->ds_phys
->ds_bp
, txg_start
, NULL
, flags
, func
, arg
));
510 traverse_dataset_destroyed(spa_t
*spa
, blkptr_t
*blkptr
,
511 uint64_t txg_start
, zbookmark_t
*resume
, int flags
,
512 blkptr_cb_t func
, void *arg
)
514 return (traverse_impl(spa
, NULL
, ZB_DESTROYED_OBJSET
,
515 blkptr
, txg_start
, resume
, flags
, func
, arg
));
519 * NB: pool must not be changing on-disk (eg, from zdb or sync context).
522 traverse_pool(spa_t
*spa
, uint64_t txg_start
, int flags
,
523 blkptr_cb_t func
, void *arg
)
525 int err
, lasterr
= 0;
527 dsl_pool_t
*dp
= spa_get_dsl(spa
);
528 objset_t
*mos
= dp
->dp_meta_objset
;
529 boolean_t hard
= (flags
& TRAVERSE_HARD
);
532 err
= traverse_impl(spa
, NULL
, 0, spa_get_rootblkptr(spa
),
533 txg_start
, NULL
, flags
, func
, arg
);
537 /* visit each dataset */
538 for (obj
= 1; err
== 0 || (err
!= ESRCH
&& hard
);
539 err
= dmu_object_next(mos
, &obj
, FALSE
, txg_start
)) {
540 dmu_object_info_t doi
;
542 err
= dmu_object_info(mos
, obj
, &doi
);
550 if (doi
.doi_type
== DMU_OT_DSL_DATASET
) {
552 uint64_t txg
= txg_start
;
554 rw_enter(&dp
->dp_config_rwlock
, RW_READER
);
555 err
= dsl_dataset_hold_obj(dp
, obj
, FTAG
, &ds
);
556 rw_exit(&dp
->dp_config_rwlock
);
563 if (ds
->ds_phys
->ds_prev_snap_txg
> txg
)
564 txg
= ds
->ds_phys
->ds_prev_snap_txg
;
565 err
= traverse_dataset(ds
, txg
, flags
, func
, arg
);
566 dsl_dataset_rele(ds
, FTAG
);
576 return (err
!= 0 ? err
: lasterr
);
579 #if defined(_KERNEL) && defined(HAVE_SPL)
580 EXPORT_SYMBOL(traverse_dataset
);
581 EXPORT_SYMBOL(traverse_pool
);
583 module_param(zfs_pd_blks_max
, int, 0644);
584 MODULE_PARM_DESC(zfs_pd_blks_max
, "Max number of blocks to prefetch");