4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 #include <sys/zfs_context.h>
26 #include <sys/dmu_objset.h>
27 #include <sys/dmu_traverse.h>
28 #include <sys/dsl_dataset.h>
29 #include <sys/dsl_dir.h>
30 #include <sys/dsl_pool.h>
31 #include <sys/dnode.h>
34 #include <sys/dmu_impl.h>
36 #include <sys/sa_impl.h>
37 #include <sys/callb.h>
/*
 * Upper bound on blocks the prefetch thread may have in flight ahead of
 * the traversal; copied into pd_blks_max in traverse_impl() and exported
 * as the writable zfs_pd_blks_max module parameter at the end of this file.
 */
39 int zfs_pd_blks_max
= 100;
/*
 * State shared between the main traversal and the prefetch thread
 * (block budget, cancel/exited flags, mutex and condvar).
 * NOTE(review): the member list (original lines 42-50) is missing from
 * this chunk — consult the full file before editing.
 */
41 typedef struct prefetch_data
{
/*
 * Per-traversal context: pool, root bp, bookmark origin, flags, callback,
 * and the attached prefetch state below.
 * NOTE(review): members at original lines 52-56 are missing from this
 * chunk; only td_pfd is visible here.
 */
51 typedef struct traverse_data
{
57 prefetch_data_t
/* prefetch state shared with traverse_prefetch_thread(); NULL-checked in __traverse_visitbp() */
*td_pfd
;
/*
 * Heap-resident frame for one level of the (manually unrolled) recursion
 * in __traverse_visitbp().  An array of TRAVERSE_VISITBP_MAX_DEPTH of
 * these replaces stack locals to keep kernel stack usage small (see the
 * comment above traverse_visitbp()).
 * NOTE(review): several members (original lines 66-67, 69, 71-73, 76-82:
 * tv_pbuf, tv_bp, tv_buf, tv_err, tv_lasterr, tv_hard, tv_flags, tv_epb,
 * tv_cbp, tv_czb, tv_i, tv_depth as used later) are missing from this
 * chunk — consult the full file.
 */
62 typedef struct traverse_visitbp_data
{
63 /* Function arguments */
64 traverse_data_t
*tv_td
;
65 const dnode_phys_t
*tv_dnp
;
68 const zbookmark_t
*tv_zb
;
70 prefetch_data_t
/* cached copy of tv_td->td_pfd */
*tv_pd
;
74 objset_phys_t
/* scratch pointers used while visiting DMU_OT_OBJSET blocks */
*tv_osp
;
75 dnode_phys_t
*tv_ldnp
;
83 } traverse_visitbp_data_t
;
/*
 * Forward declarations: traverse_visitbp() and traverse_dnode() are
 * mutually recursive (visitbp descends into dnode blocks, which in turn
 * visit their block pointers).
 */
85 static inline int traverse_visitbp(traverse_data_t
*td
, const
86 dnode_phys_t
*dnp
, arc_buf_t
*pbuf
, blkptr_t
*bp
, const zbookmark_t
*zb
);
87 static int traverse_dnode(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
88 arc_buf_t
*buf
, uint64_t objset
, uint64_t object
);
/*
 * zil_parse() callback for ZIL log blocks: skip holes and, when the log
 * has not been claimed (claim_txg == 0), blocks born at or after the
 * pool's first txg (presumably not yet stable — TODO confirm against the
 * full file); otherwise invoke the traversal callback with a ZIL bookmark
 * keyed by the block's checksum sequence number.
 * NOTE(review): braces, the zbookmark_t declaration, and the early
 * returns (original lines 92, 94, 97-101, 106-107) are missing from this
 * chunk.
 */
91 traverse_zil_block(zilog_t
*zilog
, blkptr_t
*bp
, void *arg
, uint64_t claim_txg
)
93 traverse_data_t
*td
= arg
;
/* a zero birth txg marks a hole (no block allocated) */
96 if (bp
->blk_birth
== 0)
99 if (claim_txg
== 0 && bp
->blk_birth
>= spa_first_txg(td
->td_spa
))
/* ZIL blocks are identified by their checksum sequence number */
102 SET_BOOKMARK(&zb
, td
->td_objset
, ZB_ZIL_OBJECT
, ZB_ZIL_LEVEL
,
103 bp
->blk_cksum
.zc_word
[ZIL_ZC_SEQ
]);
/* callback errors are deliberately ignored for ZIL traversal */
105 (void) td
->td_func(td
->td_spa
, zilog
, bp
, NULL
, &zb
, NULL
, td
->td_arg
);
/*
 * zil_parse() callback for ZIL log records: only TX_WRITE records carry
 * an embedded block pointer worth visiting.  Skip holes and blocks born
 * before the claim txg; otherwise invoke the traversal callback with a
 * bookmark derived from the record's object id and offset.
 * NOTE(review): braces, returns, and the tail of the td_func() call
 * (original lines 112, 114, 118-125, 128, 130-132) are missing from
 * this chunk.
 */
111 traverse_zil_record(zilog_t
*zilog
, lr_t
*lrc
, void *arg
, uint64_t claim_txg
)
113 traverse_data_t
*td
= arg
;
115 if (lrc
->lrc_txtype
== TX_WRITE
) {
116 lr_write_t
*lr
= (lr_write_t
*)lrc
;
117 blkptr_t
*bp
= &lr
->lr_blkptr
;
/* zero birth txg == hole */
120 if (bp
->blk_birth
== 0)
/* unclaimed log, or block predates the claim: nothing to visit */
123 if (claim_txg
== 0 || bp
->blk_birth
< claim_txg
)
/* blkid is the record offset in units of the block's logical size */
126 SET_BOOKMARK(&zb
, td
->td_objset
, lr
->lr_foid
,
127 ZB_ZIL_LEVEL
, lr
->lr_offset
/ BP_GET_LSIZE(bp
));
129 (void) td
->td_func(td
->td_spa
, zilog
, bp
, NULL
, &zb
, NULL
,
/*
 * Walk a dataset's intent log by handing the block/record callbacks
 * above to zil_parse().  Skipped entirely when the log is unclaimed on
 * a writable pool (it will be claimed or destroyed before replay).
 * NOTE(review): braces, the zilog declaration, the early return, the
 * zil_parse() tail, and zil_free() (original lines 137, 139-141, 144,
 * 146-147, 149, 151-153) are missing from this chunk.
 */
136 traverse_zil(traverse_data_t
*td
, zil_header_t
*zh
)
138 uint64_t claim_txg
= zh
->zh_claim_txg
;
142 * We only want to visit blocks that have been claimed but not yet
143 * replayed; plus, in read-only mode, blocks that are already stable.
145 if (claim_txg
== 0 && spa_writeable(td
->td_spa
))
/* a throwaway zilog is allocated purely to drive zil_parse() */
148 zilog
= zil_alloc(spa_get_dsl(td
->td_spa
)->dp_meta_objset
, zh
);
150 (void) zil_parse(zilog
, traverse_zil_block
, traverse_zil_record
, td
,
/*
 * Maximum recursion depth supported by the heap-allocated frame array in
 * traverse_visitbp(); __traverse_visitbp() asserts tv_depth stays below
 * this bound.
 */
156 #define TRAVERSE_VISITBP_MAX_DEPTH 20
/*
 * Populate one heap-resident recursion frame with the "arguments" for a
 * __traverse_visitbp() call at the given depth.
 * NOTE(review): the assignments of tv_td, tv_dnp, tv_pbuf, tv_bp, tv_zb,
 * tv_err/tv_lasterr and the braces (original lines 162-170, 175) are
 * missing from this chunk — only the tail of the initializer is visible.
 */
159 __traverse_visitbp_init(traverse_visitbp_data_t
*tv
,
160 traverse_data_t
*td
, const dnode_phys_t
*dnp
,
161 arc_buf_t
*pbuf
, blkptr_t
*bp
, const zbookmark_t
*zb
, int depth
)
171 tv
->tv_pd
= td
->td_pfd
;
172 tv
->tv_hard
= td
->td_flags
& TRAVERSE_HARD
;
/* ARC_WAIT: reads issued from this frame are synchronous */
173 tv
->tv_flags
= ARC_WAIT
;
174 tv
->tv_depth
= depth
;
/*
 * Core of the traversal.  Visits one block pointer using the heap frame
 * *tv, recursing into child frames (tv + 1) for indirect blocks, dnode
 * blocks, and objset blocks.  Honors TRAVERSE_PRE/TRAVERSE_POST callback
 * ordering, throttles against the prefetch thread via pd_blks_fetched,
 * and in TRAVERSE_HARD mode records errors in tv_lasterr and continues.
 * NOTE(review): many interleaved lines (braces, returns, some error
 * checks — e.g. original lines 179, 181, 185-187, 189-190, 202-203,
 * 207, 209-213, 221-223, 235-237, 239-240, 248-250, 258-260, 262-263,
 * 270-272, 279-280, 287, 290-291, 298-301, 303, 308-310) are missing
 * from this chunk; do not edit without the full file.
 */
178 __traverse_visitbp(traverse_visitbp_data_t
*tv
)
180 ASSERT3S(tv
->tv_depth
, <, TRAVERSE_VISITBP_MAX_DEPTH
);
/* a hole (birth txg 0) is still reported to the callback, with bp == NULL */
182 if (tv
->tv_bp
->blk_birth
== 0) {
183 tv
->tv_err
= tv
->tv_td
->td_func(tv
->tv_td
->td_spa
, NULL
, NULL
,
184 tv
->tv_pbuf
, tv
->tv_zb
, tv
->tv_dnp
, tv
->tv_td
->td_arg
);
/* skip blocks born at or before the traversal's starting txg */
188 if (tv
->tv_bp
->blk_birth
<= tv
->tv_td
->td_min_txg
)
/*
 * Throttle against the prefetch thread: wait for it to have fetched at
 * least one block we care about, then consume one unit of its budget.
 */
191 if (tv
->tv_pd
&& !tv
->tv_pd
->pd_exited
&&
192 ((tv
->tv_pd
->pd_flags
& TRAVERSE_PREFETCH_DATA
) ||
193 BP_GET_TYPE(tv
->tv_bp
) == DMU_OT_DNODE
||
194 BP_GET_LEVEL(tv
->tv_bp
) > 0)) {
195 mutex_enter(&tv
->tv_pd
->pd_mtx
);
196 ASSERT(tv
->tv_pd
->pd_blks_fetched
>= 0);
197 while (tv
->tv_pd
->pd_blks_fetched
== 0 && !tv
->tv_pd
->pd_exited
)
198 cv_wait(&tv
->tv_pd
->pd_cv
, &tv
->tv_pd
->pd_mtx
);
199 tv
->tv_pd
->pd_blks_fetched
--;
200 cv_broadcast(&tv
->tv_pd
->pd_cv
);
201 mutex_exit(&tv
->tv_pd
->pd_mtx
);
/* pre-order callback; may prune this subtree via TRAVERSE_VISIT_NO_CHILDREN */
204 if (tv
->tv_td
->td_flags
& TRAVERSE_PRE
) {
205 tv
->tv_err
= tv
->tv_td
->td_func(tv
->tv_td
->td_spa
, NULL
,
206 tv
->tv_bp
, tv
->tv_pbuf
, tv
->tv_zb
, tv
->tv_dnp
,
208 if (tv
->tv_err
== TRAVERSE_VISIT_NO_CHILDREN
)
/* indirect block: read it and visit each child block pointer */
214 if (BP_GET_LEVEL(tv
->tv_bp
) > 0) {
215 tv
->tv_epb
= BP_GET_LSIZE(tv
->tv_bp
) >> SPA_BLKPTRSHIFT
;
217 tv
->tv_err
= dsl_read(NULL
, tv
->tv_td
->td_spa
, tv
->tv_bp
,
218 tv
->tv_pbuf
, arc_getbuf_func
, &tv
->tv_buf
,
219 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
,
220 &tv
->tv_flags
, tv
->tv_zb
);
224 /* recursively visitbp() blocks below this */
225 tv
->tv_cbp
= tv
->tv_buf
->b_data
;
226 for (tv
->tv_i
= 0; tv
->tv_i
< tv
->tv_epb
;
227 tv
->tv_i
++, tv
->tv_cbp
++) {
228 SET_BOOKMARK(&tv
->tv_czb
, tv
->tv_zb
->zb_objset
,
229 tv
->tv_zb
->zb_object
, tv
->tv_zb
->zb_level
- 1,
230 tv
->tv_zb
->zb_blkid
* tv
->tv_epb
+ tv
->tv_i
);
/* "recurse" by initializing and invoking the next heap frame */
231 __traverse_visitbp_init(tv
+ 1, tv
->tv_td
,
232 tv
->tv_dnp
, tv
->tv_buf
, tv
->tv_cbp
,
233 &tv
->tv_czb
, tv
->tv_depth
+ 1);
234 tv
->tv_err
= __traverse_visitbp(tv
+ 1);
/* TRAVERSE_HARD path: remember the error, keep going */
238 tv
->tv_lasterr
= tv
->tv_err
;
/* dnode block: hand each contained dnode to traverse_dnode() */
241 } else if (BP_GET_TYPE(tv
->tv_bp
) == DMU_OT_DNODE
) {
242 tv
->tv_epb
= BP_GET_LSIZE(tv
->tv_bp
) >> DNODE_SHIFT
;
244 tv
->tv_err
= dsl_read(NULL
, tv
->tv_td
->td_spa
, tv
->tv_bp
,
245 tv
->tv_pbuf
, arc_getbuf_func
, &tv
->tv_buf
,
246 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
,
247 &tv
->tv_flags
, tv
->tv_zb
);
251 /* recursively visitbp() blocks below this */
252 tv
->tv_dnp
= tv
->tv_buf
->b_data
;
253 for (tv
->tv_i
= 0; tv
->tv_i
< tv
->tv_epb
;
254 tv
->tv_i
++, tv
->tv_dnp
++) {
255 tv
->tv_err
= traverse_dnode(tv
->tv_td
, tv
->tv_dnp
,
256 tv
->tv_buf
, tv
->tv_zb
->zb_objset
,
257 tv
->tv_zb
->zb_blkid
* tv
->tv_epb
+ tv
->tv_i
);
261 tv
->tv_lasterr
= tv
->tv_err
;
/* objset block: visit the meta dnode, then (if the objset_phys_t is
 * new enough to contain them) the userused and groupused dnodes */
264 } else if (BP_GET_TYPE(tv
->tv_bp
) == DMU_OT_OBJSET
) {
266 tv
->tv_err
= dsl_read_nolock(NULL
, tv
->tv_td
->td_spa
,
267 tv
->tv_bp
, arc_getbuf_func
, &tv
->tv_buf
,
268 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
,
269 &tv
->tv_flags
, tv
->tv_zb
);
273 tv
->tv_osp
= tv
->tv_buf
->b_data
;
274 tv
->tv_ldnp
= &tv
->tv_osp
->os_meta_dnode
;
275 tv
->tv_err
= traverse_dnode(tv
->tv_td
, tv
->tv_ldnp
, tv
->tv_buf
,
276 tv
->tv_zb
->zb_objset
, DMU_META_DNODE_OBJECT
);
277 if (tv
->tv_err
&& tv
->tv_hard
) {
278 tv
->tv_lasterr
= tv
->tv_err
;
/* older pools lack the accounting dnodes; gate on the buffer size */
281 if (tv
->tv_err
== 0 &&
282 arc_buf_size(tv
->tv_buf
) >= sizeof (objset_phys_t
)) {
283 tv
->tv_ldnp
= &tv
->tv_osp
->os_userused_dnode
;
284 tv
->tv_err
= traverse_dnode(tv
->tv_td
, tv
->tv_ldnp
,
285 tv
->tv_buf
, tv
->tv_zb
->zb_objset
,
286 DMU_USERUSED_OBJECT
);
288 if (tv
->tv_err
&& tv
->tv_hard
) {
289 tv
->tv_lasterr
= tv
->tv_err
;
292 if (tv
->tv_err
== 0 &&
293 arc_buf_size(tv
->tv_buf
) >= sizeof (objset_phys_t
)) {
294 tv
->tv_ldnp
= &tv
->tv_osp
->os_groupused_dnode
;
295 tv
->tv_err
= traverse_dnode(tv
->tv_td
, tv
->tv_ldnp
,
296 tv
->tv_buf
, tv
->tv_zb
->zb_objset
,
297 DMU_GROUPUSED_OBJECT
);
/* drop the ARC buffer acquired by the dsl_read above */
302 (void) arc_buf_remove_ref(tv
->tv_buf
, &tv
->tv_buf
);
/* post-order callback, only if the subtree was visited cleanly */
304 if (tv
->tv_err
== 0 && tv
->tv_lasterr
== 0 &&
305 (tv
->tv_td
->td_flags
& TRAVERSE_POST
)) {
306 tv
->tv_err
= tv
->tv_td
->td_func(tv
->tv_td
->td_spa
, NULL
,
307 tv
->tv_bp
, tv
->tv_pbuf
, tv
->tv_zb
, tv
->tv_dnp
,
/* prefer the most recent error; fall back to the first deferred one */
311 return (tv
->tv_err
!= 0 ? tv
->tv_err
: tv
->tv_lasterr
);
315 * Due to limited stack space recursive functions are frowned upon in
316 * the Linux kernel. However, they often are the most elegant solution
317 * to a problem. The following code preserves the recursive function
318 * traverse_visitbp() but moves the local variables AND function
319 * arguments to the heap to minimize the stack frame size. Enough
320 * space is initially allocated on the stack for 16 levels of recursion.
321 * This change does ugly-up-the-code but it reduces the worst case
322 * usage from roughly 2496 bytes to 576 bytes on x86_64 archs.
/*
 * Public (file-local) entry point for block-pointer traversal: allocate
 * the full array of TRAVERSE_VISITBP_MAX_DEPTH recursion frames on the
 * heap (see the stack-usage comment above), initialize frame 0, run the
 * traversal, and free the frames.
 * NOTE(review): braces, the error declaration, and the return statement
 * (original lines 327, 329-330, 334, 336, 339-340) are missing from this
 * chunk.
 */
325 traverse_visitbp(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
326 arc_buf_t
*pbuf
, blkptr_t
*bp
, const zbookmark_t
*zb
)
328 traverse_visitbp_data_t
*tv
;
/* KM_SLEEP: allocation may block but cannot fail */
331 tv
= kmem_zalloc(sizeof(traverse_visitbp_data_t
) *
332 TRAVERSE_VISITBP_MAX_DEPTH
, KM_SLEEP
);
333 __traverse_visitbp_init(tv
, td
, dnp
, pbuf
, bp
, zb
, 0);
335 error
= __traverse_visitbp(tv
);
337 kmem_free(tv
, sizeof(traverse_visitbp_data_t
) *
338 TRAVERSE_VISITBP_MAX_DEPTH
);
/*
 * Visit every block pointer of one on-disk dnode: each of its
 * dn_nblkptr top-level pointers (at level dn_nlevels - 1), plus the
 * spill block pointer if DNODE_FLAG_SPILL_BLKPTR is set.  In
 * TRAVERSE_HARD mode errors are presumably deferred into lasterr so the
 * remaining pointers are still visited — TODO confirm; the hard-error
 * handling lines are missing from this chunk.
 * NOTE(review): braces, the czb declaration, and the error-checking
 * lines (original lines 346, 348, 350, 355-361, 367-372, 374) are
 * missing here.
 */
344 traverse_dnode(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
345 arc_buf_t
*buf
, uint64_t objset
, uint64_t object
)
347 int j
, err
= 0, lasterr
= 0;
349 boolean_t hard
= (td
->td_flags
& TRAVERSE_HARD
);
/* top-level block pointers all live at the dnode's highest level */
351 for (j
= 0; j
< dnp
->dn_nblkptr
; j
++) {
352 SET_BOOKMARK(&czb
, objset
, object
, dnp
->dn_nlevels
- 1, j
);
353 err
= traverse_visitbp(td
, dnp
, buf
,
354 (blkptr_t
*)&dnp
->dn_blkptr
[j
], &czb
);
/* system-attribute spill block, bookmarked with DMU_SPILL_BLKID */
362 if (dnp
->dn_flags
& DNODE_FLAG_SPILL_BLKPTR
) {
363 SET_BOOKMARK(&czb
, objset
,
364 object
, 0, DMU_SPILL_BLKID
);
365 err
= traverse_visitbp(td
, dnp
, buf
,
366 (blkptr_t
*)&dnp
->dn_spill
, &czb
);
373 return (err
!= 0 ? err
: lasterr
);
/*
 * Traversal callback used by the prefetch thread: for each interesting
 * block (dnode blocks, indirect blocks, or all data when
 * TRAVERSE_PREFETCH_DATA is set — but never intent-log blocks), wait
 * until the in-flight budget (pd_blks_max) has room, bump the shared
 * counter, and issue a speculative, non-blocking ARC read.
 * NOTE(review): braces, part of the signature (the arg parameter at
 * original lines 380-381), the cancel check, return statements, and the
 * dsl_read() tail (original lines 384, 386-388, 392-393, 400, 404-406)
 * are missing from this chunk.
 */
378 traverse_prefetcher(spa_t
*spa
, zilog_t
*zilog
, const blkptr_t
*bp
,
379 arc_buf_t
*pbuf
, const zbookmark_t
*zb
, const dnode_phys_t
*dnp
,
382 prefetch_data_t
*pfd
= arg
;
/* fire-and-forget read: no callback, no waiting */
383 uint32_t aflags
= ARC_NOWAIT
| ARC_PREFETCH
;
385 ASSERT(pfd
->pd_blks_fetched
>= 0);
/* only prefetch block types the consumer will actually throttle on */
389 if (bp
== NULL
|| !((pfd
->pd_flags
& TRAVERSE_PREFETCH_DATA
) ||
390 BP_GET_TYPE(bp
) == DMU_OT_DNODE
|| BP_GET_LEVEL(bp
) > 0) ||
391 BP_GET_TYPE(bp
) == DMU_OT_INTENT_LOG
)
/* block until the consumer frees budget, or the traversal cancels us */
394 mutex_enter(&pfd
->pd_mtx
);
395 while (!pfd
->pd_cancel
&& pfd
->pd_blks_fetched
>= pfd
->pd_blks_max
)
396 cv_wait(&pfd
->pd_cv
, &pfd
->pd_mtx
);
397 pfd
->pd_blks_fetched
++;
398 cv_broadcast(&pfd
->pd_cv
);
399 mutex_exit(&pfd
->pd_mtx
);
401 (void) dsl_read(NULL
, spa
, bp
, pbuf
, NULL
, NULL
,
402 ZIO_PRIORITY_ASYNC_READ
,
403 ZIO_FLAG_CANFAIL
| ZIO_FLAG_SPECULATIVE
,
/*
 * Taskq thread body dispatched from traverse_impl(): re-runs the same
 * traversal on a private copy of the traverse_data, but with
 * traverse_prefetcher() as the callback, so reads are warmed into the
 * ARC ahead of the real traversal.  On exit it sets pd_exited and wakes
 * any consumer blocked in __traverse_visitbp().
 * NOTE(review): braces and the czb declaration (original lines 411,
 * 414-415, 418-419, 423, 428) are missing from this chunk.
 */
410 traverse_prefetch_thread(void *arg
)
412 traverse_data_t
*td_main
= arg
;
/* private copy: only td_func/td_arg are redirected to the prefetcher */
413 traverse_data_t td
= *td_main
;
416 td
.td_func
= traverse_prefetcher
;
417 td
.td_arg
= td_main
->td_pfd
;
420 SET_BOOKMARK(&czb
, td
.td_objset
,
421 ZB_ROOT_OBJECT
, ZB_ROOT_LEVEL
, ZB_ROOT_BLKID
);
422 (void) traverse_visitbp(&td
, NULL
, NULL
, td
.td_rootbp
, &czb
);
/* announce exit under the lock so the consumer's cv_wait can't miss it */
424 mutex_enter(&td_main
->td_pfd
->pd_mtx
);
425 td_main
->td_pfd
->pd_exited
= B_TRUE
;
426 cv_broadcast(&td_main
->td_pfd
->pd_cv
);
427 mutex_exit(&td_main
->td_pfd
->pd_mtx
);
431 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
432 * in syncing context).
/*
 * Common implementation behind traverse_dataset() and traverse_pool():
 * set up the traverse_data and prefetch_data, optionally walk the ZIL
 * and dispatch the prefetch thread, traverse from the root block
 * pointer, then cancel and join the prefetcher and free everything.
 * The dataset must not be changing on-disk (snapshot or syncing
 * context — see the comment above this function in the full file).
 * NOTE(review): braces, the local declarations (td/pd/czb/err/os), some
 * td_* assignments, error handling after dmu_objset_from_ds(), the tail
 * of the taskq_dispatch() call, and the return statement (original
 * lines 437-442, 446-447, 451-453, 455, 460, 463-464, 466-468, 470-471,
 * 474, 476, 480, 487, 490, 494-496) are missing from this chunk.
 */
435 traverse_impl(spa_t
*spa
, dsl_dataset_t
*ds
, blkptr_t
*rootbp
,
436 uint64_t txg_start
, int flags
, blkptr_cb_t func
, void *arg
)
/* traversal state lives on the heap to keep the kernel stack small */
443 td
= kmem_alloc(sizeof(traverse_data_t
), KM_SLEEP
);
444 pd
= kmem_zalloc(sizeof(prefetch_data_t
), KM_SLEEP
);
445 czb
= kmem_alloc(sizeof(zbookmark_t
), KM_SLEEP
);
/* objset 0 denotes a whole-pool (MOS-rooted) traversal */
448 td
->td_objset
= ds
? ds
->ds_object
: 0;
449 td
->td_rootbp
= rootbp
;
450 td
->td_min_txg
= txg_start
;
454 td
->td_flags
= flags
;
456 pd
->pd_blks_max
= zfs_pd_blks_max
;
457 pd
->pd_flags
= flags
;
458 mutex_init(&pd
->pd_mtx
, NULL
, MUTEX_DEFAULT
, NULL
);
459 cv_init(&pd
->pd_cv
, NULL
, CV_DEFAULT
, NULL
);
461 /* See comment on ZIL traversal in dsl_scan_visitds. */
462 if (ds
!= NULL
&& !dsl_dataset_is_snapshot(ds
)) {
465 err
= dmu_objset_from_ds(ds
, &os
);
469 traverse_zil(td
, &os
->os_zil_header
);
/* if prefetch is off or dispatch fails, run without a prefetcher */
472 if (!(flags
& TRAVERSE_PREFETCH
) ||
473 0 == taskq_dispatch(system_taskq
, traverse_prefetch_thread
,
475 pd
->pd_exited
= B_TRUE
;
477 SET_BOOKMARK(czb
, td
->td_objset
,
478 ZB_ROOT_OBJECT
, ZB_ROOT_LEVEL
, ZB_ROOT_BLKID
);
479 err
= traverse_visitbp(td
, NULL
, NULL
, rootbp
, czb
);
/* cancel the prefetcher and wait for it to acknowledge via pd_exited */
481 mutex_enter(&pd
->pd_mtx
);
482 pd
->pd_cancel
= B_TRUE
;
483 cv_broadcast(&pd
->pd_cv
);
484 while (!pd
->pd_exited
)
485 cv_wait(&pd
->pd_cv
, &pd
->pd_mtx
);
486 mutex_exit(&pd
->pd_mtx
);
488 mutex_destroy(&pd
->pd_mtx
);
489 cv_destroy(&pd
->pd_cv
);
491 kmem_free(czb
, sizeof(zbookmark_t
));
492 kmem_free(pd
, sizeof(struct prefetch_data
));
493 kmem_free(td
, sizeof(struct traverse_data
));
499 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
500 * in syncing context).
/*
 * Exported wrapper: traverse a single dataset from its current block
 * pointer (ds_phys->ds_bp), visiting blocks born after txg_start.
 * The dataset must not be changing on-disk (snapshot or syncing
 * context).
 * NOTE(review): the function braces (original lines 505, 508) are
 * missing from this chunk.
 */
503 traverse_dataset(dsl_dataset_t
*ds
, uint64_t txg_start
, int flags
,
504 blkptr_cb_t func
, void *arg
)
506 return (traverse_impl(ds
->ds_dir
->dd_pool
->dp_spa
, ds
,
507 &ds
->ds_phys
->ds_bp
, txg_start
, flags
, func
, arg
));
511 * NB: pool must not be changing on-disk (eg, from zdb or sync context).
/*
 * Exported entry point: traverse the entire pool — first the MOS (via a
 * datasetless traverse_impl() from the root block pointer), then every
 * DSL dataset object found in the MOS.  With TRAVERSE_HARD, per-dataset
 * errors are presumably accumulated in lasterr so the walk continues —
 * TODO confirm; the hard-error lines are missing from this chunk.
 * NOTE(review): braces, the obj/ds declarations, and several
 * error-handling lines (original lines 516, 518, 522-523, 526-528,
 * 533, 535-541, 543, 545, 549-554, 559-567, 569) are missing here.
 */
514 traverse_pool(spa_t
*spa
, uint64_t txg_start
, int flags
,
515 blkptr_cb_t func
, void *arg
)
517 int err
, lasterr
= 0;
519 dsl_pool_t
*dp
= spa_get_dsl(spa
);
520 objset_t
*mos
= dp
->dp_meta_objset
;
521 boolean_t hard
= (flags
& TRAVERSE_HARD
);
/* pass 1: the meta objset itself, rooted at the pool's root bp */
524 err
= traverse_impl(spa
, NULL
, spa_get_rootblkptr(spa
),
525 txg_start
, flags
, func
, arg
);
529 /* visit each dataset */
530 for (obj
= 1; err
== 0 || (err
!= ESRCH
&& hard
);
531 err
= dmu_object_next(mos
, &obj
, FALSE
, txg_start
)) {
532 dmu_object_info_t doi
;
534 err
= dmu_object_info(mos
, obj
, &doi
);
542 if (doi
.doi_type
== DMU_OT_DSL_DATASET
) {
544 uint64_t txg
= txg_start
;
/* dp_config_rwlock guards the dataset hold, not the traversal */
546 rw_enter(&dp
->dp_config_rwlock
, RW_READER
);
547 err
= dsl_dataset_hold_obj(dp
, obj
, FTAG
, &ds
);
548 rw_exit(&dp
->dp_config_rwlock
);
/* blocks older than the previous snapshot belong to that snapshot */
555 if (ds
->ds_phys
->ds_prev_snap_txg
> txg
)
556 txg
= ds
->ds_phys
->ds_prev_snap_txg
;
557 err
= traverse_dataset(ds
, txg
, flags
, func
, arg
);
558 dsl_dataset_rele(ds
, FTAG
);
568 return (err
!= 0 ? err
: lasterr
);
/*
 * Linux kernel-module plumbing (built only with the SPL): export the two
 * public traversal entry points and expose zfs_pd_blks_max as a writable
 * (0644) module parameter.
 * NOTE(review): the matching #endif (original line 577) is missing from
 * this chunk.
 */
571 #if defined(_KERNEL) && defined(HAVE_SPL)
572 EXPORT_SYMBOL(traverse_dataset
);
573 EXPORT_SYMBOL(traverse_pool
);
575 module_param(zfs_pd_blks_max
, int, 0644);
576 MODULE_PARM_DESC(zfs_pd_blks_max
, "Max number of blocks to prefetch");