]> git.proxmox.com Git - mirror_zfs.git/blame - module/zfs/dmu_traverse.c
Illumos #3464
[mirror_zfs.git] / module / zfs / dmu_traverse.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
428870ff 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
9ae529ec 23 * Copyright (c) 2012 by Delphix. All rights reserved.
34dc7c2f
BB
24 */
25
34dc7c2f
BB
26#include <sys/zfs_context.h>
27#include <sys/dmu_objset.h>
28#include <sys/dmu_traverse.h>
29#include <sys/dsl_dataset.h>
30#include <sys/dsl_dir.h>
31#include <sys/dsl_pool.h>
32#include <sys/dnode.h>
33#include <sys/spa.h>
34#include <sys/zio.h>
35#include <sys/dmu_impl.h>
428870ff
BB
36#include <sys/sa.h>
37#include <sys/sa_impl.h>
b128c09f
BB
38#include <sys/callb.h>
39
572e2857
BB
/*
 * Max number of blocks to prefetch; copied into pd_blks_max for each
 * traversal and exposed as a module parameter.
 */
int zfs_pd_blks_max = 100;
41
42typedef struct prefetch_data {
b128c09f
BB
43 kmutex_t pd_mtx;
44 kcondvar_t pd_cv;
45 int pd_blks_max;
46 int pd_blks_fetched;
47 int pd_flags;
48 boolean_t pd_cancel;
49 boolean_t pd_exited;
572e2857 50} prefetch_data_t;
b128c09f 51
572e2857 52typedef struct traverse_data {
b128c09f
BB
53 spa_t *td_spa;
54 uint64_t td_objset;
55 blkptr_t *td_rootbp;
56 uint64_t td_min_txg;
9ae529ec 57 zbookmark_t *td_resume;
b128c09f 58 int td_flags;
572e2857 59 prefetch_data_t *td_pfd;
b128c09f
BB
60 blkptr_cb_t *td_func;
61 void *td_arg;
572e2857 62} traverse_data_t;
34dc7c2f 63
572e2857 64static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
294f6806 65 uint64_t objset, uint64_t object);
96b89346 66static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
294f6806 67 uint64_t objset, uint64_t object);
9babb374 68
428870ff 69static int
34dc7c2f
BB
70traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
71{
572e2857 72 traverse_data_t *td = arg;
b128c09f 73 zbookmark_t zb;
34dc7c2f 74
b128c09f 75 if (bp->blk_birth == 0)
428870ff 76 return (0);
34dc7c2f 77
b128c09f 78 if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
428870ff
BB
79 return (0);
80
81 SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
82 bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
83
294f6806 84 (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);
b128c09f 85
428870ff 86 return (0);
34dc7c2f
BB
87}
88
428870ff 89static int
34dc7c2f
BB
90traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
91{
572e2857 92 traverse_data_t *td = arg;
34dc7c2f
BB
93
94 if (lrc->lrc_txtype == TX_WRITE) {
95 lr_write_t *lr = (lr_write_t *)lrc;
96 blkptr_t *bp = &lr->lr_blkptr;
b128c09f 97 zbookmark_t zb;
34dc7c2f 98
b128c09f 99 if (bp->blk_birth == 0)
428870ff 100 return (0);
34dc7c2f 101
b128c09f 102 if (claim_txg == 0 || bp->blk_birth < claim_txg)
428870ff 103 return (0);
b128c09f 104
572e2857
BB
105 SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
106 ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
428870ff 107
294f6806 108 (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
428870ff 109 td->td_arg);
34dc7c2f 110 }
428870ff 111 return (0);
34dc7c2f
BB
112}
113
114static void
572e2857 115traverse_zil(traverse_data_t *td, zil_header_t *zh)
34dc7c2f 116{
34dc7c2f
BB
117 uint64_t claim_txg = zh->zh_claim_txg;
118 zilog_t *zilog;
119
34dc7c2f
BB
120 /*
121 * We only want to visit blocks that have been claimed but not yet
428870ff 122 * replayed; plus, in read-only mode, blocks that are already stable.
34dc7c2f 123 */
fb5f0bc8 124 if (claim_txg == 0 && spa_writeable(td->td_spa))
34dc7c2f
BB
125 return;
126
b128c09f 127 zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
34dc7c2f 128
b128c09f 129 (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
34dc7c2f
BB
130 claim_txg);
131
132 zil_free(zilog);
133}
134
9ae529ec
CS
/* Verdicts returned by resume_skip_check(). */
typedef enum resume_skip {
	RESUME_SKIP_ALL,	/* do not visit the block or its children */
	RESUME_SKIP_NONE,	/* visit the block normally */
	RESUME_SKIP_CHILDREN	/* visit the block, but not its children */
} resume_skip_t;
140
141/*
142 * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
143 * the block indicated by zb does not need to be visited at all. Returns
144 * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
145 * resume point. This indicates that this block should be visited but not its
146 * children (since they must have been visited in a previous traversal).
147 * Otherwise returns RESUME_SKIP_NONE.
148 */
149static resume_skip_t
150resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
151 const zbookmark_t *zb)
152{
153 if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
154 /*
155 * If we already visited this bp & everything below,
156 * don't bother doing it again.
157 */
158 if (zbookmark_is_before(dnp, zb, td->td_resume))
159 return (RESUME_SKIP_ALL);
160
161 /*
162 * If we found the block we're trying to resume from, zero
163 * the bookmark out to indicate that we have resumed.
164 */
165 ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object);
166 if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
167 bzero(td->td_resume, sizeof (*zb));
168 if (td->td_flags & TRAVERSE_POST)
169 return (RESUME_SKIP_CHILDREN);
170 }
171 }
172 return (RESUME_SKIP_NONE);
173}
174
175static void
176traverse_pause(traverse_data_t *td, const zbookmark_t *zb)
177{
178 ASSERT(td->td_resume != NULL);
c99c9001 179 ASSERT0(zb->zb_level);
9ae529ec
CS
180 bcopy(zb, td->td_resume, sizeof (*td->td_resume));
181}
182
96b89346
MA
183static void
184traverse_prefetch_metadata(traverse_data_t *td,
294f6806 185 const blkptr_t *bp, const zbookmark_t *zb)
96b89346
MA
186{
187 uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
188
189 if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
190 return;
191 /*
192 * If we are in the process of resuming, don't prefetch, because
193 * some children will not be needed (and in fact may have already
194 * been freed).
195 */
196 if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
197 return;
198 if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
199 return;
200 if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
201 return;
202
294f6806
GW
203 (void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
204 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
96b89346
MA
205}
206
c7f8f831
BB
207static int
208traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
294f6806 209 const blkptr_t *bp, const zbookmark_t *zb)
6656bf56 210{
c7f8f831
BB
211 zbookmark_t czb;
212 int err = 0, lasterr = 0;
213 arc_buf_t *buf = NULL;
214 prefetch_data_t *pd = td->td_pfd;
215 boolean_t hard = td->td_flags & TRAVERSE_HARD;
9ae529ec
CS
216 boolean_t pause = B_FALSE;
217
218 switch (resume_skip_check(td, dnp, zb)) {
219 case RESUME_SKIP_ALL:
220 return (0);
221 case RESUME_SKIP_CHILDREN:
222 goto post;
223 case RESUME_SKIP_NONE:
224 break;
225 default:
226 ASSERT(0);
227 }
6656bf56 228
9ae529ec 229 if (BP_IS_HOLE(bp)) {
294f6806 230 err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg);
c7f8f831 231 return (err);
34dc7c2f
BB
232 }
233
c7f8f831 234 if (bp->blk_birth <= td->td_min_txg)
b128c09f 235 return (0);
34dc7c2f 236
c7f8f831
BB
237 if (pd && !pd->pd_exited &&
238 ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
239 BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
240 mutex_enter(&pd->pd_mtx);
241 ASSERT(pd->pd_blks_fetched >= 0);
242 while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
243 cv_wait(&pd->pd_cv, &pd->pd_mtx);
244 pd->pd_blks_fetched--;
245 cv_broadcast(&pd->pd_cv);
246 mutex_exit(&pd->pd_mtx);
34dc7c2f
BB
247 }
248
c7f8f831 249 if (td->td_flags & TRAVERSE_PRE) {
294f6806 250 err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
c7f8f831
BB
251 td->td_arg);
252 if (err == TRAVERSE_VISIT_NO_CHILDREN)
572e2857 253 return (0);
9ae529ec
CS
254 if (err == ERESTART)
255 pause = B_TRUE; /* handle pausing at a common point */
256 if (err != 0)
257 goto post;
34dc7c2f
BB
258 }
259
c7f8f831
BB
260 if (BP_GET_LEVEL(bp) > 0) {
261 uint32_t flags = ARC_WAIT;
262 int i;
263 blkptr_t *cbp;
264 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
b128c09f 265
294f6806 266 err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
c7f8f831 267 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
13fe0198 268 if (err != 0)
c7f8f831 269 return (err);
96b89346
MA
270 cbp = buf->b_data;
271
272 for (i = 0; i < epb; i++) {
273 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
274 zb->zb_level - 1,
275 zb->zb_blkid * epb + i);
294f6806 276 traverse_prefetch_metadata(td, &cbp[i], &czb);
96b89346 277 }
b128c09f
BB
278
279 /* recursively visitbp() blocks below this */
96b89346 280 for (i = 0; i < epb; i++) {
c7f8f831
BB
281 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
282 zb->zb_level - 1,
283 zb->zb_blkid * epb + i);
294f6806 284 err = traverse_visitbp(td, dnp, &cbp[i], &czb);
13fe0198 285 if (err != 0) {
c7f8f831 286 if (!hard)
428870ff 287 break;
c7f8f831 288 lasterr = err;
428870ff 289 }
b128c09f 290 }
c7f8f831
BB
291 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
292 uint32_t flags = ARC_WAIT;
293 int i;
294 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
b128c09f 295
294f6806 296 err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
c7f8f831 297 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
13fe0198 298 if (err != 0)
c7f8f831 299 return (err);
96b89346
MA
300 dnp = buf->b_data;
301
302 for (i = 0; i < epb; i++) {
294f6806 303 prefetch_dnode_metadata(td, &dnp[i], zb->zb_objset,
96b89346
MA
304 zb->zb_blkid * epb + i);
305 }
b128c09f
BB
306
307 /* recursively visitbp() blocks below this */
96b89346 308 for (i = 0; i < epb; i++) {
294f6806 309 err = traverse_dnode(td, &dnp[i], zb->zb_objset,
c7f8f831 310 zb->zb_blkid * epb + i);
13fe0198 311 if (err != 0) {
c7f8f831 312 if (!hard)
428870ff 313 break;
c7f8f831 314 lasterr = err;
428870ff 315 }
34dc7c2f 316 }
c7f8f831
BB
317 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
318 uint32_t flags = ARC_WAIT;
319 objset_phys_t *osp;
320 dnode_phys_t *dnp;
321
294f6806 322 err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
c7f8f831 323 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
13fe0198 324 if (err != 0)
c7f8f831
BB
325 return (err);
326
327 osp = buf->b_data;
328 dnp = &osp->os_meta_dnode;
294f6806 329 prefetch_dnode_metadata(td, dnp, zb->zb_objset,
96b89346
MA
330 DMU_META_DNODE_OBJECT);
331 if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
332 prefetch_dnode_metadata(td, &osp->os_userused_dnode,
294f6806 333 zb->zb_objset, DMU_USERUSED_OBJECT);
96b89346 334 prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
294f6806 335 zb->zb_objset, DMU_USERUSED_OBJECT);
96b89346
MA
336 }
337
294f6806 338 err = traverse_dnode(td, dnp, zb->zb_objset,
c7f8f831
BB
339 DMU_META_DNODE_OBJECT);
340 if (err && hard) {
341 lasterr = err;
342 err = 0;
428870ff 343 }
c7f8f831
BB
344 if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
345 dnp = &osp->os_userused_dnode;
294f6806 346 err = traverse_dnode(td, dnp, zb->zb_objset,
9babb374
BB
347 DMU_USERUSED_OBJECT);
348 }
c7f8f831
BB
349 if (err && hard) {
350 lasterr = err;
351 err = 0;
428870ff 352 }
c7f8f831
BB
353 if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
354 dnp = &osp->os_groupused_dnode;
294f6806 355 err = traverse_dnode(td, dnp, zb->zb_objset,
9babb374 356 DMU_GROUPUSED_OBJECT);
34dc7c2f 357 }
34dc7c2f
BB
358 }
359
c7f8f831
BB
360 if (buf)
361 (void) arc_buf_remove_ref(buf, &buf);
34dc7c2f 362
9ae529ec 363post:
c7f8f831 364 if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) {
294f6806 365 err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
9ae529ec
CS
366 if (err == ERESTART)
367 pause = B_TRUE;
368 }
369
370 if (pause && td->td_resume != NULL) {
371 ASSERT3U(err, ==, ERESTART);
372 ASSERT(!hard);
373 traverse_pause(td, zb);
428870ff 374 }
34dc7c2f 375
c7f8f831 376 return (err != 0 ? err : lasterr);
34dc7c2f
BB
377}
378
96b89346
MA
379static void
380prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
294f6806 381 uint64_t objset, uint64_t object)
96b89346
MA
382{
383 int j;
384 zbookmark_t czb;
385
386 for (j = 0; j < dnp->dn_nblkptr; j++) {
387 SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
294f6806 388 traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb);
96b89346
MA
389 }
390
391 if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
392 SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
294f6806 393 traverse_prefetch_metadata(td, &dnp->dn_spill, &czb);
96b89346
MA
394 }
395}
396
9babb374 397static int
572e2857 398traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
294f6806 399 uint64_t objset, uint64_t object)
9babb374 400{
428870ff 401 int j, err = 0, lasterr = 0;
9babb374 402 zbookmark_t czb;
428870ff 403 boolean_t hard = (td->td_flags & TRAVERSE_HARD);
9babb374
BB
404
405 for (j = 0; j < dnp->dn_nblkptr; j++) {
406 SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
294f6806 407 err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
13fe0198 408 if (err != 0) {
428870ff
BB
409 if (!hard)
410 break;
411 lasterr = err;
412 }
9babb374 413 }
428870ff
BB
414
415 if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
96b89346 416 SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
294f6806 417 err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
13fe0198 418 if (err != 0) {
428870ff
BB
419 if (!hard)
420 return (err);
421 lasterr = err;
422 }
423 }
424 return (err != 0 ? err : lasterr);
9babb374
BB
425}
426
b128c09f
BB
427/* ARGSUSED */
428static int
428870ff 429traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
294f6806 430 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
34dc7c2f 431{
572e2857 432 prefetch_data_t *pfd = arg;
b128c09f 433 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
34dc7c2f 434
b128c09f
BB
435 ASSERT(pfd->pd_blks_fetched >= 0);
436 if (pfd->pd_cancel)
437 return (EINTR);
34dc7c2f 438
b128c09f 439 if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
428870ff
BB
440 BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
441 BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
34dc7c2f
BB
442 return (0);
443
b128c09f
BB
444 mutex_enter(&pfd->pd_mtx);
445 while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
446 cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
447 pfd->pd_blks_fetched++;
448 cv_broadcast(&pfd->pd_cv);
449 mutex_exit(&pfd->pd_mtx);
34dc7c2f 450
294f6806
GW
451 (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
452 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb);
34dc7c2f 453
b128c09f 454 return (0);
34dc7c2f
BB
455}
456
34dc7c2f 457static void
b128c09f 458traverse_prefetch_thread(void *arg)
34dc7c2f 459{
572e2857
BB
460 traverse_data_t *td_main = arg;
461 traverse_data_t td = *td_main;
b128c09f 462 zbookmark_t czb;
34dc7c2f 463
b128c09f
BB
464 td.td_func = traverse_prefetcher;
465 td.td_arg = td_main->td_pfd;
466 td.td_pfd = NULL;
34dc7c2f 467
428870ff
BB
468 SET_BOOKMARK(&czb, td.td_objset,
469 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
294f6806 470 (void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);
34dc7c2f 471
b128c09f
BB
472 mutex_enter(&td_main->td_pfd->pd_mtx);
473 td_main->td_pfd->pd_exited = B_TRUE;
474 cv_broadcast(&td_main->td_pfd->pd_cv);
475 mutex_exit(&td_main->td_pfd->pd_mtx);
34dc7c2f
BB
476}
477
b128c09f
BB
478/*
479 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
480 * in syncing context).
481 */
482static int
9ae529ec
CS
483traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
484 uint64_t txg_start, zbookmark_t *resume, int flags,
485 blkptr_cb_t func, void *arg)
34dc7c2f 486{
47050a88
BB
487 traverse_data_t *td;
488 prefetch_data_t *pd;
489 zbookmark_t *czb;
b128c09f 490 int err;
34dc7c2f 491
9ae529ec
CS
492 ASSERT(ds == NULL || objset == ds->ds_object);
493 ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
494
96b89346
MA
495 /*
496 * The data prefetching mechanism (the prefetch thread) is incompatible
497 * with resuming from a bookmark.
498 */
499 ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA));
500
b8d06fca
RY
501 td = kmem_alloc(sizeof(traverse_data_t), KM_PUSHPAGE);
502 pd = kmem_zalloc(sizeof(prefetch_data_t), KM_PUSHPAGE);
503 czb = kmem_alloc(sizeof(zbookmark_t), KM_PUSHPAGE);
47050a88
BB
504
505 td->td_spa = spa;
9ae529ec 506 td->td_objset = objset;
47050a88
BB
507 td->td_rootbp = rootbp;
508 td->td_min_txg = txg_start;
9ae529ec 509 td->td_resume = resume;
47050a88
BB
510 td->td_func = func;
511 td->td_arg = arg;
512 td->td_pfd = pd;
513 td->td_flags = flags;
b128c09f 514
47050a88
BB
515 pd->pd_blks_max = zfs_pd_blks_max;
516 pd->pd_flags = flags;
517 mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
518 cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);
b128c09f 519
572e2857 520 /* See comment on ZIL traversal in dsl_scan_visitds. */
13fe0198
MA
521 if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) {
522 uint32_t flags = ARC_WAIT;
523 objset_phys_t *osp;
524 arc_buf_t *buf;
572e2857 525
13fe0198
MA
526 err = arc_read(NULL, td->td_spa, rootbp,
527 arc_getbuf_func, &buf,
528 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL);
529 if (err != 0)
572e2857
BB
530 return (err);
531
13fe0198
MA
532 osp = buf->b_data;
533 traverse_zil(td, &osp->os_zil_header);
534 (void) arc_buf_remove_ref(buf, &buf);
572e2857
BB
535 }
536
96b89346 537 if (!(flags & TRAVERSE_PREFETCH_DATA) ||
b128c09f 538 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
47050a88
BB
539 td, TQ_NOQUEUE))
540 pd->pd_exited = B_TRUE;
b128c09f 541
47050a88 542 SET_BOOKMARK(czb, td->td_objset,
428870ff 543 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
294f6806 544 err = traverse_visitbp(td, NULL, rootbp, czb);
47050a88
BB
545
546 mutex_enter(&pd->pd_mtx);
547 pd->pd_cancel = B_TRUE;
548 cv_broadcast(&pd->pd_cv);
549 while (!pd->pd_exited)
550 cv_wait(&pd->pd_cv, &pd->pd_mtx);
551 mutex_exit(&pd->pd_mtx);
b128c09f 552
47050a88
BB
553 mutex_destroy(&pd->pd_mtx);
554 cv_destroy(&pd->pd_cv);
b128c09f 555
47050a88
BB
556 kmem_free(czb, sizeof(zbookmark_t));
557 kmem_free(pd, sizeof(struct prefetch_data));
558 kmem_free(td, sizeof(struct traverse_data));
34dc7c2f 559
b128c09f 560 return (err);
34dc7c2f
BB
561}
562
b128c09f
BB
563/*
564 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
565 * in syncing context).
566 */
567int
568traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
569 blkptr_cb_t func, void *arg)
34dc7c2f 570{
9ae529ec
CS
571 return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
572 &ds->ds_phys->ds_bp, txg_start, NULL, flags, func, arg));
573}
574
575int
576traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
577 uint64_t txg_start, zbookmark_t *resume, int flags,
578 blkptr_cb_t func, void *arg)
579{
580 return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
581 blkptr, txg_start, resume, flags, func, arg));
34dc7c2f
BB
582}
583
b128c09f
BB
584/*
585 * NB: pool must not be changing on-disk (eg, from zdb or sync context).
586 */
587int
428870ff
BB
588traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
589 blkptr_cb_t func, void *arg)
34dc7c2f 590{
428870ff 591 int err, lasterr = 0;
b128c09f
BB
592 uint64_t obj;
593 dsl_pool_t *dp = spa_get_dsl(spa);
594 objset_t *mos = dp->dp_meta_objset;
428870ff 595 boolean_t hard = (flags & TRAVERSE_HARD);
b128c09f
BB
596
597 /* visit the MOS */
9ae529ec
CS
598 err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
599 txg_start, NULL, flags, func, arg);
13fe0198 600 if (err != 0)
b128c09f
BB
601 return (err);
602
603 /* visit each dataset */
428870ff
BB
604 for (obj = 1; err == 0 || (err != ESRCH && hard);
605 err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
b128c09f
BB
606 dmu_object_info_t doi;
607
608 err = dmu_object_info(mos, obj, &doi);
13fe0198 609 if (err != 0) {
428870ff
BB
610 if (!hard)
611 return (err);
612 lasterr = err;
613 continue;
614 }
b128c09f
BB
615
616 if (doi.doi_type == DMU_OT_DSL_DATASET) {
617 dsl_dataset_t *ds;
428870ff
BB
618 uint64_t txg = txg_start;
619
13fe0198 620 dsl_pool_config_enter(dp, FTAG);
b128c09f 621 err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
13fe0198
MA
622 dsl_pool_config_exit(dp, FTAG);
623 if (err != 0) {
428870ff
BB
624 if (!hard)
625 return (err);
626 lasterr = err;
627 continue;
628 }
629 if (ds->ds_phys->ds_prev_snap_txg > txg)
630 txg = ds->ds_phys->ds_prev_snap_txg;
631 err = traverse_dataset(ds, txg, flags, func, arg);
b128c09f 632 dsl_dataset_rele(ds, FTAG);
13fe0198 633 if (err != 0) {
428870ff
BB
634 if (!hard)
635 return (err);
636 lasterr = err;
637 }
b128c09f 638 }
34dc7c2f 639 }
b128c09f
BB
640 if (err == ESRCH)
641 err = 0;
428870ff 642 return (err != 0 ? err : lasterr);
34dc7c2f 643}
c28b2279
BB
644
/* Kernel module exports and tunables (SPL builds only). */
#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(traverse_dataset);
EXPORT_SYMBOL(traverse_pool);

module_param(zfs_pd_blks_max, int, 0644);
MODULE_PARM_DESC(zfs_pd_blks_max, "Max number of blocks to prefetch");
#endif