/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dnode.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/callb.h>

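/*
 * Upper bound on the number of blocks the prefetch thread may have
 * fetched ahead of the main traversal thread; exposed below as the
 * zfs_pd_blks_max module parameter.
 */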
int zfs_pd_blks_max = 100;

typedef struct prefetch_data {
	kmutex_t pd_mtx;
	kcondvar_t pd_cv;
	int pd_blks_max;
	int pd_blks_fetched;
	int pd_flags;
	boolean_t pd_cancel;
	boolean_t pd_exited;
} prefetch_data_t;

typedef struct traverse_data {
	spa_t *td_spa;
	uint64_t td_objset;
	blkptr_t *td_rootbp;
	uint64_t td_min_txg;
	int td_flags;
	prefetch_data_t *td_pfd;
	blkptr_cb_t *td_func;
	void *td_arg;
} traverse_data_t;

typedef struct traverse_visitbp_data {
	/* Function arguments */
	traverse_data_t *tv_td;
	const dnode_phys_t *tv_dnp;
	arc_buf_t *tv_pbuf;
	blkptr_t *tv_bp;
	const zbookmark_t *tv_zb;
	/* Local variables */
	prefetch_data_t *tv_pd;
	zbookmark_t tv_czb;
	arc_buf_t *tv_buf;
	boolean_t tv_hard;
	objset_phys_t *tv_osp;
	dnode_phys_t *tv_ldnp;
	blkptr_t *tv_cbp;
	uint32_t tv_flags;
	int tv_err;
	int tv_lasterr;
	int tv_i;
	int tv_epb;
	int tv_depth;
} traverse_visitbp_data_t;

static inline int traverse_visitbp(traverse_data_t *td,
    const dnode_phys_t *dnp, arc_buf_t *pbuf, blkptr_t *bp,
    const zbookmark_t *zb);
static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
    arc_buf_t *buf, uint64_t objset, uint64_t object);

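/*
 * Callback invoked by zil_parse() for each allocated log block; eligible
 * blocks are handed to the traversal callback under a ZIL bookmark.
 */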
static int
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
	traverse_data_t *td = arg;
	zbookmark_t zb;

	if (bp->blk_birth == 0)
		return (0);

	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
		return (0);

	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	(void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg);

	return (0);
}

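/*
 * Callback invoked by zil_parse() for each log record; for TX_WRITE
 * records it visits the embedded block pointer, provided the block was
 * born at or after the claim txg.
 */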
static int
traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
	traverse_data_t *td = arg;

	if (lrc->lrc_txtype == TX_WRITE) {
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;
		zbookmark_t zb;

		if (bp->blk_birth == 0)
			return (0);

		if (claim_txg == 0 || bp->blk_birth < claim_txg)
			return (0);

		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
		    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));

		(void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL,
		    td->td_arg);
	}
	return (0);
}

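/*
 * Walk a dataset's intent log, feeding each claimed log block and
 * record through the traversal callback.
 */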
static void
traverse_zil(traverse_data_t *td, zil_header_t *zh)
{
	uint64_t claim_txg = zh->zh_claim_txg;
	zilog_t *zilog;

	/*
	 * We only want to visit blocks that have been claimed but not yet
	 * replayed; plus, in read-only mode, blocks that are already stable.
	 */
	if (claim_txg == 0 && spa_writeable(td->td_spa))
		return;

	zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);

	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
	    claim_txg);

	zil_free(zilog);
}

#define	TRAVERSE_VISITBP_MAX_DEPTH	20

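/*
 * Seed one level of the heap-allocated recursion state with the
 * arguments that a direct call to traverse_visitbp() would have taken.
 */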
static void
__traverse_visitbp_init(traverse_visitbp_data_t *tv,
    traverse_data_t *td, const dnode_phys_t *dnp,
    arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb, int depth)
{
	tv->tv_td = td;
	tv->tv_dnp = dnp;
	tv->tv_pbuf = pbuf;
	tv->tv_bp = bp;
	tv->tv_zb = zb;
	tv->tv_err = 0;
	tv->tv_lasterr = 0;
	tv->tv_buf = NULL;
	tv->tv_pd = td->td_pfd;
	tv->tv_hard = td->td_flags & TRAVERSE_HARD;
	tv->tv_flags = ARC_WAIT;
	tv->tv_depth = depth;
}

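/*
 * Core of the traversal: visit one block pointer and, depending on its
 * type, recurse into indirect blocks, dnode arrays, or objset blocks.
 * Recursion depth is bounded by TRAVERSE_VISITBP_MAX_DEPTH; each level
 * uses the next traverse_visitbp_data_t slot (tv + 1) in place of
 * fresh stack locals.
 */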
static noinline int
__traverse_visitbp(traverse_visitbp_data_t *tv)
{
	ASSERT3S(tv->tv_depth, <, TRAVERSE_VISITBP_MAX_DEPTH);

	if (tv->tv_bp->blk_birth == 0) {
		tv->tv_err = tv->tv_td->td_func(tv->tv_td->td_spa, NULL, NULL,
		    tv->tv_pbuf, tv->tv_zb, tv->tv_dnp, tv->tv_td->td_arg);
		return (tv->tv_err);
	}

	if (tv->tv_bp->blk_birth <= tv->tv_td->td_min_txg)
		return (0);

	/*
	 * Pace the prefetch thread: wait until it has fetched at least
	 * one block we care about, then consume that credit.
	 */
	if (tv->tv_pd && !tv->tv_pd->pd_exited &&
	    ((tv->tv_pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
	    BP_GET_TYPE(tv->tv_bp) == DMU_OT_DNODE ||
	    BP_GET_LEVEL(tv->tv_bp) > 0)) {
		mutex_enter(&tv->tv_pd->pd_mtx);
		ASSERT(tv->tv_pd->pd_blks_fetched >= 0);
		while (tv->tv_pd->pd_blks_fetched == 0 && !tv->tv_pd->pd_exited)
			cv_wait(&tv->tv_pd->pd_cv, &tv->tv_pd->pd_mtx);
		tv->tv_pd->pd_blks_fetched--;
		cv_broadcast(&tv->tv_pd->pd_cv);
		mutex_exit(&tv->tv_pd->pd_mtx);
	}

	if (tv->tv_td->td_flags & TRAVERSE_PRE) {
		tv->tv_err = tv->tv_td->td_func(tv->tv_td->td_spa, NULL,
		    tv->tv_bp, tv->tv_pbuf, tv->tv_zb, tv->tv_dnp,
		    tv->tv_td->td_arg);
		if (tv->tv_err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (tv->tv_err)
			return (tv->tv_err);
	}

	if (BP_GET_LEVEL(tv->tv_bp) > 0) {
		tv->tv_epb = BP_GET_LSIZE(tv->tv_bp) >> SPA_BLKPTRSHIFT;

		tv->tv_err = dsl_read(NULL, tv->tv_td->td_spa, tv->tv_bp,
		    tv->tv_pbuf, arc_getbuf_func, &tv->tv_buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &tv->tv_flags, tv->tv_zb);
		if (tv->tv_err)
			return (tv->tv_err);

		/* recursively visitbp() blocks below this */
		tv->tv_cbp = tv->tv_buf->b_data;
		for (tv->tv_i = 0; tv->tv_i < tv->tv_epb;
		    tv->tv_i++, tv->tv_cbp++) {
			SET_BOOKMARK(&tv->tv_czb, tv->tv_zb->zb_objset,
			    tv->tv_zb->zb_object, tv->tv_zb->zb_level - 1,
			    tv->tv_zb->zb_blkid * tv->tv_epb + tv->tv_i);
			__traverse_visitbp_init(tv + 1, tv->tv_td,
			    tv->tv_dnp, tv->tv_buf, tv->tv_cbp,
			    &tv->tv_czb, tv->tv_depth + 1);
			tv->tv_err = __traverse_visitbp(tv + 1);
			if (tv->tv_err) {
				if (!tv->tv_hard)
					break;
				tv->tv_lasterr = tv->tv_err;
			}
		}
	} else if (BP_GET_TYPE(tv->tv_bp) == DMU_OT_DNODE) {
		tv->tv_epb = BP_GET_LSIZE(tv->tv_bp) >> DNODE_SHIFT;

		tv->tv_err = dsl_read(NULL, tv->tv_td->td_spa, tv->tv_bp,
		    tv->tv_pbuf, arc_getbuf_func, &tv->tv_buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &tv->tv_flags, tv->tv_zb);
		if (tv->tv_err)
			return (tv->tv_err);

		/* recursively visitbp() blocks below this */
		tv->tv_dnp = tv->tv_buf->b_data;
		for (tv->tv_i = 0; tv->tv_i < tv->tv_epb;
		    tv->tv_i++, tv->tv_dnp++) {
			tv->tv_err = traverse_dnode(tv->tv_td, tv->tv_dnp,
			    tv->tv_buf, tv->tv_zb->zb_objset,
			    tv->tv_zb->zb_blkid * tv->tv_epb + tv->tv_i);
			if (tv->tv_err) {
				if (!tv->tv_hard)
					break;
				tv->tv_lasterr = tv->tv_err;
			}
		}
	} else if (BP_GET_TYPE(tv->tv_bp) == DMU_OT_OBJSET) {
		tv->tv_err = dsl_read_nolock(NULL, tv->tv_td->td_spa,
		    tv->tv_bp, arc_getbuf_func, &tv->tv_buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &tv->tv_flags, tv->tv_zb);
		if (tv->tv_err)
			return (tv->tv_err);

		/* Visit the meta dnode, then the {user,group}used dnodes. */
		tv->tv_osp = tv->tv_buf->b_data;
		tv->tv_ldnp = &tv->tv_osp->os_meta_dnode;
		tv->tv_err = traverse_dnode(tv->tv_td, tv->tv_ldnp, tv->tv_buf,
		    tv->tv_zb->zb_objset, DMU_META_DNODE_OBJECT);
		if (tv->tv_err && tv->tv_hard) {
			tv->tv_lasterr = tv->tv_err;
			tv->tv_err = 0;
		}
		if (tv->tv_err == 0 &&
		    arc_buf_size(tv->tv_buf) >= sizeof (objset_phys_t)) {
			tv->tv_ldnp = &tv->tv_osp->os_userused_dnode;
			tv->tv_err = traverse_dnode(tv->tv_td, tv->tv_ldnp,
			    tv->tv_buf, tv->tv_zb->zb_objset,
			    DMU_USERUSED_OBJECT);
		}
		if (tv->tv_err && tv->tv_hard) {
			tv->tv_lasterr = tv->tv_err;
			tv->tv_err = 0;
		}
		if (tv->tv_err == 0 &&
		    arc_buf_size(tv->tv_buf) >= sizeof (objset_phys_t)) {
			tv->tv_ldnp = &tv->tv_osp->os_groupused_dnode;
			tv->tv_err = traverse_dnode(tv->tv_td, tv->tv_ldnp,
			    tv->tv_buf, tv->tv_zb->zb_objset,
			    DMU_GROUPUSED_OBJECT);
		}
	}

	if (tv->tv_buf)
		(void) arc_buf_remove_ref(tv->tv_buf, &tv->tv_buf);

	if (tv->tv_err == 0 && tv->tv_lasterr == 0 &&
	    (tv->tv_td->td_flags & TRAVERSE_POST)) {
		tv->tv_err = tv->tv_td->td_func(tv->tv_td->td_spa, NULL,
		    tv->tv_bp, tv->tv_pbuf, tv->tv_zb, tv->tv_dnp,
		    tv->tv_td->td_arg);
	}

	return (tv->tv_err != 0 ? tv->tv_err : tv->tv_lasterr);
}

/*
 * Due to limited stack space recursive functions are frowned upon in
 * the Linux kernel.  However, they often are the most elegant solution
 * to a problem.  The following code preserves the recursive function
 * traverse_visitbp() but moves the local variables AND function
 * arguments to the heap to minimize the stack frame size.  Enough
 * space is initially allocated on the heap for
 * TRAVERSE_VISITBP_MAX_DEPTH (20) levels of recursion.  This change
 * does ugly up the code but it reduces the worst case stack usage
 * from roughly 2496 bytes to 576 bytes on x86_64 archs.
 */
static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
    arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
{
	traverse_visitbp_data_t *tv;
	int error;

	tv = kmem_zalloc(sizeof (traverse_visitbp_data_t) *
	    TRAVERSE_VISITBP_MAX_DEPTH, KM_SLEEP);
	__traverse_visitbp_init(tv, td, dnp, pbuf, bp, zb, 0);

	error = __traverse_visitbp(tv);

	kmem_free(tv, sizeof (traverse_visitbp_data_t) *
	    TRAVERSE_VISITBP_MAX_DEPTH);

	return (error);
}

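/*
 * Visit every block pointer in a dnode: each of its dn_nblkptr
 * top-level block pointers and, if present, its spill block.
 */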
static int
traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
    arc_buf_t *buf, uint64_t objset, uint64_t object)
{
	int j, err = 0, lasterr = 0;
	zbookmark_t czb;
	boolean_t hard = (td->td_flags & TRAVERSE_HARD);

	for (j = 0; j < dnp->dn_nblkptr; j++) {
		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
		err = traverse_visitbp(td, dnp, buf,
		    (blkptr_t *)&dnp->dn_blkptr[j], &czb);
		if (err) {
			if (!hard)
				break;
			lasterr = err;
		}
	}

	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
		err = traverse_visitbp(td, dnp, buf,
		    (blkptr_t *)&dnp->dn_spill, &czb);
		if (err) {
			if (!hard)
				return (err);
			lasterr = err;
		}
	}
	return (err != 0 ? err : lasterr);
}

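/*
 * Traversal callback used by the prefetch thread: issues speculative,
 * non-blocking ARC reads, throttled to pd_blks_max outstanding blocks.
 */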
/* ARGSUSED */
static int
traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp,
    void *arg)
{
	prefetch_data_t *pfd = arg;
	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;

	ASSERT(pfd->pd_blks_fetched >= 0);
	if (pfd->pd_cancel)
		return (EINTR);

	if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
		return (0);

	mutex_enter(&pfd->pd_mtx);
	while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
		cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
	pfd->pd_blks_fetched++;
	cv_broadcast(&pfd->pd_cv);
	mutex_exit(&pfd->pd_mtx);

	(void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL,
	    ZIO_PRIORITY_ASYNC_READ,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
	    &aflags, zb);

	return (0);
}

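/*
 * Body of the prefetch thread: repeats the same traversal as the main
 * thread, but with traverse_prefetcher() as the callback, then signals
 * completion through pd_exited.
 */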
static void
traverse_prefetch_thread(void *arg)
{
	traverse_data_t *td_main = arg;
	traverse_data_t td = *td_main;
	zbookmark_t czb;

	td.td_func = traverse_prefetcher;
	td.td_arg = td_main->td_pfd;
	td.td_pfd = NULL;

	SET_BOOKMARK(&czb, td.td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	(void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);

	mutex_enter(&td_main->td_pfd->pd_mtx);
	td_main->td_pfd->pd_exited = B_TRUE;
	cv_broadcast(&td_main->td_pfd->pd_cv);
	mutex_exit(&td_main->td_pfd->pd_mtx);
}

/*
 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 * in syncing context).
 */
static int
traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp,
    uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
{
	traverse_data_t *td;
	prefetch_data_t *pd;
	zbookmark_t *czb;
	int err;

	td = kmem_alloc(sizeof (traverse_data_t), KM_SLEEP);
	pd = kmem_zalloc(sizeof (prefetch_data_t), KM_SLEEP);
	czb = kmem_alloc(sizeof (zbookmark_t), KM_SLEEP);

	td->td_spa = spa;
	td->td_objset = ds ? ds->ds_object : 0;
	td->td_rootbp = rootbp;
	td->td_min_txg = txg_start;
	td->td_func = func;
	td->td_arg = arg;
	td->td_pfd = pd;
	td->td_flags = flags;

	pd->pd_blks_max = zfs_pd_blks_max;
	pd->pd_flags = flags;
	mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);

	/* See comment on ZIL traversal in dsl_scan_visitds. */
	if (ds != NULL && !dsl_dataset_is_snapshot(ds)) {
		objset_t *os;

		err = dmu_objset_from_ds(ds, &os);
		if (err) {
			/* release the traversal state before bailing out */
			mutex_destroy(&pd->pd_mtx);
			cv_destroy(&pd->pd_cv);
			kmem_free(czb, sizeof (zbookmark_t));
			kmem_free(pd, sizeof (prefetch_data_t));
			kmem_free(td, sizeof (traverse_data_t));
			return (err);
		}

		traverse_zil(td, &os->os_zil_header);
	}

	if (!(flags & TRAVERSE_PREFETCH) ||
	    0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
	    td, TQ_NOQUEUE))
		pd->pd_exited = B_TRUE;

	SET_BOOKMARK(czb, td->td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	err = traverse_visitbp(td, NULL, NULL, rootbp, czb);

	mutex_enter(&pd->pd_mtx);
	pd->pd_cancel = B_TRUE;
	cv_broadcast(&pd->pd_cv);
	while (!pd->pd_exited)
		cv_wait(&pd->pd_cv, &pd->pd_mtx);
	mutex_exit(&pd->pd_mtx);

	mutex_destroy(&pd->pd_mtx);
	cv_destroy(&pd->pd_cv);

	kmem_free(czb, sizeof (zbookmark_t));
	kmem_free(pd, sizeof (prefetch_data_t));
	kmem_free(td, sizeof (traverse_data_t));

	return (err);
}

/*
 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 * in syncing context).
 */
int
traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
    blkptr_cb_t func, void *arg)
{
	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds,
	    &ds->ds_phys->ds_bp, txg_start, flags, func, arg));
}

/*
 * NB: pool must not be changing on-disk (eg, from zdb or sync context).
 */
int
traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
    blkptr_cb_t func, void *arg)
{
	int err, lasterr = 0;
	uint64_t obj;
	dsl_pool_t *dp = spa_get_dsl(spa);
	objset_t *mos = dp->dp_meta_objset;
	boolean_t hard = (flags & TRAVERSE_HARD);

	/* visit the MOS */
	err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa),
	    txg_start, flags, func, arg);
	if (err)
		return (err);

	/* visit each dataset */
	for (obj = 1; err == 0 || (err != ESRCH && hard);
	    err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
		dmu_object_info_t doi;

		err = dmu_object_info(mos, obj, &doi);
		if (err) {
			if (!hard)
				return (err);
			lasterr = err;
			continue;
		}

		if (doi.doi_type == DMU_OT_DSL_DATASET) {
			dsl_dataset_t *ds;
			uint64_t txg = txg_start;

			rw_enter(&dp->dp_config_rwlock, RW_READER);
			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
			rw_exit(&dp->dp_config_rwlock);
			if (err) {
				if (!hard)
					return (err);
				lasterr = err;
				continue;
			}
			if (ds->ds_phys->ds_prev_snap_txg > txg)
				txg = ds->ds_phys->ds_prev_snap_txg;
			err = traverse_dataset(ds, txg, flags, func, arg);
			dsl_dataset_rele(ds, FTAG);
			if (err) {
				if (!hard)
					return (err);
				lasterr = err;
			}
		}
	}
	if (err == ESRCH)
		err = 0;
	return (err != 0 ? err : lasterr);
}

#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(traverse_dataset);
EXPORT_SYMBOL(traverse_pool);

module_param(zfs_pd_blks_max, int, 0644);
MODULE_PARM_DESC(zfs_pd_blks_max, "Max number of blocks to prefetch");
#endif