]> git.proxmox.com Git - mirror_zfs.git/blob - module/zfs/dmu_traverse.c
Rebase master to b105
[mirror_zfs.git] / module / zfs / dmu_traverse.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <sys/zfs_context.h>
27 #include <sys/dmu_objset.h>
28 #include <sys/dmu_traverse.h>
29 #include <sys/dsl_dataset.h>
30 #include <sys/dsl_dir.h>
31 #include <sys/dsl_pool.h>
32 #include <sys/dnode.h>
33 #include <sys/spa.h>
34 #include <sys/zio.h>
35 #include <sys/dmu_impl.h>
36 #include <sys/callb.h>
37
/*
 * Fill in all four fields of the bookmark that zb points to.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely as the unbraced body of an
 * if/else.  Every argument is parenthesized so that arbitrary expressions
 * may be passed.  Note that zb is evaluated once per field.
 */
#define	SET_BOOKMARK(zb, objset, object, level, blkid)	\
do {							\
	(zb)->zb_objset = (objset);			\
	(zb)->zb_object = (object);			\
	(zb)->zb_level = (level);			\
	(zb)->zb_blkid = (blkid);			\
} while (0)
45
46 struct prefetch_data {
47 kmutex_t pd_mtx;
48 kcondvar_t pd_cv;
49 int pd_blks_max;
50 int pd_blks_fetched;
51 int pd_flags;
52 boolean_t pd_cancel;
53 boolean_t pd_exited;
54 };
55
56 struct traverse_data {
57 spa_t *td_spa;
58 uint64_t td_objset;
59 blkptr_t *td_rootbp;
60 uint64_t td_min_txg;
61 int td_flags;
62 struct prefetch_data *td_pfd;
63 blkptr_cb_t *td_func;
64 void *td_arg;
65 };
66
67 /* ARGSUSED */
68 static void
69 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
70 {
71 struct traverse_data *td = arg;
72 zbookmark_t zb;
73
74 if (bp->blk_birth == 0)
75 return;
76
77 if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
78 return;
79
80 zb.zb_objset = td->td_objset;
81 zb.zb_object = 0;
82 zb.zb_level = -1;
83 zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
84 VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
85 }
86
87 /* ARGSUSED */
88 static void
89 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
90 {
91 struct traverse_data *td = arg;
92
93 if (lrc->lrc_txtype == TX_WRITE) {
94 lr_write_t *lr = (lr_write_t *)lrc;
95 blkptr_t *bp = &lr->lr_blkptr;
96 zbookmark_t zb;
97
98 if (bp->blk_birth == 0)
99 return;
100
101 if (claim_txg == 0 || bp->blk_birth < claim_txg)
102 return;
103
104 zb.zb_objset = td->td_objset;
105 zb.zb_object = lr->lr_foid;
106 zb.zb_level = BP_GET_LEVEL(bp);
107 zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
108 VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
109 }
110 }
111
112 static void
113 traverse_zil(struct traverse_data *td, zil_header_t *zh)
114 {
115 uint64_t claim_txg = zh->zh_claim_txg;
116 zilog_t *zilog;
117
118 /*
119 * We only want to visit blocks that have been claimed but not yet
120 * replayed (or, in read-only mode, blocks that *would* be claimed).
121 */
122 if (claim_txg == 0 && spa_writeable(td->td_spa))
123 return;
124
125 zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
126
127 (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
128 claim_txg);
129
130 zil_free(zilog);
131 }
132
133 static int
134 traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
135 arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
136 {
137 zbookmark_t czb;
138 int err = 0;
139 arc_buf_t *buf = NULL;
140 struct prefetch_data *pd = td->td_pfd;
141
142 if (bp->blk_birth == 0) {
143 err = td->td_func(td->td_spa, NULL, zb, dnp, td->td_arg);
144 return (err);
145 }
146
147 if (bp->blk_birth <= td->td_min_txg)
148 return (0);
149
150 if (pd && !pd->pd_exited &&
151 ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
152 BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
153 mutex_enter(&pd->pd_mtx);
154 ASSERT(pd->pd_blks_fetched >= 0);
155 while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
156 cv_wait(&pd->pd_cv, &pd->pd_mtx);
157 pd->pd_blks_fetched--;
158 cv_broadcast(&pd->pd_cv);
159 mutex_exit(&pd->pd_mtx);
160 }
161
162 if (td->td_flags & TRAVERSE_PRE) {
163 err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
164 if (err)
165 return (err);
166 }
167
168 if (BP_GET_LEVEL(bp) > 0) {
169 uint32_t flags = ARC_WAIT;
170 int i;
171 blkptr_t *cbp;
172 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
173
174 err = arc_read(NULL, td->td_spa, bp, pbuf,
175 arc_getbuf_func, &buf,
176 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
177 if (err)
178 return (err);
179
180 /* recursively visitbp() blocks below this */
181 cbp = buf->b_data;
182 for (i = 0; i < epb; i++, cbp++) {
183 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
184 zb->zb_level - 1,
185 zb->zb_blkid * epb + i);
186 err = traverse_visitbp(td, dnp, buf, cbp, &czb);
187 if (err)
188 break;
189 }
190 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
191 uint32_t flags = ARC_WAIT;
192 int i, j;
193 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
194
195 err = arc_read(NULL, td->td_spa, bp, pbuf,
196 arc_getbuf_func, &buf,
197 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
198 if (err)
199 return (err);
200
201 /* recursively visitbp() blocks below this */
202 dnp = buf->b_data;
203 for (i = 0; i < epb && err == 0; i++, dnp++) {
204 for (j = 0; j < dnp->dn_nblkptr; j++) {
205 SET_BOOKMARK(&czb, zb->zb_objset,
206 zb->zb_blkid * epb + i,
207 dnp->dn_nlevels - 1, j);
208 err = traverse_visitbp(td, dnp, buf,
209 (blkptr_t *)&dnp->dn_blkptr[j], &czb);
210 if (err)
211 break;
212 }
213 }
214 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
215 uint32_t flags = ARC_WAIT;
216 objset_phys_t *osp;
217 int j;
218
219 err = arc_read_nolock(NULL, td->td_spa, bp,
220 arc_getbuf_func, &buf,
221 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
222 if (err)
223 return (err);
224
225 osp = buf->b_data;
226 /*
227 * traverse_zil is just here for zdb's leak checking.
228 * For other consumers, there will be no ZIL blocks.
229 */
230 traverse_zil(td, &osp->os_zil_header);
231
232 for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) {
233 SET_BOOKMARK(&czb, zb->zb_objset, 0,
234 osp->os_meta_dnode.dn_nlevels - 1, j);
235 err = traverse_visitbp(td, &osp->os_meta_dnode, buf,
236 (blkptr_t *)&osp->os_meta_dnode.dn_blkptr[j],
237 &czb);
238 if (err)
239 break;
240 }
241 }
242
243 if (buf)
244 (void) arc_buf_remove_ref(buf, &buf);
245
246 if (err == 0 && (td->td_flags & TRAVERSE_POST))
247 err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
248
249 return (err);
250 }
251
252 /* ARGSUSED */
253 static int
254 traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
255 const dnode_phys_t *dnp, void *arg)
256 {
257 struct prefetch_data *pfd = arg;
258 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
259
260 ASSERT(pfd->pd_blks_fetched >= 0);
261 if (pfd->pd_cancel)
262 return (EINTR);
263
264 if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
265 BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0))
266 return (0);
267
268 mutex_enter(&pfd->pd_mtx);
269 while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
270 cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
271 pfd->pd_blks_fetched++;
272 cv_broadcast(&pfd->pd_cv);
273 mutex_exit(&pfd->pd_mtx);
274
275 (void) arc_read_nolock(NULL, spa, bp, NULL, NULL,
276 ZIO_PRIORITY_ASYNC_READ,
277 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
278 &aflags, zb);
279
280 return (0);
281 }
282
283 static void
284 traverse_prefetch_thread(void *arg)
285 {
286 struct traverse_data *td_main = arg;
287 struct traverse_data td = *td_main;
288 zbookmark_t czb;
289
290 td.td_func = traverse_prefetcher;
291 td.td_arg = td_main->td_pfd;
292 td.td_pfd = NULL;
293
294 SET_BOOKMARK(&czb, td.td_objset, 0, -1, 0);
295 (void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);
296
297 mutex_enter(&td_main->td_pfd->pd_mtx);
298 td_main->td_pfd->pd_exited = B_TRUE;
299 cv_broadcast(&td_main->td_pfd->pd_cv);
300 mutex_exit(&td_main->td_pfd->pd_mtx);
301 }
302
303 /*
304 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
305 * in syncing context).
306 */
307 static int
308 traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
309 uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
310 {
311 struct traverse_data td;
312 struct prefetch_data pd = { 0 };
313 zbookmark_t czb;
314 int err;
315
316 td.td_spa = spa;
317 td.td_objset = objset;
318 td.td_rootbp = rootbp;
319 td.td_min_txg = txg_start;
320 td.td_func = func;
321 td.td_arg = arg;
322 td.td_pfd = &pd;
323 td.td_flags = flags;
324
325 pd.pd_blks_max = 100;
326 pd.pd_flags = flags;
327 mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
328 cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
329
330 if (!(flags & TRAVERSE_PREFETCH) ||
331 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
332 &td, TQ_NOQUEUE))
333 pd.pd_exited = B_TRUE;
334
335 SET_BOOKMARK(&czb, objset, 0, -1, 0);
336 err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);
337
338 mutex_enter(&pd.pd_mtx);
339 pd.pd_cancel = B_TRUE;
340 cv_broadcast(&pd.pd_cv);
341 while (!pd.pd_exited)
342 cv_wait(&pd.pd_cv, &pd.pd_mtx);
343 mutex_exit(&pd.pd_mtx);
344
345 mutex_destroy(&pd.pd_mtx);
346 cv_destroy(&pd.pd_cv);
347
348 return (err);
349 }
350
351 /*
352 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
353 * in syncing context).
354 */
355 int
356 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
357 blkptr_cb_t func, void *arg)
358 {
359 return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object,
360 &ds->ds_phys->ds_bp, txg_start, flags, func, arg));
361 }
362
363 /*
364 * NB: pool must not be changing on-disk (eg, from zdb or sync context).
365 */
366 int
367 traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg)
368 {
369 int err;
370 uint64_t obj;
371 dsl_pool_t *dp = spa_get_dsl(spa);
372 objset_t *mos = dp->dp_meta_objset;
373
374 /* visit the MOS */
375 err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
376 0, TRAVERSE_PRE, func, arg);
377 if (err)
378 return (err);
379
380 /* visit each dataset */
381 for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, 0)) {
382 dmu_object_info_t doi;
383
384 err = dmu_object_info(mos, obj, &doi);
385 if (err)
386 return (err);
387
388 if (doi.doi_type == DMU_OT_DSL_DATASET) {
389 dsl_dataset_t *ds;
390 rw_enter(&dp->dp_config_rwlock, RW_READER);
391 err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
392 rw_exit(&dp->dp_config_rwlock);
393 if (err)
394 return (err);
395 err = traverse_dataset(ds,
396 ds->ds_phys->ds_prev_snap_txg, TRAVERSE_PRE,
397 func, arg);
398 dsl_dataset_rele(ds, FTAG);
399 if (err)
400 return (err);
401 }
402 }
403 if (err == ESRCH)
404 err = 0;
405 return (err);
406 }