]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
428870ff | 22 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
34dc7c2f BB |
23 | */ |
24 | ||
34dc7c2f BB |
25 | #include <sys/zfs_context.h> |
26 | #include <sys/dmu_objset.h> | |
27 | #include <sys/dmu_traverse.h> | |
28 | #include <sys/dsl_dataset.h> | |
29 | #include <sys/dsl_dir.h> | |
30 | #include <sys/dsl_pool.h> | |
31 | #include <sys/dnode.h> | |
32 | #include <sys/spa.h> | |
33 | #include <sys/zio.h> | |
34 | #include <sys/dmu_impl.h> | |
428870ff BB |
35 | #include <sys/sa.h> |
36 | #include <sys/sa_impl.h> | |
b128c09f BB |
37 | #include <sys/callb.h> |
38 | ||
572e2857 BB |
/*
 * Max number of blocks to prefetch.  Copied into pd_blks_max at the start
 * of each traversal; exposed as a module parameter at the end of this file.
 */
39 | int zfs_pd_blks_max = 100; |
40 | ||
41 | typedef struct prefetch_data { | |
b128c09f BB |
42 | kmutex_t pd_mtx; |
43 | kcondvar_t pd_cv; | |
44 | int pd_blks_max; | |
45 | int pd_blks_fetched; | |
46 | int pd_flags; | |
47 | boolean_t pd_cancel; | |
48 | boolean_t pd_exited; | |
572e2857 | 49 | } prefetch_data_t; |
b128c09f | 50 | |
572e2857 | 51 | typedef struct traverse_data { |
b128c09f BB |
52 | spa_t *td_spa; |
53 | uint64_t td_objset; | |
54 | blkptr_t *td_rootbp; | |
55 | uint64_t td_min_txg; | |
56 | int td_flags; | |
572e2857 | 57 | prefetch_data_t *td_pfd; |
b128c09f BB |
58 | blkptr_cb_t *td_func; |
59 | void *td_arg; | |
572e2857 | 60 | } traverse_data_t; |
34dc7c2f | 61 | |
6656bf56 BB |
62 | typedef struct traverse_visitbp_data { |
63 | /* Function arguments */ | |
64 | traverse_data_t *tv_td; | |
65 | const dnode_phys_t *tv_dnp; | |
66 | arc_buf_t *tv_pbuf; | |
67 | blkptr_t *tv_bp; | |
68 | const zbookmark_t *tv_zb; | |
69 | /* Local variables */ | |
70 | prefetch_data_t *tv_pd; | |
71 | zbookmark_t tv_czb; | |
72 | arc_buf_t *tv_buf; | |
73 | boolean_t tv_hard; | |
74 | objset_phys_t *tv_osp; | |
75 | dnode_phys_t *tv_ldnp; | |
76 | blkptr_t *tv_cbp; | |
77 | uint32_t tv_flags; | |
78 | int tv_err; | |
79 | int tv_lasterr; | |
80 | int tv_i; | |
81 | int tv_epb; | |
82 | int tv_depth; | |
83 | } traverse_visitbp_data_t; | |
84 | ||
85 | static inline int traverse_visitbp(traverse_data_t *td, const | |
86 | dnode_phys_t *dnp, arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb); | |
572e2857 | 87 | static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, |
9babb374 BB |
88 | arc_buf_t *buf, uint64_t objset, uint64_t object); |
89 | ||
428870ff | 90 | static int |
34dc7c2f BB |
91 | traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) |
92 | { | |
572e2857 | 93 | traverse_data_t *td = arg; |
b128c09f | 94 | zbookmark_t zb; |
34dc7c2f | 95 | |
b128c09f | 96 | if (bp->blk_birth == 0) |
428870ff | 97 | return (0); |
34dc7c2f | 98 | |
b128c09f | 99 | if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa)) |
428870ff BB |
100 | return (0); |
101 | ||
102 | SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, | |
103 | bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); | |
104 | ||
105 | (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg); | |
b128c09f | 106 | |
428870ff | 107 | return (0); |
34dc7c2f BB |
108 | } |
109 | ||
428870ff | 110 | static int |
34dc7c2f BB |
111 | traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) |
112 | { | |
572e2857 | 113 | traverse_data_t *td = arg; |
34dc7c2f BB |
114 | |
115 | if (lrc->lrc_txtype == TX_WRITE) { | |
116 | lr_write_t *lr = (lr_write_t *)lrc; | |
117 | blkptr_t *bp = &lr->lr_blkptr; | |
b128c09f | 118 | zbookmark_t zb; |
34dc7c2f | 119 | |
b128c09f | 120 | if (bp->blk_birth == 0) |
428870ff | 121 | return (0); |
34dc7c2f | 122 | |
b128c09f | 123 | if (claim_txg == 0 || bp->blk_birth < claim_txg) |
428870ff | 124 | return (0); |
b128c09f | 125 | |
572e2857 BB |
126 | SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, |
127 | ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); | |
428870ff BB |
128 | |
129 | (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, | |
130 | td->td_arg); | |
34dc7c2f | 131 | } |
428870ff | 132 | return (0); |
34dc7c2f BB |
133 | } |
134 | ||
135 | static void | |
572e2857 | 136 | traverse_zil(traverse_data_t *td, zil_header_t *zh) |
34dc7c2f | 137 | { |
34dc7c2f BB |
138 | uint64_t claim_txg = zh->zh_claim_txg; |
139 | zilog_t *zilog; | |
140 | ||
34dc7c2f BB |
141 | /* |
142 | * We only want to visit blocks that have been claimed but not yet | |
428870ff | 143 | * replayed; plus, in read-only mode, blocks that are already stable. |
34dc7c2f | 144 | */ |
fb5f0bc8 | 145 | if (claim_txg == 0 && spa_writeable(td->td_spa)) |
34dc7c2f BB |
146 | return; |
147 | ||
b128c09f | 148 | zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); |
34dc7c2f | 149 | |
b128c09f | 150 | (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, |
34dc7c2f BB |
151 | claim_txg); |
152 | ||
153 | zil_free(zilog); | |
154 | } | |
155 | ||
6656bf56 BB |
156 | #define TRAVERSE_VISITBP_MAX_DEPTH 20 |
157 | ||
158 | static void | |
159 | __traverse_visitbp_init(traverse_visitbp_data_t *tv, | |
160 | traverse_data_t *td, const dnode_phys_t *dnp, | |
161 | arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb, int depth) | |
34dc7c2f | 162 | { |
6656bf56 BB |
163 | tv->tv_td = td; |
164 | tv->tv_dnp = dnp; | |
165 | tv->tv_pbuf = pbuf; | |
166 | tv->tv_bp = bp; | |
167 | tv->tv_zb = zb; | |
168 | tv->tv_err = 0; | |
169 | tv->tv_lasterr = 0; | |
170 | tv->tv_buf = NULL; | |
171 | tv->tv_pd = td->td_pfd; | |
172 | tv->tv_hard = td->td_flags & TRAVERSE_HARD; | |
173 | tv->tv_flags = ARC_WAIT; | |
174 | tv->tv_depth = depth; | |
175 | } | |
b128c09f | 176 | |
6656bf56 BB |
177 | static noinline int |
178 | __traverse_visitbp(traverse_visitbp_data_t *tv) | |
179 | { | |
180 | ASSERT3S(tv->tv_depth, <, TRAVERSE_VISITBP_MAX_DEPTH); | |
181 | ||
182 | if (tv->tv_bp->blk_birth == 0) { | |
183 | tv->tv_err = tv->tv_td->td_func(tv->tv_td->td_spa, NULL, NULL, | |
184 | tv->tv_pbuf, tv->tv_zb, tv->tv_dnp, tv->tv_td->td_arg); | |
185 | return (tv->tv_err); | |
34dc7c2f BB |
186 | } |
187 | ||
6656bf56 | 188 | if (tv->tv_bp->blk_birth <= tv->tv_td->td_min_txg) |
b128c09f | 189 | return (0); |
34dc7c2f | 190 | |
6656bf56 BB |
191 | if (tv->tv_pd && !tv->tv_pd->pd_exited && |
192 | ((tv->tv_pd->pd_flags & TRAVERSE_PREFETCH_DATA) || | |
193 | BP_GET_TYPE(tv->tv_bp) == DMU_OT_DNODE || | |
194 | BP_GET_LEVEL(tv->tv_bp) > 0)) { | |
195 | mutex_enter(&tv->tv_pd->pd_mtx); | |
196 | ASSERT(tv->tv_pd->pd_blks_fetched >= 0); | |
197 | while (tv->tv_pd->pd_blks_fetched == 0 && !tv->tv_pd->pd_exited) | |
198 | cv_wait(&tv->tv_pd->pd_cv, &tv->tv_pd->pd_mtx); | |
199 | tv->tv_pd->pd_blks_fetched--; | |
200 | cv_broadcast(&tv->tv_pd->pd_cv); | |
201 | mutex_exit(&tv->tv_pd->pd_mtx); | |
34dc7c2f BB |
202 | } |
203 | ||
6656bf56 BB |
204 | if (tv->tv_td->td_flags & TRAVERSE_PRE) { |
205 | tv->tv_err = tv->tv_td->td_func(tv->tv_td->td_spa, NULL, | |
206 | tv->tv_bp, tv->tv_pbuf, tv->tv_zb, tv->tv_dnp, | |
207 | tv->tv_td->td_arg); | |
208 | if (tv->tv_err == TRAVERSE_VISIT_NO_CHILDREN) | |
572e2857 | 209 | return (0); |
6656bf56 BB |
210 | if (tv->tv_err) |
211 | return (tv->tv_err); | |
34dc7c2f BB |
212 | } |
213 | ||
6656bf56 BB |
214 | if (BP_GET_LEVEL(tv->tv_bp) > 0) { |
215 | tv->tv_epb = BP_GET_LSIZE(tv->tv_bp) >> SPA_BLKPTRSHIFT; | |
b128c09f | 216 | |
6656bf56 BB |
217 | tv->tv_err = dsl_read(NULL, tv->tv_td->td_spa, tv->tv_bp, |
218 | tv->tv_pbuf, arc_getbuf_func, &tv->tv_buf, | |
219 | ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, | |
220 | &tv->tv_flags, tv->tv_zb); | |
221 | if (tv->tv_err) | |
222 | return (tv->tv_err); | |
b128c09f BB |
223 | |
224 | /* recursively visitbp() blocks below this */ | |
6656bf56 BB |
225 | tv->tv_cbp = tv->tv_buf->b_data; |
226 | for (tv->tv_i = 0; tv->tv_i < tv->tv_epb; | |
227 | tv->tv_i++, tv->tv_cbp++) { | |
228 | SET_BOOKMARK(&tv->tv_czb, tv->tv_zb->zb_objset, | |
229 | tv->tv_zb->zb_object, tv->tv_zb->zb_level - 1, | |
230 | tv->tv_zb->zb_blkid * tv->tv_epb + tv->tv_i); | |
231 | __traverse_visitbp_init(tv + 1, tv->tv_td, | |
232 | tv->tv_dnp, tv->tv_buf, tv->tv_cbp, | |
233 | &tv->tv_czb, tv->tv_depth + 1); | |
234 | tv->tv_err = __traverse_visitbp(tv + 1); | |
235 | if (tv->tv_err) { | |
236 | if (!tv->tv_hard) | |
428870ff | 237 | break; |
6656bf56 | 238 | tv->tv_lasterr = tv->tv_err; |
428870ff | 239 | } |
b128c09f | 240 | } |
6656bf56 BB |
241 | } else if (BP_GET_TYPE(tv->tv_bp) == DMU_OT_DNODE) { |
242 | tv->tv_epb = BP_GET_LSIZE(tv->tv_bp) >> DNODE_SHIFT; | |
b128c09f | 243 | |
6656bf56 BB |
244 | tv->tv_err = dsl_read(NULL, tv->tv_td->td_spa, tv->tv_bp, |
245 | tv->tv_pbuf, arc_getbuf_func, &tv->tv_buf, | |
246 | ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, | |
247 | &tv->tv_flags, tv->tv_zb); | |
248 | if (tv->tv_err) | |
249 | return (tv->tv_err); | |
b128c09f BB |
250 | |
251 | /* recursively visitbp() blocks below this */ | |
6656bf56 BB |
252 | tv->tv_dnp = tv->tv_buf->b_data; |
253 | for (tv->tv_i = 0; tv->tv_i < tv->tv_epb; | |
254 | tv->tv_i++, tv->tv_dnp++) { | |
255 | tv->tv_err = traverse_dnode(tv->tv_td, tv->tv_dnp, | |
256 | tv->tv_buf, tv->tv_zb->zb_objset, | |
257 | tv->tv_zb->zb_blkid * tv->tv_epb + tv->tv_i); | |
258 | if (tv->tv_err) { | |
259 | if (!tv->tv_hard) | |
428870ff | 260 | break; |
6656bf56 | 261 | tv->tv_lasterr = tv->tv_err; |
428870ff | 262 | } |
34dc7c2f | 263 | } |
6656bf56 BB |
264 | } else if (BP_GET_TYPE(tv->tv_bp) == DMU_OT_OBJSET) { |
265 | ||
266 | tv->tv_err = dsl_read_nolock(NULL, tv->tv_td->td_spa, | |
267 | tv->tv_bp, arc_getbuf_func, &tv->tv_buf, | |
268 | ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, | |
269 | &tv->tv_flags, tv->tv_zb); | |
270 | if (tv->tv_err) | |
271 | return (tv->tv_err); | |
272 | ||
273 | tv->tv_osp = tv->tv_buf->b_data; | |
274 | tv->tv_ldnp = &tv->tv_osp->os_meta_dnode; | |
275 | tv->tv_err = traverse_dnode(tv->tv_td, tv->tv_ldnp, tv->tv_buf, | |
276 | tv->tv_zb->zb_objset, DMU_META_DNODE_OBJECT); | |
277 | if (tv->tv_err && tv->tv_hard) { | |
278 | tv->tv_lasterr = tv->tv_err; | |
279 | tv->tv_err = 0; | |
428870ff | 280 | } |
6656bf56 BB |
281 | if (tv->tv_err == 0 && |
282 | arc_buf_size(tv->tv_buf) >= sizeof (objset_phys_t)) { | |
283 | tv->tv_ldnp = &tv->tv_osp->os_userused_dnode; | |
284 | tv->tv_err = traverse_dnode(tv->tv_td, tv->tv_ldnp, | |
285 | tv->tv_buf, tv->tv_zb->zb_objset, | |
9babb374 BB |
286 | DMU_USERUSED_OBJECT); |
287 | } | |
6656bf56 BB |
288 | if (tv->tv_err && tv->tv_hard) { |
289 | tv->tv_lasterr = tv->tv_err; | |
290 | tv->tv_err = 0; | |
428870ff | 291 | } |
6656bf56 BB |
292 | if (tv->tv_err == 0 && |
293 | arc_buf_size(tv->tv_buf) >= sizeof (objset_phys_t)) { | |
294 | tv->tv_ldnp = &tv->tv_osp->os_groupused_dnode; | |
295 | tv->tv_err = traverse_dnode(tv->tv_td, tv->tv_ldnp, | |
296 | tv->tv_buf, tv->tv_zb->zb_objset, | |
9babb374 | 297 | DMU_GROUPUSED_OBJECT); |
34dc7c2f | 298 | } |
34dc7c2f BB |
299 | } |
300 | ||
6656bf56 BB |
301 | if (tv->tv_buf) |
302 | (void) arc_buf_remove_ref(tv->tv_buf, &tv->tv_buf); | |
34dc7c2f | 303 | |
6656bf56 BB |
304 | if (tv->tv_err == 0 && tv->tv_lasterr == 0 && |
305 | (tv->tv_td->td_flags & TRAVERSE_POST)) { | |
306 | tv->tv_err = tv->tv_td->td_func(tv->tv_td->td_spa, NULL, | |
307 | tv->tv_bp, tv->tv_pbuf, tv->tv_zb, tv->tv_dnp, | |
308 | tv->tv_td->td_arg); | |
428870ff | 309 | } |
34dc7c2f | 310 | |
6656bf56 BB |
311 | return (tv->tv_err != 0 ? tv->tv_err : tv->tv_lasterr); |
312 | } | |
313 | ||
314 | /* | |
315 | * Due to limited stack space recursive functions are frowned upon in | |
316 | * the Linux kernel. However, they often are the most elegant solution | |
317 | * to a problem. The following code preserves the recursive function | |
318 | * traverse_visitbp() but moves the local variables AND function | |
319 | * arguments to the heap to minimize the stack frame size. Enough | |
320 | * space is initially allocated on the stack for 16 levels of recursion. | |
321 | * This change does ugly-up-the-code but it reduces the worst case | |
322 | * usage from roughly 2496 bytes to 576 bytes on x86_64 archs. | |
323 | */ | |
324 | static int | |
325 | traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, | |
326 | arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) | |
327 | { | |
328 | traverse_visitbp_data_t *tv; | |
329 | int error; | |
330 | ||
331 | tv = kmem_zalloc(sizeof(traverse_visitbp_data_t) * | |
332 | TRAVERSE_VISITBP_MAX_DEPTH, KM_SLEEP); | |
333 | __traverse_visitbp_init(tv, td, dnp, pbuf, bp, zb, 0); | |
334 | ||
335 | error = __traverse_visitbp(tv); | |
336 | ||
337 | kmem_free(tv, sizeof(traverse_visitbp_data_t) * | |
338 | TRAVERSE_VISITBP_MAX_DEPTH); | |
339 | ||
340 | return (error); | |
34dc7c2f BB |
341 | } |
342 | ||
9babb374 | 343 | static int |
572e2857 | 344 | traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, |
9babb374 BB |
345 | arc_buf_t *buf, uint64_t objset, uint64_t object) |
346 | { | |
428870ff | 347 | int j, err = 0, lasterr = 0; |
9babb374 | 348 | zbookmark_t czb; |
428870ff | 349 | boolean_t hard = (td->td_flags & TRAVERSE_HARD); |
9babb374 BB |
350 | |
351 | for (j = 0; j < dnp->dn_nblkptr; j++) { | |
352 | SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); | |
353 | err = traverse_visitbp(td, dnp, buf, | |
354 | (blkptr_t *)&dnp->dn_blkptr[j], &czb); | |
428870ff BB |
355 | if (err) { |
356 | if (!hard) | |
357 | break; | |
358 | lasterr = err; | |
359 | } | |
9babb374 | 360 | } |
428870ff BB |
361 | |
362 | if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { | |
363 | SET_BOOKMARK(&czb, objset, | |
364 | object, 0, DMU_SPILL_BLKID); | |
365 | err = traverse_visitbp(td, dnp, buf, | |
366 | (blkptr_t *)&dnp->dn_spill, &czb); | |
367 | if (err) { | |
368 | if (!hard) | |
369 | return (err); | |
370 | lasterr = err; | |
371 | } | |
372 | } | |
373 | return (err != 0 ? err : lasterr); | |
9babb374 BB |
374 | } |
375 | ||
b128c09f BB |
376 | /* ARGSUSED */ |
377 | static int | |
428870ff BB |
378 | traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, |
379 | arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, | |
380 | void *arg) | |
34dc7c2f | 381 | { |
572e2857 | 382 | prefetch_data_t *pfd = arg; |
b128c09f | 383 | uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; |
34dc7c2f | 384 | |
b128c09f BB |
385 | ASSERT(pfd->pd_blks_fetched >= 0); |
386 | if (pfd->pd_cancel) | |
387 | return (EINTR); | |
34dc7c2f | 388 | |
b128c09f | 389 | if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || |
428870ff BB |
390 | BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) || |
391 | BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) | |
34dc7c2f BB |
392 | return (0); |
393 | ||
b128c09f BB |
394 | mutex_enter(&pfd->pd_mtx); |
395 | while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max) | |
396 | cv_wait(&pfd->pd_cv, &pfd->pd_mtx); | |
397 | pfd->pd_blks_fetched++; | |
398 | cv_broadcast(&pfd->pd_cv); | |
399 | mutex_exit(&pfd->pd_mtx); | |
34dc7c2f | 400 | |
428870ff | 401 | (void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL, |
b128c09f BB |
402 | ZIO_PRIORITY_ASYNC_READ, |
403 | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, | |
404 | &aflags, zb); | |
34dc7c2f | 405 | |
b128c09f | 406 | return (0); |
34dc7c2f BB |
407 | } |
408 | ||
34dc7c2f | 409 | static void |
b128c09f | 410 | traverse_prefetch_thread(void *arg) |
34dc7c2f | 411 | { |
572e2857 BB |
412 | traverse_data_t *td_main = arg; |
413 | traverse_data_t td = *td_main; | |
b128c09f | 414 | zbookmark_t czb; |
34dc7c2f | 415 | |
b128c09f BB |
416 | td.td_func = traverse_prefetcher; |
417 | td.td_arg = td_main->td_pfd; | |
418 | td.td_pfd = NULL; | |
34dc7c2f | 419 | |
428870ff BB |
420 | SET_BOOKMARK(&czb, td.td_objset, |
421 | ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); | |
b128c09f | 422 | (void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb); |
34dc7c2f | 423 | |
b128c09f BB |
424 | mutex_enter(&td_main->td_pfd->pd_mtx); |
425 | td_main->td_pfd->pd_exited = B_TRUE; | |
426 | cv_broadcast(&td_main->td_pfd->pd_cv); | |
427 | mutex_exit(&td_main->td_pfd->pd_mtx); | |
34dc7c2f BB |
428 | } |
429 | ||
b128c09f BB |
430 | /* |
431 | * NB: dataset must not be changing on-disk (eg, is a snapshot or we are | |
432 | * in syncing context). | |
433 | */ | |
434 | static int | |
572e2857 | 435 | traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp, |
b128c09f | 436 | uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) |
34dc7c2f | 437 | { |
47050a88 BB |
438 | traverse_data_t *td; |
439 | prefetch_data_t *pd; | |
440 | zbookmark_t *czb; | |
b128c09f | 441 | int err; |
34dc7c2f | 442 | |
47050a88 BB |
443 | td = kmem_alloc(sizeof(traverse_data_t), KM_SLEEP); |
444 | pd = kmem_zalloc(sizeof(prefetch_data_t), KM_SLEEP); | |
445 | czb = kmem_alloc(sizeof(zbookmark_t), KM_SLEEP); | |
446 | ||
447 | td->td_spa = spa; | |
448 | td->td_objset = ds ? ds->ds_object : 0; | |
449 | td->td_rootbp = rootbp; | |
450 | td->td_min_txg = txg_start; | |
451 | td->td_func = func; | |
452 | td->td_arg = arg; | |
453 | td->td_pfd = pd; | |
454 | td->td_flags = flags; | |
b128c09f | 455 | |
47050a88 BB |
456 | pd->pd_blks_max = zfs_pd_blks_max; |
457 | pd->pd_flags = flags; | |
458 | mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL); | |
459 | cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL); | |
b128c09f | 460 | |
572e2857 BB |
461 | /* See comment on ZIL traversal in dsl_scan_visitds. */ |
462 | if (ds != NULL && !dsl_dataset_is_snapshot(ds)) { | |
463 | objset_t *os; | |
464 | ||
465 | err = dmu_objset_from_ds(ds, &os); | |
466 | if (err) | |
467 | return (err); | |
468 | ||
47050a88 | 469 | traverse_zil(td, &os->os_zil_header); |
572e2857 BB |
470 | } |
471 | ||
b128c09f BB |
472 | if (!(flags & TRAVERSE_PREFETCH) || |
473 | 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, | |
47050a88 BB |
474 | td, TQ_NOQUEUE)) |
475 | pd->pd_exited = B_TRUE; | |
b128c09f | 476 | |
47050a88 | 477 | SET_BOOKMARK(czb, td->td_objset, |
428870ff | 478 | ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); |
47050a88 BB |
479 | err = traverse_visitbp(td, NULL, NULL, rootbp, czb); |
480 | ||
481 | mutex_enter(&pd->pd_mtx); | |
482 | pd->pd_cancel = B_TRUE; | |
483 | cv_broadcast(&pd->pd_cv); | |
484 | while (!pd->pd_exited) | |
485 | cv_wait(&pd->pd_cv, &pd->pd_mtx); | |
486 | mutex_exit(&pd->pd_mtx); | |
b128c09f | 487 | |
47050a88 BB |
488 | mutex_destroy(&pd->pd_mtx); |
489 | cv_destroy(&pd->pd_cv); | |
b128c09f | 490 | |
47050a88 BB |
491 | kmem_free(czb, sizeof(zbookmark_t)); |
492 | kmem_free(pd, sizeof(struct prefetch_data)); | |
493 | kmem_free(td, sizeof(struct traverse_data)); | |
34dc7c2f | 494 | |
b128c09f | 495 | return (err); |
34dc7c2f BB |
496 | } |
497 | ||
b128c09f BB |
498 | /* |
499 | * NB: dataset must not be changing on-disk (eg, is a snapshot or we are | |
500 | * in syncing context). | |
501 | */ | |
502 | int | |
503 | traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, | |
504 | blkptr_cb_t func, void *arg) | |
34dc7c2f | 505 | { |
572e2857 | 506 | return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, |
b128c09f | 507 | &ds->ds_phys->ds_bp, txg_start, flags, func, arg)); |
34dc7c2f BB |
508 | } |
509 | ||
b128c09f BB |
510 | /* |
511 | * NB: pool must not be changing on-disk (eg, from zdb or sync context). | |
512 | */ | |
513 | int | |
428870ff BB |
514 | traverse_pool(spa_t *spa, uint64_t txg_start, int flags, |
515 | blkptr_cb_t func, void *arg) | |
34dc7c2f | 516 | { |
428870ff | 517 | int err, lasterr = 0; |
b128c09f BB |
518 | uint64_t obj; |
519 | dsl_pool_t *dp = spa_get_dsl(spa); | |
520 | objset_t *mos = dp->dp_meta_objset; | |
428870ff | 521 | boolean_t hard = (flags & TRAVERSE_HARD); |
b128c09f BB |
522 | |
523 | /* visit the MOS */ | |
572e2857 | 524 | err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa), |
428870ff | 525 | txg_start, flags, func, arg); |
b128c09f BB |
526 | if (err) |
527 | return (err); | |
528 | ||
529 | /* visit each dataset */ | |
428870ff BB |
530 | for (obj = 1; err == 0 || (err != ESRCH && hard); |
531 | err = dmu_object_next(mos, &obj, FALSE, txg_start)) { | |
b128c09f BB |
532 | dmu_object_info_t doi; |
533 | ||
534 | err = dmu_object_info(mos, obj, &doi); | |
428870ff BB |
535 | if (err) { |
536 | if (!hard) | |
537 | return (err); | |
538 | lasterr = err; | |
539 | continue; | |
540 | } | |
b128c09f BB |
541 | |
542 | if (doi.doi_type == DMU_OT_DSL_DATASET) { | |
543 | dsl_dataset_t *ds; | |
428870ff BB |
544 | uint64_t txg = txg_start; |
545 | ||
b128c09f BB |
546 | rw_enter(&dp->dp_config_rwlock, RW_READER); |
547 | err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); | |
548 | rw_exit(&dp->dp_config_rwlock); | |
428870ff BB |
549 | if (err) { |
550 | if (!hard) | |
551 | return (err); | |
552 | lasterr = err; | |
553 | continue; | |
554 | } | |
555 | if (ds->ds_phys->ds_prev_snap_txg > txg) | |
556 | txg = ds->ds_phys->ds_prev_snap_txg; | |
557 | err = traverse_dataset(ds, txg, flags, func, arg); | |
b128c09f | 558 | dsl_dataset_rele(ds, FTAG); |
428870ff BB |
559 | if (err) { |
560 | if (!hard) | |
561 | return (err); | |
562 | lasterr = err; | |
563 | } | |
b128c09f | 564 | } |
34dc7c2f | 565 | } |
b128c09f BB |
566 | if (err == ESRCH) |
567 | err = 0; | |
428870ff | 568 | return (err != 0 ? err : lasterr); |
34dc7c2f | 569 | } |
c28b2279 BB |
570 | |
/*
 * Kernel-module glue, compiled only when both _KERNEL and HAVE_SPL are
 * defined: export the two public traversal entry points and expose
 * zfs_pd_blks_max as an int module parameter with permissions 0644.
 */
571 | #if defined(_KERNEL) && defined(HAVE_SPL) | |
572 | EXPORT_SYMBOL(traverse_dataset); | |
573 | EXPORT_SYMBOL(traverse_pool); | |
c409e464 BB |
574 | |
575 | module_param(zfs_pd_blks_max, int, 0644); | |
576 | MODULE_PARM_DESC(zfs_pd_blks_max, "Max number of blocks to prefetch"); | |
c28b2279 | 577 | #endif |