]>
Commit | Line | Data |
---|---|---|
b128c09f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright 2008 Sun Microsystems, Inc. All rights reserved. | |
23 | * Use is subject to license terms. | |
24 | */ | |
25 | ||
26 | #include <sys/dsl_pool.h> | |
27 | #include <sys/dsl_dataset.h> | |
28 | #include <sys/dsl_prop.h> | |
29 | #include <sys/dsl_dir.h> | |
30 | #include <sys/dsl_synctask.h> | |
31 | #include <sys/dnode.h> | |
32 | #include <sys/dmu_tx.h> | |
33 | #include <sys/dmu_objset.h> | |
34 | #include <sys/arc.h> | |
35 | #include <sys/zap.h> | |
36 | #include <sys/zio.h> | |
37 | #include <sys/zfs_context.h> | |
38 | #include <sys/fs/zfs.h> | |
39 | #include <sys/zfs_znode.h> | |
40 | #include <sys/spa_impl.h> | |
41 | #include <sys/vdev_impl.h> | |
42 | #include <sys/zil_impl.h> | |
43 | ||
44 | typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); | |
45 | ||
46 | static scrub_cb_t dsl_pool_scrub_clean_cb; | |
47 | static dsl_syncfunc_t dsl_pool_scrub_cancel_sync; | |
48 | ||
49 | int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */ | |
50 | int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */ | |
51 | boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ | |
52 | ||
53 | extern int zfs_txg_timeout; | |
54 | ||
55 | static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = { | |
56 | NULL, | |
57 | dsl_pool_scrub_clean_cb | |
58 | }; | |
59 | ||
/*
 * Fill in all four fields of the bookmark that zb points to.
 *
 * The body is wrapped in do { } while (0) so that the macro followed by
 * a semicolon forms exactly one statement (safe in an unbraced
 * if / else), and every argument is parenthesized so that arbitrary
 * expressions can be passed without precedence surprises.
 */
#define	SET_BOOKMARK(zb, objset, object, level, blkid)	\
do {							\
	(zb)->zb_objset = (objset);			\
	(zb)->zb_object = (object);			\
	(zb)->zb_level = (level);			\
	(zb)->zb_blkid = (blkid);			\
} while (0)
67 | ||
68 | /* ARGSUSED */ | |
69 | static void | |
70 | dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) | |
71 | { | |
72 | dsl_pool_t *dp = arg1; | |
73 | enum scrub_func *funcp = arg2; | |
74 | dmu_object_type_t ot = 0; | |
75 | boolean_t complete = B_FALSE; | |
76 | ||
77 | dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx); | |
78 | ||
79 | ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE); | |
80 | ASSERT(*funcp > SCRUB_FUNC_NONE); | |
81 | ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS); | |
82 | ||
83 | dp->dp_scrub_min_txg = 0; | |
84 | dp->dp_scrub_max_txg = tx->tx_txg; | |
85 | ||
86 | if (*funcp == SCRUB_FUNC_CLEAN) { | |
87 | vdev_t *rvd = dp->dp_spa->spa_root_vdev; | |
88 | ||
89 | /* rewrite all disk labels */ | |
90 | vdev_config_dirty(rvd); | |
91 | ||
92 | if (vdev_resilver_needed(rvd, | |
93 | &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) { | |
94 | spa_event_notify(dp->dp_spa, NULL, | |
95 | ESC_ZFS_RESILVER_START); | |
96 | dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg, | |
97 | tx->tx_txg); | |
98 | } | |
99 | ||
100 | /* zero out the scrub stats in all vdev_stat_t's */ | |
101 | vdev_scrub_stat_update(rvd, | |
102 | dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : | |
103 | POOL_SCRUB_EVERYTHING, B_FALSE); | |
104 | ||
105 | dp->dp_spa->spa_scrub_started = B_TRUE; | |
106 | } | |
107 | ||
108 | /* back to the generic stuff */ | |
109 | ||
110 | if (dp->dp_blkstats == NULL) { | |
111 | dp->dp_blkstats = | |
112 | kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); | |
113 | } | |
114 | bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); | |
115 | ||
116 | if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) | |
117 | ot = DMU_OT_ZAP_OTHER; | |
118 | ||
119 | dp->dp_scrub_func = *funcp; | |
120 | dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset, | |
121 | ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx); | |
122 | bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); | |
123 | dp->dp_scrub_restart = B_FALSE; | |
124 | dp->dp_spa->spa_scrub_errors = 0; | |
125 | ||
126 | VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, | |
127 | DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, | |
128 | &dp->dp_scrub_func, tx)); | |
129 | VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, | |
130 | DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, | |
131 | &dp->dp_scrub_queue_obj, tx)); | |
132 | VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, | |
133 | DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, | |
134 | &dp->dp_scrub_min_txg, tx)); | |
135 | VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, | |
136 | DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, | |
137 | &dp->dp_scrub_max_txg, tx)); | |
138 | VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, | |
139 | DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, | |
140 | &dp->dp_scrub_bookmark, tx)); | |
141 | VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, | |
142 | DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, | |
143 | &dp->dp_spa->spa_scrub_errors, tx)); | |
144 | ||
145 | spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr, | |
146 | "func=%u mintxg=%llu maxtxg=%llu", | |
147 | *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg); | |
148 | } | |
149 | ||
150 | int | |
151 | dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func) | |
152 | { | |
153 | return (dsl_sync_task_do(dp, NULL, | |
154 | dsl_pool_scrub_setup_sync, dp, &func, 0)); | |
155 | } | |
156 | ||
157 | /* ARGSUSED */ | |
158 | static void | |
159 | dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) | |
160 | { | |
161 | dsl_pool_t *dp = arg1; | |
162 | boolean_t *completep = arg2; | |
163 | ||
164 | if (dp->dp_scrub_func == SCRUB_FUNC_NONE) | |
165 | return; | |
166 | ||
167 | mutex_enter(&dp->dp_scrub_cancel_lock); | |
168 | ||
169 | if (dp->dp_scrub_restart) { | |
170 | dp->dp_scrub_restart = B_FALSE; | |
171 | *completep = B_FALSE; | |
172 | } | |
173 | ||
174 | /* XXX this is scrub-clean specific */ | |
175 | mutex_enter(&dp->dp_spa->spa_scrub_lock); | |
176 | while (dp->dp_spa->spa_scrub_inflight > 0) { | |
177 | cv_wait(&dp->dp_spa->spa_scrub_io_cv, | |
178 | &dp->dp_spa->spa_scrub_lock); | |
179 | } | |
180 | mutex_exit(&dp->dp_spa->spa_scrub_lock); | |
181 | dp->dp_spa->spa_scrub_started = B_FALSE; | |
182 | dp->dp_spa->spa_scrub_active = B_FALSE; | |
183 | ||
184 | dp->dp_scrub_func = SCRUB_FUNC_NONE; | |
185 | VERIFY(0 == dmu_object_free(dp->dp_meta_objset, | |
186 | dp->dp_scrub_queue_obj, tx)); | |
187 | dp->dp_scrub_queue_obj = 0; | |
188 | bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); | |
189 | ||
190 | VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, | |
191 | DMU_POOL_SCRUB_QUEUE, tx)); | |
192 | VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, | |
193 | DMU_POOL_SCRUB_MIN_TXG, tx)); | |
194 | VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, | |
195 | DMU_POOL_SCRUB_MAX_TXG, tx)); | |
196 | VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, | |
197 | DMU_POOL_SCRUB_BOOKMARK, tx)); | |
198 | VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, | |
199 | DMU_POOL_SCRUB_FUNC, tx)); | |
200 | VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, | |
201 | DMU_POOL_SCRUB_ERRORS, tx)); | |
202 | ||
203 | spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr, | |
204 | "complete=%u", *completep); | |
205 | ||
206 | /* below is scrub-clean specific */ | |
207 | vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE, | |
208 | *completep); | |
209 | /* | |
210 | * If the scrub/resilver completed, update all DTLs to reflect this. | |
211 | * Whether it succeeded or not, vacate all temporary scrub DTLs. | |
212 | */ | |
213 | vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg, | |
214 | *completep ? dp->dp_scrub_max_txg : 0, B_TRUE); | |
215 | if (dp->dp_scrub_min_txg && *completep) | |
216 | spa_event_notify(dp->dp_spa, NULL, ESC_ZFS_RESILVER_FINISH); | |
217 | spa_errlog_rotate(dp->dp_spa); | |
218 | ||
219 | /* | |
220 | * We may have finished replacing a device. | |
221 | * Let the async thread assess this and handle the detach. | |
222 | */ | |
223 | spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE); | |
224 | ||
225 | dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0; | |
226 | mutex_exit(&dp->dp_scrub_cancel_lock); | |
227 | } | |
228 | ||
229 | int | |
230 | dsl_pool_scrub_cancel(dsl_pool_t *dp) | |
231 | { | |
232 | boolean_t complete = B_FALSE; | |
233 | ||
234 | return (dsl_sync_task_do(dp, NULL, | |
235 | dsl_pool_scrub_cancel_sync, dp, &complete, 3)); | |
236 | } | |
237 | ||
238 | int | |
239 | dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp, | |
240 | zio_done_func_t *done, void *private, uint32_t arc_flags) | |
241 | { | |
242 | /* | |
243 | * This function will be used by bp-rewrite wad to intercept frees. | |
244 | */ | |
245 | return (arc_free(pio, dp->dp_spa, txg, (blkptr_t *)bpp, | |
246 | done, private, arc_flags)); | |
247 | } | |
248 | ||
249 | static boolean_t | |
250 | bookmark_is_zero(const zbookmark_t *zb) | |
251 | { | |
252 | return (zb->zb_objset == 0 && zb->zb_object == 0 && | |
253 | zb->zb_level == 0 && zb->zb_blkid == 0); | |
254 | } | |
255 | ||
256 | /* dnp is the dnode for zb1->zb_object */ | |
257 | static boolean_t | |
258 | bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, | |
259 | const zbookmark_t *zb2) | |
260 | { | |
261 | uint64_t zb1nextL0, zb2thisobj; | |
262 | ||
263 | ASSERT(zb1->zb_objset == zb2->zb_objset); | |
264 | ASSERT(zb1->zb_object != -1ULL); | |
265 | ASSERT(zb2->zb_level == 0); | |
266 | ||
267 | /* | |
268 | * A bookmark in the deadlist is considered to be after | |
269 | * everything else. | |
270 | */ | |
271 | if (zb2->zb_object == -1ULL) | |
272 | return (B_TRUE); | |
273 | ||
274 | /* The objset_phys_t isn't before anything. */ | |
275 | if (dnp == NULL) | |
276 | return (B_FALSE); | |
277 | ||
278 | zb1nextL0 = (zb1->zb_blkid + 1) << | |
279 | ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); | |
280 | ||
281 | zb2thisobj = zb2->zb_object ? zb2->zb_object : | |
282 | zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); | |
283 | ||
284 | if (zb1->zb_object == 0) { | |
285 | uint64_t nextobj = zb1nextL0 * | |
286 | (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; | |
287 | return (nextobj <= zb2thisobj); | |
288 | } | |
289 | ||
290 | if (zb1->zb_object < zb2thisobj) | |
291 | return (B_TRUE); | |
292 | if (zb1->zb_object > zb2thisobj) | |
293 | return (B_FALSE); | |
294 | if (zb2->zb_object == 0) | |
295 | return (B_FALSE); | |
296 | return (zb1nextL0 <= zb2->zb_blkid); | |
297 | } | |
298 | ||
299 | static boolean_t | |
300 | scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb) | |
301 | { | |
302 | int elapsed_ticks; | |
303 | int mintime; | |
304 | ||
305 | if (dp->dp_scrub_pausing) | |
306 | return (B_TRUE); /* we're already pausing */ | |
307 | ||
308 | if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) | |
309 | return (B_FALSE); /* we're resuming */ | |
310 | ||
311 | /* We only know how to resume from level-0 blocks. */ | |
312 | if (zb->zb_level != 0) | |
313 | return (B_FALSE); | |
314 | ||
315 | mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time : | |
316 | zfs_scrub_min_time; | |
317 | elapsed_ticks = lbolt64 - dp->dp_scrub_start_time; | |
318 | if (elapsed_ticks > hz * zfs_txg_timeout || | |
319 | (elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) { | |
320 | dprintf("pausing at %llx/%llx/%llx/%llx\n", | |
321 | (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, | |
322 | (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); | |
323 | dp->dp_scrub_pausing = B_TRUE; | |
324 | dp->dp_scrub_bookmark = *zb; | |
325 | return (B_TRUE); | |
326 | } | |
327 | return (B_FALSE); | |
328 | } | |
329 | ||
330 | typedef struct zil_traverse_arg { | |
331 | dsl_pool_t *zta_dp; | |
332 | zil_header_t *zta_zh; | |
333 | } zil_traverse_arg_t; | |
334 | ||
335 | /* ARGSUSED */ | |
336 | static void | |
337 | traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) | |
338 | { | |
339 | zil_traverse_arg_t *zta = arg; | |
340 | dsl_pool_t *dp = zta->zta_dp; | |
341 | zil_header_t *zh = zta->zta_zh; | |
342 | zbookmark_t zb; | |
343 | ||
344 | if (bp->blk_birth <= dp->dp_scrub_min_txg) | |
345 | return; | |
346 | ||
347 | if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) | |
348 | return; | |
349 | ||
350 | zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET]; | |
351 | zb.zb_object = 0; | |
352 | zb.zb_level = -1; | |
353 | zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; | |
354 | VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); | |
355 | } | |
356 | ||
357 | /* ARGSUSED */ | |
358 | static void | |
359 | traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) | |
360 | { | |
361 | if (lrc->lrc_txtype == TX_WRITE) { | |
362 | zil_traverse_arg_t *zta = arg; | |
363 | dsl_pool_t *dp = zta->zta_dp; | |
364 | zil_header_t *zh = zta->zta_zh; | |
365 | lr_write_t *lr = (lr_write_t *)lrc; | |
366 | blkptr_t *bp = &lr->lr_blkptr; | |
367 | zbookmark_t zb; | |
368 | ||
369 | if (bp->blk_birth <= dp->dp_scrub_min_txg) | |
370 | return; | |
371 | ||
372 | if (claim_txg == 0 || bp->blk_birth < claim_txg) | |
373 | return; | |
374 | ||
375 | zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET]; | |
376 | zb.zb_object = lr->lr_foid; | |
377 | zb.zb_level = BP_GET_LEVEL(bp); | |
378 | zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); | |
379 | VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); | |
380 | } | |
381 | } | |
382 | ||
383 | static void | |
384 | traverse_zil(dsl_pool_t *dp, zil_header_t *zh) | |
385 | { | |
386 | uint64_t claim_txg = zh->zh_claim_txg; | |
387 | zil_traverse_arg_t zta = { dp, zh }; | |
388 | zilog_t *zilog; | |
389 | ||
390 | /* | |
391 | * We only want to visit blocks that have been claimed but not yet | |
392 | * replayed (or, in read-only mode, blocks that *would* be claimed). | |
393 | */ | |
fb5f0bc8 | 394 | if (claim_txg == 0 && spa_writeable(dp->dp_spa)) |
b128c09f BB |
395 | return; |
396 | ||
397 | zilog = zil_alloc(dp->dp_meta_objset, zh); | |
398 | ||
399 | (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta, | |
400 | claim_txg); | |
401 | ||
402 | zil_free(zilog); | |
403 | } | |
404 | ||
405 | static void | |
406 | scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, | |
407 | arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) | |
408 | { | |
409 | int err; | |
410 | arc_buf_t *buf = NULL; | |
411 | ||
b128c09f BB |
412 | if (bp->blk_birth <= dp->dp_scrub_min_txg) |
413 | return; | |
414 | ||
415 | if (scrub_pause(dp, zb)) | |
416 | return; | |
417 | ||
418 | if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) { | |
419 | /* | |
420 | * If we already visited this bp & everything below (in | |
421 | * a prior txg), don't bother doing it again. | |
422 | */ | |
423 | if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark)) | |
424 | return; | |
425 | ||
426 | /* | |
427 | * If we found the block we're trying to resume from, or | |
428 | * we went past it to a different object, zero it out to | |
429 | * indicate that it's OK to start checking for pausing | |
430 | * again. | |
431 | */ | |
432 | if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 || | |
433 | zb->zb_object > dp->dp_scrub_bookmark.zb_object) { | |
434 | dprintf("resuming at %llx/%llx/%llx/%llx\n", | |
435 | (longlong_t)zb->zb_objset, | |
436 | (longlong_t)zb->zb_object, | |
437 | (longlong_t)zb->zb_level, | |
438 | (longlong_t)zb->zb_blkid); | |
439 | bzero(&dp->dp_scrub_bookmark, sizeof (*zb)); | |
440 | } | |
441 | } | |
442 | ||
443 | if (BP_GET_LEVEL(bp) > 0) { | |
444 | uint32_t flags = ARC_WAIT; | |
445 | int i; | |
446 | blkptr_t *cbp; | |
447 | int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; | |
448 | ||
449 | err = arc_read(NULL, dp->dp_spa, bp, pbuf, | |
450 | arc_getbuf_func, &buf, | |
451 | ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); | |
452 | if (err) { | |
453 | mutex_enter(&dp->dp_spa->spa_scrub_lock); | |
454 | dp->dp_spa->spa_scrub_errors++; | |
455 | mutex_exit(&dp->dp_spa->spa_scrub_lock); | |
456 | return; | |
457 | } | |
458 | cbp = buf->b_data; | |
459 | ||
460 | for (i = 0; i < epb; i++, cbp++) { | |
461 | zbookmark_t czb; | |
462 | ||
463 | SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, | |
464 | zb->zb_level - 1, | |
465 | zb->zb_blkid * epb + i); | |
466 | scrub_visitbp(dp, dnp, buf, cbp, &czb); | |
467 | } | |
468 | } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { | |
469 | uint32_t flags = ARC_WAIT; | |
470 | dnode_phys_t *child_dnp; | |
471 | int i, j; | |
472 | int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; | |
473 | ||
474 | err = arc_read(NULL, dp->dp_spa, bp, pbuf, | |
475 | arc_getbuf_func, &buf, | |
476 | ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); | |
477 | if (err) { | |
478 | mutex_enter(&dp->dp_spa->spa_scrub_lock); | |
479 | dp->dp_spa->spa_scrub_errors++; | |
480 | mutex_exit(&dp->dp_spa->spa_scrub_lock); | |
481 | return; | |
482 | } | |
483 | child_dnp = buf->b_data; | |
484 | ||
485 | for (i = 0; i < epb; i++, child_dnp++) { | |
486 | for (j = 0; j < child_dnp->dn_nblkptr; j++) { | |
487 | zbookmark_t czb; | |
488 | ||
489 | SET_BOOKMARK(&czb, zb->zb_objset, | |
490 | zb->zb_blkid * epb + i, | |
491 | child_dnp->dn_nlevels - 1, j); | |
492 | scrub_visitbp(dp, child_dnp, buf, | |
493 | &child_dnp->dn_blkptr[j], &czb); | |
494 | } | |
495 | } | |
496 | } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { | |
497 | uint32_t flags = ARC_WAIT; | |
498 | objset_phys_t *osp; | |
499 | int j; | |
500 | ||
501 | err = arc_read_nolock(NULL, dp->dp_spa, bp, | |
502 | arc_getbuf_func, &buf, | |
503 | ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); | |
504 | if (err) { | |
505 | mutex_enter(&dp->dp_spa->spa_scrub_lock); | |
506 | dp->dp_spa->spa_scrub_errors++; | |
507 | mutex_exit(&dp->dp_spa->spa_scrub_lock); | |
508 | return; | |
509 | } | |
510 | ||
511 | osp = buf->b_data; | |
512 | ||
513 | traverse_zil(dp, &osp->os_zil_header); | |
514 | ||
515 | for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) { | |
516 | zbookmark_t czb; | |
517 | ||
518 | SET_BOOKMARK(&czb, zb->zb_objset, 0, | |
519 | osp->os_meta_dnode.dn_nlevels - 1, j); | |
520 | scrub_visitbp(dp, &osp->os_meta_dnode, buf, | |
521 | &osp->os_meta_dnode.dn_blkptr[j], &czb); | |
522 | } | |
523 | } | |
524 | ||
525 | (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); | |
526 | if (buf) | |
527 | (void) arc_buf_remove_ref(buf, &buf); | |
528 | } | |
529 | ||
530 | static void | |
531 | scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp) | |
532 | { | |
533 | zbookmark_t zb; | |
534 | ||
535 | SET_BOOKMARK(&zb, ds ? ds->ds_object : 0, 0, -1, 0); | |
536 | scrub_visitbp(dp, NULL, NULL, bp, &zb); | |
537 | } | |
538 | ||
539 | void | |
540 | dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) | |
541 | { | |
542 | dsl_pool_t *dp = ds->ds_dir->dd_pool; | |
543 | ||
544 | if (dp->dp_scrub_func == SCRUB_FUNC_NONE) | |
545 | return; | |
546 | ||
547 | if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { | |
548 | SET_BOOKMARK(&dp->dp_scrub_bookmark, -1, 0, 0, 0); | |
549 | } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, | |
550 | ds->ds_object, tx) != 0) { | |
551 | return; | |
552 | } | |
553 | ||
554 | if (ds->ds_phys->ds_next_snap_obj != 0) { | |
555 | VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, | |
556 | ds->ds_phys->ds_next_snap_obj, tx) == 0); | |
557 | } | |
558 | ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); | |
559 | } | |
560 | ||
561 | void | |
562 | dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) | |
563 | { | |
564 | dsl_pool_t *dp = ds->ds_dir->dd_pool; | |
565 | ||
566 | if (dp->dp_scrub_func == SCRUB_FUNC_NONE) | |
567 | return; | |
568 | ||
569 | ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); | |
570 | ||
571 | if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { | |
572 | dp->dp_scrub_bookmark.zb_objset = | |
573 | ds->ds_phys->ds_prev_snap_obj; | |
574 | } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, | |
575 | ds->ds_object, tx) == 0) { | |
576 | VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, | |
577 | ds->ds_phys->ds_prev_snap_obj, tx) == 0); | |
578 | } | |
579 | } | |
580 | ||
581 | void | |
582 | dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) | |
583 | { | |
584 | dsl_pool_t *dp = ds1->ds_dir->dd_pool; | |
585 | ||
586 | if (dp->dp_scrub_func == SCRUB_FUNC_NONE) | |
587 | return; | |
588 | ||
589 | if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) { | |
590 | dp->dp_scrub_bookmark.zb_objset = ds2->ds_object; | |
591 | } else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) { | |
592 | dp->dp_scrub_bookmark.zb_objset = ds1->ds_object; | |
593 | } | |
594 | ||
595 | if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, | |
596 | ds1->ds_object, tx) == 0) { | |
597 | int err = zap_add_int(dp->dp_meta_objset, | |
598 | dp->dp_scrub_queue_obj, ds2->ds_object, tx); | |
599 | VERIFY(err == 0 || err == EEXIST); | |
600 | if (err == EEXIST) { | |
601 | /* Both were there to begin with */ | |
602 | VERIFY(0 == zap_add_int(dp->dp_meta_objset, | |
603 | dp->dp_scrub_queue_obj, ds1->ds_object, tx)); | |
604 | } | |
605 | } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, | |
606 | ds2->ds_object, tx) == 0) { | |
607 | VERIFY(0 == zap_add_int(dp->dp_meta_objset, | |
608 | dp->dp_scrub_queue_obj, ds1->ds_object, tx)); | |
609 | } | |
610 | } | |
611 | ||
612 | struct enqueue_clones_arg { | |
613 | dmu_tx_t *tx; | |
614 | uint64_t originobj; | |
615 | }; | |
616 | ||
617 | /* ARGSUSED */ | |
618 | static int | |
619 | enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) | |
620 | { | |
621 | struct enqueue_clones_arg *eca = arg; | |
622 | dsl_dataset_t *ds; | |
623 | int err; | |
624 | dsl_pool_t *dp; | |
625 | ||
626 | err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); | |
627 | if (err) | |
628 | return (err); | |
629 | dp = ds->ds_dir->dd_pool; | |
630 | ||
631 | if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { | |
632 | while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { | |
633 | dsl_dataset_t *prev; | |
634 | err = dsl_dataset_hold_obj(dp, | |
635 | ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); | |
636 | ||
637 | dsl_dataset_rele(ds, FTAG); | |
638 | if (err) | |
639 | return (err); | |
640 | ds = prev; | |
641 | } | |
642 | VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, | |
643 | ds->ds_object, eca->tx) == 0); | |
644 | } | |
645 | dsl_dataset_rele(ds, FTAG); | |
646 | return (0); | |
647 | } | |
648 | ||
649 | static void | |
650 | scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx) | |
651 | { | |
652 | dsl_dataset_t *ds; | |
653 | uint64_t min_txg_save; | |
654 | ||
655 | VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); | |
656 | ||
657 | /* | |
658 | * Iterate over the bps in this ds. | |
659 | */ | |
660 | min_txg_save = dp->dp_scrub_min_txg; | |
661 | dp->dp_scrub_min_txg = | |
662 | MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg); | |
663 | scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp); | |
664 | dp->dp_scrub_min_txg = min_txg_save; | |
665 | ||
666 | if (dp->dp_scrub_pausing) | |
667 | goto out; | |
668 | ||
669 | /* | |
670 | * Add descendent datasets to work queue. | |
671 | */ | |
672 | if (ds->ds_phys->ds_next_snap_obj != 0) { | |
673 | VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, | |
674 | ds->ds_phys->ds_next_snap_obj, tx) == 0); | |
675 | } | |
676 | if (ds->ds_phys->ds_num_children > 1) { | |
677 | if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { | |
678 | struct enqueue_clones_arg eca; | |
679 | eca.tx = tx; | |
680 | eca.originobj = ds->ds_object; | |
681 | ||
682 | (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, | |
683 | NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); | |
684 | } else { | |
685 | VERIFY(zap_join(dp->dp_meta_objset, | |
686 | ds->ds_phys->ds_next_clones_obj, | |
687 | dp->dp_scrub_queue_obj, tx) == 0); | |
688 | } | |
689 | } | |
690 | ||
691 | out: | |
692 | dsl_dataset_rele(ds, FTAG); | |
693 | } | |
694 | ||
695 | /* ARGSUSED */ | |
696 | static int | |
697 | enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) | |
698 | { | |
699 | dmu_tx_t *tx = arg; | |
700 | dsl_dataset_t *ds; | |
701 | int err; | |
702 | dsl_pool_t *dp; | |
703 | ||
704 | err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); | |
705 | if (err) | |
706 | return (err); | |
707 | ||
708 | dp = ds->ds_dir->dd_pool; | |
709 | ||
710 | while (ds->ds_phys->ds_prev_snap_obj != 0) { | |
711 | dsl_dataset_t *prev; | |
712 | err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, | |
713 | FTAG, &prev); | |
714 | if (err) { | |
715 | dsl_dataset_rele(ds, FTAG); | |
716 | return (err); | |
717 | } | |
718 | ||
719 | /* | |
720 | * If this is a clone, we don't need to worry about it for now. | |
721 | */ | |
722 | if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { | |
723 | dsl_dataset_rele(ds, FTAG); | |
724 | dsl_dataset_rele(prev, FTAG); | |
725 | return (0); | |
726 | } | |
727 | dsl_dataset_rele(ds, FTAG); | |
728 | ds = prev; | |
729 | } | |
730 | ||
731 | VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, | |
732 | ds->ds_object, tx) == 0); | |
733 | dsl_dataset_rele(ds, FTAG); | |
734 | return (0); | |
735 | } | |
736 | ||
737 | void | |
738 | dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) | |
739 | { | |
fb5f0bc8 | 740 | spa_t *spa = dp->dp_spa; |
b128c09f BB |
741 | zap_cursor_t zc; |
742 | zap_attribute_t za; | |
743 | boolean_t complete = B_TRUE; | |
744 | ||
745 | if (dp->dp_scrub_func == SCRUB_FUNC_NONE) | |
746 | return; | |
747 | ||
fb5f0bc8 BB |
748 | /* |
749 | * If the pool is not loaded, or is trying to unload, leave it alone. | |
750 | */ | |
751 | if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa)) | |
b128c09f BB |
752 | return; |
753 | ||
754 | if (dp->dp_scrub_restart) { | |
755 | enum scrub_func func = dp->dp_scrub_func; | |
756 | dp->dp_scrub_restart = B_FALSE; | |
757 | dsl_pool_scrub_setup_sync(dp, &func, kcred, tx); | |
758 | } | |
759 | ||
fb5f0bc8 | 760 | if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { |
b128c09f BB |
761 | /* |
762 | * We must have resumed after rebooting; reset the vdev | |
763 | * stats to know that we're doing a scrub (although it | |
764 | * will think we're just starting now). | |
765 | */ | |
fb5f0bc8 | 766 | vdev_scrub_stat_update(spa->spa_root_vdev, |
b128c09f BB |
767 | dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : |
768 | POOL_SCRUB_EVERYTHING, B_FALSE); | |
769 | } | |
770 | ||
771 | dp->dp_scrub_pausing = B_FALSE; | |
772 | dp->dp_scrub_start_time = lbolt64; | |
773 | dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); | |
fb5f0bc8 | 774 | spa->spa_scrub_active = B_TRUE; |
b128c09f BB |
775 | |
776 | if (dp->dp_scrub_bookmark.zb_objset == 0) { | |
777 | /* First do the MOS & ORIGIN */ | |
778 | scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp); | |
779 | if (dp->dp_scrub_pausing) | |
780 | goto out; | |
781 | ||
fb5f0bc8 BB |
782 | if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { |
783 | VERIFY(0 == dmu_objset_find_spa(spa, | |
b128c09f BB |
784 | NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); |
785 | } else { | |
786 | scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx); | |
787 | } | |
788 | ASSERT(!dp->dp_scrub_pausing); | |
789 | } else if (dp->dp_scrub_bookmark.zb_objset != -1ULL) { | |
790 | /* | |
791 | * If we were paused, continue from here. Note if the | |
792 | * ds we were paused on was deleted, the zb_objset will | |
793 | * be -1, so we will skip this and find a new objset | |
794 | * below. | |
795 | */ | |
796 | scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx); | |
797 | if (dp->dp_scrub_pausing) | |
798 | goto out; | |
799 | } | |
800 | ||
801 | /* | |
802 | * In case we were paused right at the end of the ds, zero the | |
803 | * bookmark so we don't think that we're still trying to resume. | |
804 | */ | |
805 | bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); | |
806 | ||
807 | /* keep pulling things out of the zap-object-as-queue */ | |
808 | while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj), | |
809 | zap_cursor_retrieve(&zc, &za) == 0) { | |
810 | VERIFY(0 == zap_remove(dp->dp_meta_objset, | |
811 | dp->dp_scrub_queue_obj, za.za_name, tx)); | |
812 | scrub_visitds(dp, za.za_first_integer, tx); | |
813 | if (dp->dp_scrub_pausing) | |
814 | break; | |
815 | zap_cursor_fini(&zc); | |
816 | } | |
817 | zap_cursor_fini(&zc); | |
818 | if (dp->dp_scrub_pausing) | |
819 | goto out; | |
820 | ||
821 | /* done. */ | |
822 | ||
823 | dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx); | |
824 | return; | |
825 | out: | |
826 | VERIFY(0 == zap_update(dp->dp_meta_objset, | |
827 | DMU_POOL_DIRECTORY_OBJECT, | |
828 | DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, | |
829 | &dp->dp_scrub_bookmark, tx)); | |
830 | VERIFY(0 == zap_update(dp->dp_meta_objset, | |
831 | DMU_POOL_DIRECTORY_OBJECT, | |
832 | DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, | |
fb5f0bc8 | 833 | &spa->spa_scrub_errors, tx)); |
b128c09f BB |
834 | |
835 | /* XXX this is scrub-clean specific */ | |
fb5f0bc8 BB |
836 | mutex_enter(&spa->spa_scrub_lock); |
837 | while (spa->spa_scrub_inflight > 0) | |
838 | cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); | |
839 | mutex_exit(&spa->spa_scrub_lock); | |
b128c09f BB |
840 | } |
841 | ||
842 | void | |
843 | dsl_pool_scrub_restart(dsl_pool_t *dp) | |
844 | { | |
845 | mutex_enter(&dp->dp_scrub_cancel_lock); | |
846 | dp->dp_scrub_restart = B_TRUE; | |
847 | mutex_exit(&dp->dp_scrub_cancel_lock); | |
848 | } | |
849 | ||
850 | /* | |
851 | * scrub consumers | |
852 | */ | |
853 | ||
854 | static void | |
855 | count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) | |
856 | { | |
857 | int i; | |
858 | ||
859 | /* | |
860 | * If we resume after a reboot, zab will be NULL; don't record | |
861 | * incomplete stats in that case. | |
862 | */ | |
863 | if (zab == NULL) | |
864 | return; | |
865 | ||
866 | for (i = 0; i < 4; i++) { | |
867 | int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; | |
868 | int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; | |
869 | zfs_blkstat_t *zb = &zab->zab_type[l][t]; | |
870 | int equal; | |
871 | ||
872 | zb->zb_count++; | |
873 | zb->zb_asize += BP_GET_ASIZE(bp); | |
874 | zb->zb_lsize += BP_GET_LSIZE(bp); | |
875 | zb->zb_psize += BP_GET_PSIZE(bp); | |
876 | zb->zb_gangs += BP_COUNT_GANG(bp); | |
877 | ||
878 | switch (BP_GET_NDVAS(bp)) { | |
879 | case 2: | |
880 | if (DVA_GET_VDEV(&bp->blk_dva[0]) == | |
881 | DVA_GET_VDEV(&bp->blk_dva[1])) | |
882 | zb->zb_ditto_2_of_2_samevdev++; | |
883 | break; | |
884 | case 3: | |
885 | equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == | |
886 | DVA_GET_VDEV(&bp->blk_dva[1])) + | |
887 | (DVA_GET_VDEV(&bp->blk_dva[0]) == | |
888 | DVA_GET_VDEV(&bp->blk_dva[2])) + | |
889 | (DVA_GET_VDEV(&bp->blk_dva[1]) == | |
890 | DVA_GET_VDEV(&bp->blk_dva[2])); | |
891 | if (equal == 1) | |
892 | zb->zb_ditto_2_of_3_samevdev++; | |
893 | else if (equal == 3) | |
894 | zb->zb_ditto_3_of_3_samevdev++; | |
895 | break; | |
896 | } | |
897 | } | |
898 | } | |
899 | ||
900 | static void | |
901 | dsl_pool_scrub_clean_done(zio_t *zio) | |
902 | { | |
903 | spa_t *spa = zio->io_spa; | |
904 | ||
905 | zio_data_buf_free(zio->io_data, zio->io_size); | |
906 | ||
907 | mutex_enter(&spa->spa_scrub_lock); | |
908 | spa->spa_scrub_inflight--; | |
909 | cv_broadcast(&spa->spa_scrub_io_cv); | |
910 | ||
911 | if (zio->io_error && (zio->io_error != ECKSUM || | |
912 | !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) | |
913 | spa->spa_scrub_errors++; | |
914 | mutex_exit(&spa->spa_scrub_lock); | |
915 | } | |
916 | ||
917 | static int | |
918 | dsl_pool_scrub_clean_cb(dsl_pool_t *dp, | |
919 | const blkptr_t *bp, const zbookmark_t *zb) | |
920 | { | |
fb5f0bc8 | 921 | size_t size = BP_GET_PSIZE(bp); |
b128c09f BB |
922 | spa_t *spa = dp->dp_spa; |
923 | boolean_t needs_io; | |
fb5f0bc8 | 924 | int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; |
b128c09f BB |
925 | int zio_priority; |
926 | ||
fb5f0bc8 BB |
927 | ASSERT(bp->blk_birth > dp->dp_scrub_min_txg); |
928 | ||
929 | if (bp->blk_birth >= dp->dp_scrub_max_txg) | |
930 | return (0); | |
931 | ||
b128c09f BB |
932 | count_block(dp->dp_blkstats, bp); |
933 | ||
934 | if (dp->dp_scrub_isresilver == 0) { | |
935 | /* It's a scrub */ | |
936 | zio_flags |= ZIO_FLAG_SCRUB; | |
937 | zio_priority = ZIO_PRIORITY_SCRUB; | |
938 | needs_io = B_TRUE; | |
939 | } else { | |
940 | /* It's a resilver */ | |
941 | zio_flags |= ZIO_FLAG_RESILVER; | |
942 | zio_priority = ZIO_PRIORITY_RESILVER; | |
943 | needs_io = B_FALSE; | |
944 | } | |
945 | ||
946 | /* If it's an intent log block, failure is expected. */ | |
947 | if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) | |
948 | zio_flags |= ZIO_FLAG_SPECULATIVE; | |
949 | ||
fb5f0bc8 | 950 | for (int d = 0; d < BP_GET_NDVAS(bp); d++) { |
b128c09f BB |
951 | vdev_t *vd = vdev_lookup_top(spa, |
952 | DVA_GET_VDEV(&bp->blk_dva[d])); | |
953 | ||
954 | /* | |
955 | * Keep track of how much data we've examined so that | |
956 | * zpool(1M) status can make useful progress reports. | |
957 | */ | |
958 | mutex_enter(&vd->vdev_stat_lock); | |
959 | vd->vdev_stat.vs_scrub_examined += | |
960 | DVA_GET_ASIZE(&bp->blk_dva[d]); | |
961 | mutex_exit(&vd->vdev_stat_lock); | |
962 | ||
963 | /* if it's a resilver, this may not be in the target range */ | |
964 | if (!needs_io) { | |
965 | if (DVA_GET_GANG(&bp->blk_dva[d])) { | |
966 | /* | |
967 | * Gang members may be spread across multiple | |
fb5f0bc8 BB |
968 | * vdevs, so the best estimate we have is the |
969 | * scrub range, which has already been checked. | |
b128c09f | 970 | * XXX -- it would be better to change our |
fb5f0bc8 BB |
971 | * allocation policy to ensure that all |
972 | * gang members reside on the same vdev. | |
b128c09f | 973 | */ |
fb5f0bc8 BB |
974 | needs_io = B_TRUE; |
975 | } else { | |
976 | needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, | |
977 | bp->blk_birth, 1); | |
b128c09f | 978 | } |
b128c09f BB |
979 | } |
980 | } | |
981 | ||
982 | if (needs_io && !zfs_no_scrub_io) { | |
983 | void *data = zio_data_buf_alloc(size); | |
984 | ||
985 | mutex_enter(&spa->spa_scrub_lock); | |
986 | while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) | |
987 | cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); | |
988 | spa->spa_scrub_inflight++; | |
989 | mutex_exit(&spa->spa_scrub_lock); | |
990 | ||
991 | zio_nowait(zio_read(NULL, spa, bp, data, size, | |
992 | dsl_pool_scrub_clean_done, NULL, zio_priority, | |
993 | zio_flags, zb)); | |
994 | } | |
995 | ||
996 | /* do not relocate this block */ | |
997 | return (0); | |
998 | } | |
999 | ||
1000 | int | |
1001 | dsl_pool_scrub_clean(dsl_pool_t *dp) | |
1002 | { | |
1003 | /* | |
1004 | * Purge all vdev caches. We do this here rather than in sync | |
1005 | * context because this requires a writer lock on the spa_config | |
1006 | * lock, which we can't do from sync context. The | |
1007 | * spa_scrub_reopen flag indicates that vdev_open() should not | |
1008 | * attempt to start another scrub. | |
1009 | */ | |
1010 | spa_config_enter(dp->dp_spa, SCL_ALL, FTAG, RW_WRITER); | |
1011 | dp->dp_spa->spa_scrub_reopen = B_TRUE; | |
1012 | vdev_reopen(dp->dp_spa->spa_root_vdev); | |
1013 | dp->dp_spa->spa_scrub_reopen = B_FALSE; | |
1014 | spa_config_exit(dp->dp_spa, SCL_ALL, FTAG); | |
1015 | ||
1016 | return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN)); | |
1017 | } |