1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25#include <sys/dsl_scan.h>
26#include <sys/dsl_pool.h>
27#include <sys/dsl_dataset.h>
28#include <sys/dsl_prop.h>
29#include <sys/dsl_dir.h>
30#include <sys/dsl_synctask.h>
31#include <sys/dnode.h>
32#include <sys/dmu_tx.h>
33#include <sys/dmu_objset.h>
34#include <sys/arc.h>
35#include <sys/zap.h>
36#include <sys/zio.h>
37#include <sys/zfs_context.h>
38#include <sys/fs/zfs.h>
39#include <sys/zfs_znode.h>
40#include <sys/spa_impl.h>
41#include <sys/vdev_impl.h>
42#include <sys/zil_impl.h>
43#include <sys/zio_checksum.h>
44#include <sys/ddt.h>
45#include <sys/sa.h>
46#include <sys/sa_impl.h>
47#ifdef _KERNEL
48#include <sys/zfs_vfsops.h>
49#endif
50
51typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
52
53static scan_cb_t dsl_scan_scrub_cb;
54static dsl_syncfunc_t dsl_scan_cancel_sync;
55static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
56
57int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */
58int zfs_resilver_delay = 2; /* number of ticks to delay resilver */
59int zfs_scrub_delay = 4; /* number of ticks to delay scrub */
60int zfs_scan_idle = 50; /* idle window in clock ticks */
61
62int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
63int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
64int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
65int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
66int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */
67enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
68int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
69
70#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
71 ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
72 (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
73
74/* the order has to match pool_scan_type */
75static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
76 NULL,
77 dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */
78 dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */
79};
80
81int
82dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
83{
84 int err;
85 dsl_scan_t *scn;
86 spa_t *spa = dp->dp_spa;
87 uint64_t f;
88
89 scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
90 scn->scn_dp = dp;
91
92 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
93 "scrub_func", sizeof (uint64_t), 1, &f);
94 if (err == 0) {
95 /*
96 * There was an old-style scrub in progress. Restart a
97 * new-style scrub from the beginning.
98 */
99 scn->scn_restart_txg = txg;
100 zfs_dbgmsg("old-style scrub was in progress; "
101 "restarting new-style scrub in txg %llu",
102 scn->scn_restart_txg);
103
104 /*
105 * Load the queue obj from the old location so that it
106 * can be freed by dsl_scan_done().
107 */
108 (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
109 "scrub_queue", sizeof (uint64_t), 1,
110 &scn->scn_phys.scn_queue_obj);
111 } else {
112 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
113 DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
114 &scn->scn_phys);
115 if (err == ENOENT)
116 return (0);
117 else if (err)
118 return (err);
119
120 if (scn->scn_phys.scn_state == DSS_SCANNING &&
121 spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
122 /*
123 * A new-type scrub was in progress on an old
124 * pool, and the pool was accessed by old
125 * software. Restart from the beginning, since
126 * the old software may have changed the pool in
127 * the meantime.
128 */
129 scn->scn_restart_txg = txg;
130 zfs_dbgmsg("new-style scrub was modified "
131 "by old software; restarting in txg %llu",
132 scn->scn_restart_txg);
133 }
134 }
135
136 spa_scan_stat_init(spa);
137 return (0);
138}
139
140void
141dsl_scan_fini(dsl_pool_t *dp)
142{
143 if (dp->dp_scan) {
144 kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
145 dp->dp_scan = NULL;
146 }
147}
148
149/* ARGSUSED */
150static int
151dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx)
152{
153 dsl_scan_t *scn = arg1;
154
155 if (scn->scn_phys.scn_state == DSS_SCANNING)
156 return (EBUSY);
157
158 return (0);
159}
160
161/* ARGSUSED */
162static void
163dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx)
164{
165 dsl_scan_t *scn = arg1;
166 pool_scan_func_t *funcp = arg2;
167 dmu_object_type_t ot = 0;
168 dsl_pool_t *dp = scn->scn_dp;
169 spa_t *spa = dp->dp_spa;
170
171 ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
172 ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
173 bzero(&scn->scn_phys, sizeof (scn->scn_phys));
174 scn->scn_phys.scn_func = *funcp;
175 scn->scn_phys.scn_state = DSS_SCANNING;
176 scn->scn_phys.scn_min_txg = 0;
177 scn->scn_phys.scn_max_txg = tx->tx_txg;
178 scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
179 scn->scn_phys.scn_start_time = gethrestime_sec();
180 scn->scn_phys.scn_errors = 0;
181 scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
182 scn->scn_restart_txg = 0;
183 spa_scan_stat_init(spa);
184
185 if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
186 scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
187
188 /* rewrite all disk labels */
189 vdev_config_dirty(spa->spa_root_vdev);
190
191 if (vdev_resilver_needed(spa->spa_root_vdev,
192 &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
193 spa_event_notify(spa, NULL, FM_EREPORT_ZFS_RESILVER_START);
194 } else {
195 spa_event_notify(spa, NULL, FM_EREPORT_ZFS_SCRUB_START);
196 }
197
198 spa->spa_scrub_started = B_TRUE;
199 /*
200 * If this is an incremental scrub, limit the DDT scrub phase
201 * to just the auto-ditto class (for correctness); the rest
202 * of the scrub should go faster using top-down pruning.
203 */
204 if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
205 scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
206
207 }
208
209 /* back to the generic stuff */
210
211 if (dp->dp_blkstats == NULL) {
212 dp->dp_blkstats = kmem_alloc(sizeof (zfs_all_blkstats_t),
213 KM_PUSHPAGE | KM_NODEBUG);
214 }
215 bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
216
217 if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
218 ot = DMU_OT_ZAP_OTHER;
219
220 scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
221 ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
222
223 dsl_scan_sync_state(scn, tx);
224
225 spa_history_log_internal(LOG_POOL_SCAN, spa, tx,
226 "func=%u mintxg=%llu maxtxg=%llu",
227 *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
228}
229
230/* ARGSUSED */
231static void
232dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
233{
234 static const char *old_names[] = {
235 "scrub_bookmark",
236 "scrub_ddt_bookmark",
237 "scrub_ddt_class_max",
238 "scrub_queue",
239 "scrub_min_txg",
240 "scrub_max_txg",
241 "scrub_func",
242 "scrub_errors",
243 NULL
244 };
245
246 dsl_pool_t *dp = scn->scn_dp;
247 spa_t *spa = dp->dp_spa;
248 int i;
249
250 /* Remove any remnants of an old-style scrub. */
251 for (i = 0; old_names[i]; i++) {
252 (void) zap_remove(dp->dp_meta_objset,
253 DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
254 }
255
256 if (scn->scn_phys.scn_queue_obj != 0) {
257 VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
258 scn->scn_phys.scn_queue_obj, tx));
259 scn->scn_phys.scn_queue_obj = 0;
260 }
261
262 /*
263 * If we were "restarted" from a stopped state, don't bother
264 * with anything else.
265 */
266 if (scn->scn_phys.scn_state != DSS_SCANNING)
267 return;
268
269 if (complete)
270 scn->scn_phys.scn_state = DSS_FINISHED;
271 else
272 scn->scn_phys.scn_state = DSS_CANCELED;
273
274 spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx,
275 "complete=%u", complete);
276
277 if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
278 mutex_enter(&spa->spa_scrub_lock);
279 while (spa->spa_scrub_inflight > 0) {
280 cv_wait(&spa->spa_scrub_io_cv,
281 &spa->spa_scrub_lock);
282 }
283 mutex_exit(&spa->spa_scrub_lock);
284 spa->spa_scrub_started = B_FALSE;
285 spa->spa_scrub_active = B_FALSE;
286
287 /*
288 * If the scrub/resilver completed, update all DTLs to
289 * reflect this. Whether it succeeded or not, vacate
290 * all temporary scrub DTLs.
291 */
292 vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
293 complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
294 if (complete) {
295 spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
296 FM_EREPORT_ZFS_RESILVER_FINISH :
297 FM_EREPORT_ZFS_SCRUB_FINISH);
298 }
299 spa_errlog_rotate(spa);
300
301 /*
302 * We may have finished replacing a device.
303 * Let the async thread assess this and handle the detach.
304 */
305 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
306 }
307
308 scn->scn_phys.scn_end_time = gethrestime_sec();
309}
310
311/* ARGSUSED */
312static int
313dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx)
314{
315 dsl_scan_t *scn = arg1;
316
317 if (scn->scn_phys.scn_state != DSS_SCANNING)
318 return (ENOENT);
319 return (0);
320}
321
322/* ARGSUSED */
323static void
324dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx)
325{
326 dsl_scan_t *scn = arg1;
327
328 dsl_scan_done(scn, B_FALSE, tx);
329 dsl_scan_sync_state(scn, tx);
330}
331
332int
333dsl_scan_cancel(dsl_pool_t *dp)
334{
335 boolean_t complete = B_FALSE;
336 int err;
337
338 err = dsl_sync_task_do(dp, dsl_scan_cancel_check,
339 dsl_scan_cancel_sync, dp->dp_scan, &complete, 3);
340 return (err);
341}
342
343static void dsl_scan_visitbp(blkptr_t *bp,
344 const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf,
345 dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
346 dmu_tx_t *tx);
347inline __attribute__((always_inline)) static void dsl_scan_visitdnode(
348 dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
349 dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx);
350
351void
352dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
353{
354 zio_free(dp->dp_spa, txg, bp);
355}
356
357void
358dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
359{
360 ASSERT(dsl_pool_sync_context(dp));
361 zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
362}
363
364int
365dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
366 arc_done_func_t *done, void *private, int priority, int zio_flags,
367 uint32_t *arc_flags, const zbookmark_t *zb)
368{
369 return (arc_read(pio, spa, bpp, pbuf, done, private,
370 priority, zio_flags, arc_flags, zb));
371}
372
373int
374dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
375 arc_done_func_t *done, void *private, int priority, int zio_flags,
376 uint32_t *arc_flags, const zbookmark_t *zb)
377{
378 return (arc_read_nolock(pio, spa, bpp, done, private,
379 priority, zio_flags, arc_flags, zb));
380}
381
382static boolean_t
383bookmark_is_zero(const zbookmark_t *zb)
384{
385 return (zb->zb_objset == 0 && zb->zb_object == 0 &&
386 zb->zb_level == 0 && zb->zb_blkid == 0);
387}
388
389/* dnp is the dnode for zb1->zb_object */
390static boolean_t
391bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
392 const zbookmark_t *zb2)
393{
394 uint64_t zb1nextL0, zb2thisobj;
395
396 ASSERT(zb1->zb_objset == zb2->zb_objset);
397 ASSERT(zb2->zb_level == 0);
398
399 /*
400 * A bookmark in the deadlist is considered to be after
401 * everything else.
402 */
403 if (zb2->zb_object == DMU_DEADLIST_OBJECT)
404 return (B_TRUE);
405
406 /* The objset_phys_t isn't before anything. */
407 if (dnp == NULL)
408 return (B_FALSE);
409
410 zb1nextL0 = (zb1->zb_blkid + 1) <<
411 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
412
413 zb2thisobj = zb2->zb_object ? zb2->zb_object :
414 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
415
416 if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
417 uint64_t nextobj = zb1nextL0 *
418 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
419 return (nextobj <= zb2thisobj);
420 }
421
422 if (zb1->zb_object < zb2thisobj)
423 return (B_TRUE);
424 if (zb1->zb_object > zb2thisobj)
425 return (B_FALSE);
426 if (zb2->zb_object == DMU_META_DNODE_OBJECT)
427 return (B_FALSE);
428 return (zb1nextL0 <= zb2->zb_blkid);
429}
430
431static uint64_t
432dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
433{
434 uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
435 if (dsl_dataset_is_snapshot(ds))
436 return (MIN(smt, ds->ds_phys->ds_creation_txg));
437 return (smt);
438}
439
440static void
441dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
442{
443 VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset,
444 DMU_POOL_DIRECTORY_OBJECT,
445 DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
446 &scn->scn_phys, tx));
447}
448
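/*
 * Decide whether the scan should pause at the given bookmark.  We pause
 * once this sync pass has run for more than zfs_txg_timeout seconds, once
 * the per-txg minimum scan time has elapsed while a txg sync is waiting
 * on us, or when the pool is shutting down.
 */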
449static boolean_t
450dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
451{
452 uint64_t elapsed_nanosecs;
453 int mintime;
454
455 /* we never skip user/group accounting objects */
456 if (zb && (int64_t)zb->zb_object < 0)
457 return (B_FALSE);
458
459 if (scn->scn_pausing)
460 return (B_TRUE); /* we're already pausing */
461
462 if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
463 return (B_FALSE); /* we're resuming */
464
465 /* We only know how to resume from level-0 blocks. */
466 if (zb && zb->zb_level != 0)
467 return (B_FALSE);
468
469 mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
470 zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
471 elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
472 if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
473 (elapsed_nanosecs / MICROSEC > mintime &&
474 txg_sync_waiting(scn->scn_dp)) ||
475 spa_shutting_down(scn->scn_dp->dp_spa)) {
476 if (zb) {
477 dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
478 (longlong_t)zb->zb_objset,
479 (longlong_t)zb->zb_object,
480 (longlong_t)zb->zb_level,
481 (longlong_t)zb->zb_blkid);
482 scn->scn_phys.scn_bookmark = *zb;
483 }
484 dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
485 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
486 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
487 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
488 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
489 scn->scn_pausing = B_TRUE;
490 return (B_TRUE);
491 }
492 return (B_FALSE);
493}
494
495typedef struct zil_scan_arg {
496 dsl_pool_t *zsa_dp;
497 zil_header_t *zsa_zh;
498} zil_scan_arg_t;
499
500/* ARGSUSED */
501static int
502dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
503{
504 zil_scan_arg_t *zsa = arg;
505 dsl_pool_t *dp = zsa->zsa_dp;
506 dsl_scan_t *scn = dp->dp_scan;
507 zil_header_t *zh = zsa->zsa_zh;
508 zbookmark_t zb;
509
510 if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
511 return (0);
512
513 /*
514 * One block ("stubby") can be allocated a long time ago; we
515 * want to visit that one because it has been allocated
516 * (on-disk) even if it hasn't been claimed (even though for
517 * scrub there's nothing to do to it).
518 */
519 if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
520 return (0);
521
522 SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
523 ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
524
525 VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
526 return (0);
527}
528
529/* ARGSUSED */
530static int
531dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
532{
533 if (lrc->lrc_txtype == TX_WRITE) {
534 zil_scan_arg_t *zsa = arg;
535 dsl_pool_t *dp = zsa->zsa_dp;
536 dsl_scan_t *scn = dp->dp_scan;
537 zil_header_t *zh = zsa->zsa_zh;
538 lr_write_t *lr = (lr_write_t *)lrc;
539 blkptr_t *bp = &lr->lr_blkptr;
540 zbookmark_t zb;
541
542 if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
543 return (0);
544
545 /*
546 * birth can be < claim_txg if this record's txg is
547 * already txg sync'ed (but this log block contains
548 * other records that are not synced)
549 */
550 if (claim_txg == 0 || bp->blk_birth < claim_txg)
551 return (0);
552
553 SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
554 lr->lr_foid, ZB_ZIL_LEVEL,
555 lr->lr_offset / BP_GET_LSIZE(bp));
556
557 VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
558 }
559 return (0);
560}
561
562static void
563dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
564{
565 uint64_t claim_txg = zh->zh_claim_txg;
566 zil_scan_arg_t zsa = { dp, zh };
567 zilog_t *zilog;
568
569 /*
570 * We only want to visit blocks that have been claimed but not yet
571 * replayed (or, in read-only mode, blocks that *would* be claimed).
572 */
573 if (claim_txg == 0 && spa_writeable(dp->dp_spa))
574 return;
575
576 zilog = zil_alloc(dp->dp_meta_objset, zh);
577
578 (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
579 claim_txg);
580
581 zil_free(zilog);
582}
583
584/* ARGSUSED */
585static void
586dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
587 uint64_t objset, uint64_t object, uint64_t blkid)
588{
589 zbookmark_t czb;
590 uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
591
592 if (zfs_no_scrub_prefetch)
593 return;
594
595 if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
596 (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
597 return;
598
599 SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
600
601 /*
602 * XXX need to make sure all of these arc_read() prefetches are
603 * done before setting xlateall (similar to dsl_read())
604 */
605 (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
606 buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
607 ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
608}
609
610static boolean_t
611dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
612 const zbookmark_t *zb)
613{
614 /*
615 * We never skip over user/group accounting objects (obj<0)
616 */
617 if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
618 (int64_t)zb->zb_object >= 0) {
619 /*
620 * If we already visited this bp & everything below (in
621 * a prior txg sync), don't bother doing it again.
622 */
623 if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
624 return (B_TRUE);
625
626 /*
627 * If we found the block we're trying to resume from, or
628 * we went past it to a different object, zero it out to
629 * indicate that it's OK to start checking for pausing
630 * again.
631 */
632 if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
633 zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
634 dprintf("resuming at %llx/%llx/%llx/%llx\n",
635 (longlong_t)zb->zb_objset,
636 (longlong_t)zb->zb_object,
637 (longlong_t)zb->zb_level,
638 (longlong_t)zb->zb_blkid);
639 bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
640 }
641 }
642 return (B_FALSE);
643}
644
645/*
646 * Return nonzero on i/o error.
647 * Return new buf to write out in *bufp.
648 */
649inline __attribute__((always_inline)) static int
650dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
651 dnode_phys_t *dnp, const blkptr_t *bp,
652 const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
653{
654 dsl_pool_t *dp = scn->scn_dp;
655 int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
656 int err;
657
658 if (BP_GET_LEVEL(bp) > 0) {
659 uint32_t flags = ARC_WAIT;
660 int i;
661 blkptr_t *cbp;
662 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
663
664 err = arc_read_nolock(NULL, dp->dp_spa, bp,
665 arc_getbuf_func, bufp,
666 ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
667 if (err) {
668 scn->scn_phys.scn_errors++;
669 return (err);
670 }
671 for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
672 dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset,
673 zb->zb_object, zb->zb_blkid * epb + i);
674 }
675 for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
676 zbookmark_t czb;
677
678 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
679 zb->zb_level - 1,
680 zb->zb_blkid * epb + i);
681 dsl_scan_visitbp(cbp, &czb, dnp,
682 *bufp, ds, scn, ostype, tx);
683 }
684 } else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
685 uint32_t flags = ARC_WAIT;
686
687 err = arc_read_nolock(NULL, dp->dp_spa, bp,
688 arc_getbuf_func, bufp,
689 ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
690 if (err) {
691 scn->scn_phys.scn_errors++;
692 return (err);
693 }
694 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
695 uint32_t flags = ARC_WAIT;
696 dnode_phys_t *cdnp;
697 int i, j;
698 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
699
700 err = arc_read_nolock(NULL, dp->dp_spa, bp,
701 arc_getbuf_func, bufp,
702 ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
703 if (err) {
704 scn->scn_phys.scn_errors++;
705 return (err);
706 }
707 for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
708 for (j = 0; j < cdnp->dn_nblkptr; j++) {
709 blkptr_t *cbp = &cdnp->dn_blkptr[j];
710 dsl_scan_prefetch(scn, *bufp, cbp,
711 zb->zb_objset, zb->zb_blkid * epb + i, j);
712 }
713 }
714 for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
715 dsl_scan_visitdnode(scn, ds, ostype,
716 cdnp, *bufp, zb->zb_blkid * epb + i, tx);
717 }
718
719 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
720 uint32_t flags = ARC_WAIT;
721 objset_phys_t *osp;
722
723 err = arc_read_nolock(NULL, dp->dp_spa, bp,
724 arc_getbuf_func, bufp,
725 ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
726 if (err) {
727 scn->scn_phys.scn_errors++;
728 return (err);
729 }
730
731 osp = (*bufp)->b_data;
732
733 dsl_scan_visitdnode(scn, ds, osp->os_type,
734 &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx);
735
736 if (OBJSET_BUF_HAS_USERUSED(*bufp)) {
737 /*
738 * We also always visit user/group accounting
739 * objects, and never skip them, even if we are
740 * pausing. This is necessary so that the space
741 * deltas from this txg get integrated.
742 */
743 dsl_scan_visitdnode(scn, ds, osp->os_type,
744 &osp->os_groupused_dnode, *bufp,
745 DMU_GROUPUSED_OBJECT, tx);
746 dsl_scan_visitdnode(scn, ds, osp->os_type,
747 &osp->os_userused_dnode, *bufp,
748 DMU_USERUSED_OBJECT, tx);
749 }
750 }
751
752 return (0);
753}
754
755inline __attribute__((always_inline)) static void
756dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
757 dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf,
758 uint64_t object, dmu_tx_t *tx)
759{
760 int j;
761
762 for (j = 0; j < dnp->dn_nblkptr; j++) {
763 zbookmark_t czb;
764
765 SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
766 dnp->dn_nlevels - 1, j);
767 dsl_scan_visitbp(&dnp->dn_blkptr[j],
768 &czb, dnp, buf, ds, scn, ostype, tx);
769 }
770
771 if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
772 zbookmark_t czb;
773 SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
774 0, DMU_SPILL_BLKID);
775 dsl_scan_visitbp(&dnp->dn_spill,
776 &czb, dnp, buf, ds, scn, ostype, tx);
777 }
778}
779
780/*
781 * The arguments are in this order because mdb can only print the
782 * first 5; we want them to be useful.
783 */
784static void
785dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
786 dnode_phys_t *dnp, arc_buf_t *pbuf,
787 dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
788 dmu_tx_t *tx)
789{
790 dsl_pool_t *dp = scn->scn_dp;
791 arc_buf_t *buf = NULL;
792 blkptr_t *bp_toread;
793
794 bp_toread = kmem_alloc(sizeof (blkptr_t), KM_PUSHPAGE);
795 *bp_toread = *bp;
796
797 /* ASSERT(pbuf == NULL || arc_released(pbuf)); */
798
799 if (dsl_scan_check_pause(scn, zb))
800 goto out;
801
802 if (dsl_scan_check_resume(scn, dnp, zb))
803 goto out;
804
805 if (bp->blk_birth == 0)
806 goto out;
807
808 scn->scn_visited_this_txg++;
809
810 /*
811 * This debugging is commented out to conserve stack space. This
812 * function is called recursively and the debugging adds several
813 * bytes to the stack for each call. It can be commented back in
814 * if required to debug an issue in dsl_scan_visitbp().
815 *
816 * dprintf_bp(bp,
817 * "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
818 * ds, ds ? ds->ds_object : 0,
819 * zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
820 * pbuf, bp);
821 */
822
823 if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
824 goto out;
825
826 if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
827 /*
828 * For non-user-accounting blocks, we need to read the
829 * new bp (from a deleted snapshot, found in
830 * check_existing_xlation). If we used the old bp,
831 * pointers inside this block from before we resumed
832 * would be untranslated.
833 *
834 * For user-accounting blocks, we need to read the old
835 * bp, because we will apply the entire space delta to
836 * it (original untranslated -> translations from
837 * deleted snap -> now).
838 */
839 *bp_toread = *bp;
840 }
841
842 if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx,
843 &buf) != 0)
844 goto out;
845
846 /*
847 * If dsl_scan_ddt() has already visited this block, it will have
848 * already done any translations or scrubbing, so don't call the
849 * callback again.
850 */
851 if (ddt_class_contains(dp->dp_spa,
852 scn->scn_phys.scn_ddt_class_max, bp)) {
853 ASSERT(buf == NULL);
854 goto out;
855 }
856
857 /*
858 * If this block is from the future (after cur_max_txg), then we
859 * are doing this on behalf of a deleted snapshot, and we will
860 * revisit the future block on the next pass of this dataset.
861 * Don't scan it now unless we need to because something
862 * under it was modified.
863 */
864 if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) {
865 scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
866 }
867 if (buf)
868 (void) arc_buf_remove_ref(buf, &buf);
869out:
870 kmem_free(bp_toread, sizeof(blkptr_t));
871}
872
873static void
874dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
875 dmu_tx_t *tx)
876{
877 zbookmark_t zb;
878
879 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
880 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
881 dsl_scan_visitbp(bp, &zb, NULL, NULL,
882 ds, scn, DMU_OST_NONE, tx);
883
884 dprintf_ds(ds, "finished scan%s", "");
885}
886
887void
888dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
889{
890 dsl_pool_t *dp = ds->ds_dir->dd_pool;
891 dsl_scan_t *scn = dp->dp_scan;
892 uint64_t mintxg;
893
894 if (scn->scn_phys.scn_state != DSS_SCANNING)
895 return;
896
897 if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
898 if (dsl_dataset_is_snapshot(ds)) {
899 /* Note, scn_cur_{min,max}_txg stays the same. */
900 scn->scn_phys.scn_bookmark.zb_objset =
901 ds->ds_phys->ds_next_snap_obj;
902 zfs_dbgmsg("destroying ds %llu; currently traversing; "
903 "reset zb_objset to %llu",
904 (u_longlong_t)ds->ds_object,
905 (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
906 scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
907 } else {
908 SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
909 ZB_DESTROYED_OBJSET, 0, 0, 0);
910 zfs_dbgmsg("destroying ds %llu; currently traversing; "
911 "reset bookmark to -1,0,0,0",
912 (u_longlong_t)ds->ds_object);
913 }
914 } else if (zap_lookup_int_key(dp->dp_meta_objset,
915 scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
916 ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
917 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
918 scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
919 if (dsl_dataset_is_snapshot(ds)) {
920 /*
921 * We keep the same mintxg; it could be >
922 * ds_creation_txg if the previous snapshot was
923 * deleted too.
924 */
925 VERIFY(zap_add_int_key(dp->dp_meta_objset,
926 scn->scn_phys.scn_queue_obj,
927 ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
928 zfs_dbgmsg("destroying ds %llu; in queue; "
929 "replacing with %llu",
930 (u_longlong_t)ds->ds_object,
931 (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
932 } else {
933 zfs_dbgmsg("destroying ds %llu; in queue; removing",
934 (u_longlong_t)ds->ds_object);
935 }
936 } else {
937 zfs_dbgmsg("destroying ds %llu; ignoring",
938 (u_longlong_t)ds->ds_object);
939 }
940
941 /*
942 * dsl_scan_sync() should be called after this, and should sync
943 * out our changed state, but just to be safe, do it here.
944 */
945 dsl_scan_sync_state(scn, tx);
946}
947
948void
949dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
950{
951 dsl_pool_t *dp = ds->ds_dir->dd_pool;
952 dsl_scan_t *scn = dp->dp_scan;
953 uint64_t mintxg;
954
955 if (scn->scn_phys.scn_state != DSS_SCANNING)
956 return;
957
958 ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
959
960 if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
961 scn->scn_phys.scn_bookmark.zb_objset =
962 ds->ds_phys->ds_prev_snap_obj;
963 zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
964 "reset zb_objset to %llu",
965 (u_longlong_t)ds->ds_object,
966 (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
967 } else if (zap_lookup_int_key(dp->dp_meta_objset,
968 scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
969 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
970 scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
971 VERIFY(zap_add_int_key(dp->dp_meta_objset,
972 scn->scn_phys.scn_queue_obj,
973 ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
974 zfs_dbgmsg("snapshotting ds %llu; in queue; "
975 "replacing with %llu",
976 (u_longlong_t)ds->ds_object,
977 (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
978 }
979 dsl_scan_sync_state(scn, tx);
980}
981
982void
983dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
984{
985 dsl_pool_t *dp = ds1->ds_dir->dd_pool;
986 dsl_scan_t *scn = dp->dp_scan;
987 uint64_t mintxg;
988
989 if (scn->scn_phys.scn_state != DSS_SCANNING)
990 return;
991
992 if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
993 scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
994 zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
995 "reset zb_objset to %llu",
996 (u_longlong_t)ds1->ds_object,
997 (u_longlong_t)ds2->ds_object);
998 } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
999 scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
1000 zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
1001 "reset zb_objset to %llu",
1002 (u_longlong_t)ds2->ds_object,
1003 (u_longlong_t)ds1->ds_object);
1004 }
1005
1006 if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
1007 ds1->ds_object, &mintxg) == 0) {
1008 int err;
1009
1010 ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
1011 ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
1012 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
1013 scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
1014 err = zap_add_int_key(dp->dp_meta_objset,
1015 scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
1016 VERIFY(err == 0 || err == EEXIST);
1017 if (err == EEXIST) {
1018 /* Both were there to begin with */
1019 VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
1020 scn->scn_phys.scn_queue_obj,
1021 ds1->ds_object, mintxg, tx));
1022 }
1023 zfs_dbgmsg("clone_swap ds %llu; in queue; "
1024 "replacing with %llu",
1025 (u_longlong_t)ds1->ds_object,
1026 (u_longlong_t)ds2->ds_object);
1027 } else if (zap_lookup_int_key(dp->dp_meta_objset,
1028 scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
1029 ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
1030 ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
1031 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
1032 scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
1033 VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
1034 scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
1035 zfs_dbgmsg("clone_swap ds %llu; in queue; "
1036 "replacing with %llu",
1037 (u_longlong_t)ds2->ds_object,
1038 (u_longlong_t)ds1->ds_object);
1039 }
1040
1041 dsl_scan_sync_state(scn, tx);
1042}
1043
1044struct enqueue_clones_arg {
1045 dmu_tx_t *tx;
1046 uint64_t originobj;
1047};
1048
1049/* ARGSUSED */
1050static int
1051enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
1052{
1053 struct enqueue_clones_arg *eca = arg;
1054 dsl_dataset_t *ds;
1055 int err;
1056 dsl_pool_t *dp = spa->spa_dsl_pool;
1057 dsl_scan_t *scn = dp->dp_scan;
1058
1059 err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
1060 if (err)
1061 return (err);
1062
1063 if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
1064 while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
1065 dsl_dataset_t *prev;
1066 err = dsl_dataset_hold_obj(dp,
1067 ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
1068
1069 dsl_dataset_rele(ds, FTAG);
1070 if (err)
1071 return (err);
1072 ds = prev;
1073 }
1074 VERIFY(zap_add_int_key(dp->dp_meta_objset,
1075 scn->scn_phys.scn_queue_obj, ds->ds_object,
1076 ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
1077 }
1078 dsl_dataset_rele(ds, FTAG);
1079 return (0);
1080}
1081
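/*
 * Visit one dataset: traverse its ZIL (head datasets only) and its root
 * block pointer, then queue the next snapshot and any clones so they are
 * visited on a later pass.
 */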
1082static void
1083dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
1084{
1085 dsl_pool_t *dp = scn->scn_dp;
1086 dsl_dataset_t *ds;
1087 objset_t *os;
1088 char *dsname;
1089
1090 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
1091
1092 if (dmu_objset_from_ds(ds, &os))
1093 goto out;
1094
1095 /*
1096 * Only the ZIL in the head (non-snapshot) is valid. Even though
1097 * snapshots can have ZIL block pointers (which may be the same
1098 * BP as in the head), they must be ignored. So we traverse the
1099 * ZIL here, rather than in scan_recurse(), because the regular
1100 * snapshot block-sharing rules don't apply to it.
1101 */
1102 if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds))
1103 dsl_scan_zil(dp, &os->os_zil_header);
1104
1105 /*
1106 * Iterate over the bps in this ds.
1107 */
1108 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1109 dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);
1110
1111 dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_PUSHPAGE);
1112 dsl_dataset_name(ds, dsname);
1113 zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
1114 "pausing=%u",
1115 (longlong_t)dsobj, dsname,
1116 (longlong_t)scn->scn_phys.scn_cur_min_txg,
1117 (longlong_t)scn->scn_phys.scn_cur_max_txg,
1118 (int)scn->scn_pausing);
1119 kmem_free(dsname, ZFS_MAXNAMELEN);
1120
1121 if (scn->scn_pausing)
1122 goto out;
1123
1124 /*
1125 * We've finished this pass over this dataset.
1126 */
1127
1128 /*
1129 * If we did not completely visit this dataset, do another pass.
1130 */
1131 if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
1132 zfs_dbgmsg("incomplete pass; visiting again");
1133 scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
1134 VERIFY(zap_add_int_key(dp->dp_meta_objset,
1135 scn->scn_phys.scn_queue_obj, ds->ds_object,
1136 scn->scn_phys.scn_cur_max_txg, tx) == 0);
1137 goto out;
1138 }
1139
1140 /*
1141 * Add descendent datasets to work queue.
1142 */
1143 if (ds->ds_phys->ds_next_snap_obj != 0) {
1144 VERIFY(zap_add_int_key(dp->dp_meta_objset,
1145 scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
1146 ds->ds_phys->ds_creation_txg, tx) == 0);
1147 }
1148 if (ds->ds_phys->ds_num_children > 1) {
1149 boolean_t usenext = B_FALSE;
1150 if (ds->ds_phys->ds_next_clones_obj != 0) {
1151 uint64_t count;
1152 /*
1153 * A bug in a previous version of the code could
1154 * cause upgrade_clones_cb() to not set
1155 * ds_next_snap_obj when it should, leading to a
1156 * missing entry. Therefore we can only use the
1157 * next_clones_obj when its count is correct.
1158 */
1159 int err = zap_count(dp->dp_meta_objset,
1160 ds->ds_phys->ds_next_clones_obj, &count);
1161 if (err == 0 &&
1162 count == ds->ds_phys->ds_num_children - 1)
1163 usenext = B_TRUE;
1164 }
1165
1166 if (usenext) {
1167 VERIFY(zap_join_key(dp->dp_meta_objset,
1168 ds->ds_phys->ds_next_clones_obj,
1169 scn->scn_phys.scn_queue_obj,
1170 ds->ds_phys->ds_creation_txg, tx) == 0);
1171 } else {
1172 struct enqueue_clones_arg eca;
1173 eca.tx = tx;
1174 eca.originobj = ds->ds_object;
1175
1176 (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
1177 NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
1178 }
1179 }
1180
1181out:
1182 dsl_dataset_rele(ds, FTAG);
1183}
1184
1185/* ARGSUSED */
1186static int
1187enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
1188{
1189 dmu_tx_t *tx = arg;
1190 dsl_dataset_t *ds;
1191 int err;
1192 dsl_pool_t *dp = spa->spa_dsl_pool;
1193 dsl_scan_t *scn = dp->dp_scan;
1194
1195 err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
1196 if (err)
1197 return (err);
1198
1199 while (ds->ds_phys->ds_prev_snap_obj != 0) {
1200 dsl_dataset_t *prev;
1201 err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
1202 FTAG, &prev);
1203 if (err) {
1204 dsl_dataset_rele(ds, FTAG);
1205 return (err);
1206 }
1207
1208 /*
1209 * If this is a clone, we don't need to worry about it for now.
1210 */
1211 if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
1212 dsl_dataset_rele(ds, FTAG);
1213 dsl_dataset_rele(prev, FTAG);
1214 return (0);
1215 }
1216 dsl_dataset_rele(ds, FTAG);
1217 ds = prev;
1218 }
1219
1220 VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
1221 ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
1222 dsl_dataset_rele(ds, FTAG);
1223 return (0);
1224}
1225
1226/*
1227 * Scrub/dedup interaction.
1228 *
1229 * If there are N references to a deduped block, we don't want to scrub it
1230 * N times -- ideally, we should scrub it exactly once.
1231 *
1232 * We leverage the fact that the dde's replication class (enum ddt_class)
1233 * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
1234 * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
1235 *
1236 * To prevent excess scrubbing, the scrub begins by walking the DDT
1237 * to find all blocks with refcnt > 1, and scrubs each of these once.
1238 * Since there are two replication classes which contain blocks with
1239 * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
1240 * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
1241 *
1242 * There would be nothing more to say if a block's refcnt couldn't change
1243 * during a scrub, but of course it can so we must account for changes
1244 * in a block's replication class.
1245 *
1246 * Here's an example of what can occur:
1247 *
1248 * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
1249 * when visited during the top-down scrub phase, it will be scrubbed twice.
1250 * This negates our scrub optimization, but is otherwise harmless.
1251 *
1252 * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
1253 * on each visit during the top-down scrub phase, it will never be scrubbed.
1254 * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
1255 * reference class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
1256 * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
1257 * while a scrub is in progress, it scrubs the block right then.
1258 */
1259static void
1260dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
1261{
1262 ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
1263 ddt_entry_t dde;
1264 int error;
1265 uint64_t n = 0;
1266
1267 bzero(&dde, sizeof (ddt_entry_t));
1268
1269 while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
1270 ddt_t *ddt;
1271
1272 if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
1273 break;
1274 dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
1275 (longlong_t)ddb->ddb_class,
1276 (longlong_t)ddb->ddb_type,
1277 (longlong_t)ddb->ddb_checksum,
1278 (longlong_t)ddb->ddb_cursor);
1279
1280 /* There should be no pending changes to the dedup table */
1281 ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
1282 ASSERT(avl_first(&ddt->ddt_tree) == NULL);
1283
1284 dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
1285 n++;
1286
1287 if (dsl_scan_check_pause(scn, NULL))
1288 break;
1289 }
1290
1291 zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
1292 (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
1293 (int)scn->scn_pausing);
1294
1295 ASSERT(error == 0 || error == ENOENT);
1296 ASSERT(error != ENOENT ||
1297 ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
1298}
1299
1300/* ARGSUSED */
1301void
1302dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
1303 ddt_entry_t *dde, dmu_tx_t *tx)
1304{
1305 const ddt_key_t *ddk = &dde->dde_key;
1306 ddt_phys_t *ddp = dde->dde_phys;
1307 blkptr_t bp;
1308 zbookmark_t zb = { 0 };
1309 int p;
1310
1311 if (scn->scn_phys.scn_state != DSS_SCANNING)
1312 return;
1313
1314 for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1315 if (ddp->ddp_phys_birth == 0 ||
1316 ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg)
1317 continue;
1318 ddt_bp_create(checksum, ddk, ddp, &bp);
1319
1320 scn->scn_visited_this_txg++;
1321 scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
1322 }
1323}
1324
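/*
 * One scan pass for this txg: walk the DDT classes up to
 * scn_ddt_class_max, then the MOS and the origin snapshot, then keep
 * pulling datasets off the work queue until we pause or the queue is
 * empty.
 */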
1325static void
1326dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
1327{
1328 dsl_pool_t *dp = scn->scn_dp;
1329 zap_cursor_t *zc;
1330 zap_attribute_t *za;
1331
1332 if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
1333 scn->scn_phys.scn_ddt_class_max) {
1334 scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
1335 scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
1336 dsl_scan_ddt(scn, tx);
1337 if (scn->scn_pausing)
1338 return;
1339 }
1340
1341 if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
1342 /* First do the MOS & ORIGIN */
1343
1344 scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
1345 scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
1346 dsl_scan_visit_rootbp(scn, NULL,
1347 &dp->dp_meta_rootbp, tx);
1348 spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
1349 if (scn->scn_pausing)
1350 return;
1351
1352 if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
1353 VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
1354 NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
1355 } else {
1356 dsl_scan_visitds(scn,
1357 dp->dp_origin_snap->ds_object, tx);
1358 }
1359 ASSERT(!scn->scn_pausing);
1360 } else if (scn->scn_phys.scn_bookmark.zb_objset !=
1361 ZB_DESTROYED_OBJSET) {
1362 /*
1363 * If we were paused, continue from here. Note if the
1364 * ds we were paused on was deleted, the zb_objset may
1365 * be -1, so we will skip this and find a new objset
1366 * below.
1367 */
1368 dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
1369 if (scn->scn_pausing)
1370 return;
1371 }
1372
1373 /*
1374 * In case we were paused right at the end of the ds, zero the
1375 * bookmark so we don't think that we're still trying to resume.
1376 */
1377 bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));
1378 zc = kmem_alloc(sizeof(zap_cursor_t), KM_PUSHPAGE);
1379 za = kmem_alloc(sizeof(zap_attribute_t), KM_PUSHPAGE);
1380
1381 /* keep pulling things out of the zap-object-as-queue */
1382 while (zap_cursor_init(zc, dp->dp_meta_objset,
1383 scn->scn_phys.scn_queue_obj),
1384 zap_cursor_retrieve(zc, za) == 0) {
1385 dsl_dataset_t *ds;
1386 uint64_t dsobj;
1387
1388 dsobj = strtonum(za->za_name, NULL);
1389 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
1390 scn->scn_phys.scn_queue_obj, dsobj, tx));
1391
1392 /* Set up min/max txg */
1393 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
1394 if (za->za_first_integer != 0) {
1395 scn->scn_phys.scn_cur_min_txg =
1396 MAX(scn->scn_phys.scn_min_txg,
1397 za->za_first_integer);
1398 } else {
1399 scn->scn_phys.scn_cur_min_txg =
1400 MAX(scn->scn_phys.scn_min_txg,
1401 ds->ds_phys->ds_prev_snap_txg);
1402 }
1403 scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
1404 dsl_dataset_rele(ds, FTAG);
1405
1406 dsl_scan_visitds(scn, dsobj, tx);
1407 zap_cursor_fini(zc);
1408 if (scn->scn_pausing)
1409 goto out;
1410 }
1411 zap_cursor_fini(zc);
1412out:
1413 kmem_free(za, sizeof(zap_attribute_t));
1414 kmem_free(zc, sizeof(zap_cursor_t));
1415}
1416
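/*
 * bpobj_iterate() callback: free one block from the free bpobj and credit
 * the space back to the pool.  Returns ERESTART once the per-txg time
 * budget for freeing has been exhausted.
 */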
1417static int
1418dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1419{
1420 dsl_scan_t *scn = arg;
1421 uint64_t elapsed_nanosecs;
1422
1423 elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
1424
1425 if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
1426 (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
1427 txg_sync_waiting(scn->scn_dp)) ||
1428 spa_shutting_down(scn->scn_dp->dp_spa))
1429 return (ERESTART);
1430
1431 zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
1432 dmu_tx_get_txg(tx), bp, 0));
1433 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
1434 -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
1435 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
1436 scn->scn_visited_this_txg++;
1437 return (0);
1438}
1439
1440boolean_t
1441dsl_scan_active(dsl_scan_t *scn)
1442{
1443 spa_t *spa = scn->scn_dp->dp_spa;
1444 uint64_t used = 0, comp, uncomp;
1445
1446 if (spa->spa_load_state != SPA_LOAD_NONE)
1447 return (B_FALSE);
1448 if (spa_shutting_down(spa))
1449 return (B_FALSE);
1450
1451 if (scn->scn_phys.scn_state == DSS_SCANNING)
1452 return (B_TRUE);
1453
1454 if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1455 (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
1456 &used, &comp, &uncomp);
1457 }
1458 return (used != 0);
1459}
1460
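/*
 * Called once per txg from the pool sync path: process the free bpobj
 * first, then continue the scan until it completes or decides to pause.
 */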
1461void
1462dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
1463{
1464 dsl_scan_t *scn = dp->dp_scan;
1465 spa_t *spa = dp->dp_spa;
1466 int err;
1467
1468 /*
1469 * Check for scn_restart_txg before checking spa_load_state, so
1470 * that we can restart an old-style scan while the pool is being
1471 * imported (see dsl_scan_init).
1472 */
1473 if (scn->scn_restart_txg != 0 &&
1474 scn->scn_restart_txg <= tx->tx_txg) {
1475 pool_scan_func_t func = POOL_SCAN_SCRUB;
1476 dsl_scan_done(scn, B_FALSE, tx);
1477 if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
1478 func = POOL_SCAN_RESILVER;
1479 zfs_dbgmsg("restarting scan func=%u txg=%llu",
1480 func, tx->tx_txg);
1481 dsl_scan_setup_sync(scn, &func, tx);
1482 }
1483
1484 if (!dsl_scan_active(scn) ||
1485 spa_sync_pass(dp->dp_spa) > 1)
1486 return;
1487
1488 scn->scn_visited_this_txg = 0;
1489 scn->scn_pausing = B_FALSE;
1490 scn->scn_sync_start_time = gethrtime();
1491 spa->spa_scrub_active = B_TRUE;
1492
1493 /*
1494 * First process the free list. If we pause the free, don't do
1495 * any scanning. This ensures that there is no free list when
1496 * we are scanning, so the scan code doesn't have to worry about
1497 * traversing it.
1498 */
1499 if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1500 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1501 NULL, ZIO_FLAG_MUSTSUCCEED);
1502 err = bpobj_iterate(&dp->dp_free_bpobj,
1503 dsl_scan_free_cb, scn, tx);
1504 VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
1505 if (scn->scn_visited_this_txg) {
1506 zfs_dbgmsg("freed %llu blocks in %llums from "
1507 "free_bpobj txg %llu",
1508 (longlong_t)scn->scn_visited_this_txg,
1509 (longlong_t)
1510 (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
1511 (longlong_t)tx->tx_txg);
1512 scn->scn_visited_this_txg = 0;
1513 /*
1514 * Re-sync the ddt so that we can further modify
1515 * it when doing bprewrite.
1516 */
1517 ddt_sync(spa, tx->tx_txg);
1518 }
1519 if (err == ERESTART)
1520 return;
1521 }
1522
1523 if (scn->scn_phys.scn_state != DSS_SCANNING)
1524 return;
1525
1526 if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
1527 scn->scn_phys.scn_ddt_class_max) {
1528 zfs_dbgmsg("doing scan sync txg %llu; "
1529 "ddt bm=%llu/%llu/%llu/%llx",
1530 (longlong_t)tx->tx_txg,
1531 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
1532 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
1533 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
1534 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
1535 ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
1536 ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
1537 ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
1538 ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
1539 } else {
1540 zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
1541 (longlong_t)tx->tx_txg,
1542 (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
1543 (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
1544 (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
1545 (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
1546 }
1547
1548 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1549 NULL, ZIO_FLAG_CANFAIL);
1550 dsl_scan_visit(scn, tx);
1551 (void) zio_wait(scn->scn_zio_root);
1552 scn->scn_zio_root = NULL;
1553
1554 zfs_dbgmsg("visited %llu blocks in %llums",
1555 (longlong_t)scn->scn_visited_this_txg,
1556 (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
1557
1558 if (!scn->scn_pausing) {
1559 /* finished with scan. */
1560 zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg);
1561 dsl_scan_done(scn, B_TRUE, tx);
1562 }
1563
1564 if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
1565 mutex_enter(&spa->spa_scrub_lock);
1566 while (spa->spa_scrub_inflight > 0) {
1567 cv_wait(&spa->spa_scrub_io_cv,
1568 &spa->spa_scrub_lock);
1569 }
1570 mutex_exit(&spa->spa_scrub_lock);
1571 }
1572
1573 dsl_scan_sync_state(scn, tx);
1574}
1575
1576/*
1577 * This will start a new scan, or restart an existing one.
1578 */
1579void
1580dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
1581{
1582 if (txg == 0) {
1583 dmu_tx_t *tx;
1584 tx = dmu_tx_create_dd(dp->dp_mos_dir);
1585 VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
1586
1587 txg = dmu_tx_get_txg(tx);
1588 dp->dp_scan->scn_restart_txg = txg;
1589 dmu_tx_commit(tx);
1590 } else {
1591 dp->dp_scan->scn_restart_txg = txg;
1592 }
1593 zfs_dbgmsg("restarting resilver txg=%llu", txg);
1594}
1595
1596boolean_t
1597dsl_scan_resilvering(dsl_pool_t *dp)
1598{
1599 return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
1600 dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
1601}
1602
1603/*
1604 * scrub consumers
1605 */
1606
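/*
 * Accumulate per-level and per-type block statistics (counts, sizes, gang
 * blocks, and same-vdev ditto copies) for this block pointer.
 */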
1607static void
1608count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
1609{
1610 int i;
1611
1612 /*
1613 * If we resume after a reboot, zab will be NULL; don't record
1614 * incomplete stats in that case.
1615 */
1616 if (zab == NULL)
1617 return;
1618
1619 for (i = 0; i < 4; i++) {
1620 int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
1621 int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
1622 zfs_blkstat_t *zb = &zab->zab_type[l][t];
1623 int equal;
1624
1625 zb->zb_count++;
1626 zb->zb_asize += BP_GET_ASIZE(bp);
1627 zb->zb_lsize += BP_GET_LSIZE(bp);
1628 zb->zb_psize += BP_GET_PSIZE(bp);
1629 zb->zb_gangs += BP_COUNT_GANG(bp);
1630
1631 switch (BP_GET_NDVAS(bp)) {
1632 case 2:
1633 if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1634 DVA_GET_VDEV(&bp->blk_dva[1]))
1635 zb->zb_ditto_2_of_2_samevdev++;
1636 break;
1637 case 3:
1638 equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1639 DVA_GET_VDEV(&bp->blk_dva[1])) +
1640 (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1641 DVA_GET_VDEV(&bp->blk_dva[2])) +
1642 (DVA_GET_VDEV(&bp->blk_dva[1]) ==
1643 DVA_GET_VDEV(&bp->blk_dva[2]));
1644 if (equal == 1)
1645 zb->zb_ditto_2_of_3_samevdev++;
1646 else if (equal == 3)
1647 zb->zb_ditto_3_of_3_samevdev++;
1648 break;
1649 }
1650 }
1651}
1652
1653static void
1654dsl_scan_scrub_done(zio_t *zio)
1655{
1656 spa_t *spa = zio->io_spa;
1657
1658 zio_data_buf_free(zio->io_data, zio->io_size);
1659
1660 mutex_enter(&spa->spa_scrub_lock);
1661 spa->spa_scrub_inflight--;
1662 cv_broadcast(&spa->spa_scrub_io_cv);
1663
1664 if (zio->io_error && (zio->io_error != ECKSUM ||
1665 !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
1666 spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
1667 }
1668 mutex_exit(&spa->spa_scrub_lock);
1669}
1670
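/*
 * Per-block scrub/resilver callback: update the progress counters, decide
 * whether the block actually needs I/O (for resilver, only if a DVA falls
 * within a missing DTL range or is a gang block), and issue a throttled
 * scrub read.
 */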
1671static int
1672dsl_scan_scrub_cb(dsl_pool_t *dp,
1673 const blkptr_t *bp, const zbookmark_t *zb)
1674{
1675 dsl_scan_t *scn = dp->dp_scan;
1676 size_t size = BP_GET_PSIZE(bp);
1677 spa_t *spa = dp->dp_spa;
1678 uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
1679 boolean_t needs_io = B_FALSE;
1680 int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
1681 int zio_priority = 0;
1682 int scan_delay = 0;
1683 int d;
1684
1685 if (phys_birth <= scn->scn_phys.scn_min_txg ||
1686 phys_birth >= scn->scn_phys.scn_max_txg)
1687 return (0);
1688
1689 count_block(dp->dp_blkstats, bp);
1690
1691 ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
1692 if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
1693 zio_flags |= ZIO_FLAG_SCRUB;
1694 zio_priority = ZIO_PRIORITY_SCRUB;
1695 needs_io = B_TRUE;
1696 scan_delay = zfs_scrub_delay;
1697 } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
1698 zio_flags |= ZIO_FLAG_RESILVER;
1699 zio_priority = ZIO_PRIORITY_RESILVER;
1700 needs_io = B_FALSE;
1701 scan_delay = zfs_resilver_delay;
1702 }
1703
1704 /* If it's an intent log block, failure is expected. */
1705 if (zb->zb_level == ZB_ZIL_LEVEL)
1706 zio_flags |= ZIO_FLAG_SPECULATIVE;
1707
1708 for (d = 0; d < BP_GET_NDVAS(bp); d++) {
1709 vdev_t *vd = vdev_lookup_top(spa,
1710 DVA_GET_VDEV(&bp->blk_dva[d]));
1711
1712 /*
1713 * Keep track of how much data we've examined so that
1714 * zpool(1M) status can make useful progress reports.
1715 */
1716 scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
1717 spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
1718
1719 /* if it's a resilver, this may not be in the target range */
1720 if (!needs_io) {
1721 if (DVA_GET_GANG(&bp->blk_dva[d])) {
1722 /*
1723 * Gang members may be spread across multiple
1724 * vdevs, so the best estimate we have is the
1725 * scrub range, which has already been checked.
1726 * XXX -- it would be better to change our
1727 * allocation policy to ensure that all
1728 * gang members reside on the same vdev.
1729 */
1730 needs_io = B_TRUE;
1731 } else {
1732 needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
1733 phys_birth, 1);
1734 }
1735 }
1736 }
1737
1738 if (needs_io && !zfs_no_scrub_io) {
1739 vdev_t *rvd = spa->spa_root_vdev;
1740 uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight;
1741 void *data = zio_data_buf_alloc(size);
1742
1743 mutex_enter(&spa->spa_scrub_lock);
1744 while (spa->spa_scrub_inflight >= maxinflight)
1745 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
1746 spa->spa_scrub_inflight++;
1747 mutex_exit(&spa->spa_scrub_lock);
1748
1749 /*
1750 * If we're seeing recent (zfs_scan_idle) "important" I/Os
1751 * then throttle our workload to limit the impact of a scan.
1752 */
1753 if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
1754 delay(scan_delay);
1755
1756 zio_nowait(zio_read(NULL, spa, bp, data, size,
1757 dsl_scan_scrub_done, NULL, zio_priority,
1758 zio_flags, zb));
1759 }
1760
1761 /* do not relocate this block */
1762 return (0);
1763}
1764
1765int
1766dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
1767{
1768 spa_t *spa = dp->dp_spa;
1769
1770 /*
1771 * Purge all vdev caches and probe all devices. We do this here
1772 * rather than in sync context because this requires a writer lock
1773 * on the spa_config lock, which we can't do from sync context. The
1774 * spa_scrub_reopen flag indicates that vdev_open() should not
1775 * attempt to start another scrub.
1776 */
1777 spa_vdev_state_enter(spa, SCL_NONE);
1778 spa->spa_scrub_reopen = B_TRUE;
1779 vdev_reopen(spa->spa_root_vdev);
1780 spa->spa_scrub_reopen = B_FALSE;
1781 (void) spa_vdev_state_exit(spa, NULL, 0);
1782
1783 return (dsl_sync_task_do(dp, dsl_scan_setup_check,
1784 dsl_scan_setup_sync, dp->dp_scan, &func, 0));
1785}
1786
1787#if defined(_KERNEL) && defined(HAVE_SPL)
1788module_param(zfs_top_maxinflight, int, 0644);
1789MODULE_PARM_DESC(zfs_top_maxinflight, "Max I/Os per top-level");
1790
1791module_param(zfs_resilver_delay, int, 0644);
1792MODULE_PARM_DESC(zfs_resilver_delay, "Number of ticks to delay resilver");
1793
1794module_param(zfs_scrub_delay, int, 0644);
1795MODULE_PARM_DESC(zfs_scrub_delay, "Number of ticks to delay scrub");
1796
1797module_param(zfs_scan_idle, int, 0644);
1798MODULE_PARM_DESC(zfs_scan_idle, "Idle window in clock ticks");
1799
1800module_param(zfs_scan_min_time_ms, int, 0644);
1801MODULE_PARM_DESC(zfs_scan_min_time_ms, "Min millisecs to scrub per txg");
1802
1803module_param(zfs_free_min_time_ms, int, 0644);
1804MODULE_PARM_DESC(zfs_free_min_time_ms, "Min millisecs to free per txg");
1805
1806module_param(zfs_resilver_min_time_ms, int, 0644);
1807MODULE_PARM_DESC(zfs_resilver_min_time_ms, "Min millisecs to resilver per txg");
1808
1809module_param(zfs_no_scrub_io, int, 0644);
1810MODULE_PARM_DESC(zfs_no_scrub_io, "Set to disable scrub I/O");
1811
1812module_param(zfs_no_scrub_prefetch, int, 0644);
1813MODULE_PARM_DESC(zfs_no_scrub_prefetch, "Set to disable scrub prefetching");
1814#endif