module/zfs/dsl_scan.c
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2013 by Delphix. All rights reserved.
24 */
25
26#include <sys/dsl_scan.h>
27#include <sys/dsl_pool.h>
28#include <sys/dsl_dataset.h>
29#include <sys/dsl_prop.h>
30#include <sys/dsl_dir.h>
31#include <sys/dsl_synctask.h>
32#include <sys/dnode.h>
33#include <sys/dmu_tx.h>
34#include <sys/dmu_objset.h>
35#include <sys/arc.h>
36#include <sys/zap.h>
37#include <sys/zio.h>
38#include <sys/zfs_context.h>
39#include <sys/fs/zfs.h>
40#include <sys/zfs_znode.h>
41#include <sys/spa_impl.h>
42#include <sys/vdev_impl.h>
43#include <sys/zil_impl.h>
44#include <sys/zio_checksum.h>
45#include <sys/ddt.h>
46#include <sys/sa.h>
47#include <sys/sa_impl.h>
48#include <sys/zfeature.h>
49#ifdef _KERNEL
50#include <sys/zfs_vfsops.h>
51#endif
52
53typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
54
55static scan_cb_t dsl_scan_scrub_cb;
56static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
57static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
58
59int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */
60int zfs_resilver_delay = 2; /* number of ticks to delay resilver */
61int zfs_scrub_delay = 4; /* number of ticks to delay scrub */
62int zfs_scan_idle = 50; /* idle window in clock ticks */
63
64int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
65int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
66int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
67int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
68int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */
69enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
70int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
71
72#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
73 ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
74 (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
75
76/* the order has to match pool_scan_type */
77static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
78 NULL,
79 dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */
80 dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */
81};
82
83int
84dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
85{
86 int err;
87 dsl_scan_t *scn;
88 spa_t *spa = dp->dp_spa;
89 uint64_t f;
90
91 scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
92 scn->scn_dp = dp;
93
94 /*
95 * It's possible that we're resuming a scan after a reboot so
96 * make sure that the scan_async_destroying flag is initialized
97 * appropriately.
98 */
99 ASSERT(!scn->scn_async_destroying);
100 scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
101 &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]);
102
103 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
104 "scrub_func", sizeof (uint64_t), 1, &f);
105 if (err == 0) {
106 /*
107 * There was an old-style scrub in progress. Restart a
108 * new-style scrub from the beginning.
109 */
110 scn->scn_restart_txg = txg;
111 zfs_dbgmsg("old-style scrub was in progress; "
112 "restarting new-style scrub in txg %llu",
113 scn->scn_restart_txg);
114
115 /*
116 * Load the queue obj from the old location so that it
117 * can be freed by dsl_scan_done().
118 */
119 (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
120 "scrub_queue", sizeof (uint64_t), 1,
121 &scn->scn_phys.scn_queue_obj);
122 } else {
123 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
124 DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
125 &scn->scn_phys);
126 if (err == ENOENT)
127 return (0);
128 else if (err)
129 return (err);
130
131 if (scn->scn_phys.scn_state == DSS_SCANNING &&
132 spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
133 /*
134 * A new-type scrub was in progress on an old
135 * pool, and the pool was accessed by old
136 * software. Restart from the beginning, since
137 * the old software may have changed the pool in
138 * the meantime.
139 */
140 scn->scn_restart_txg = txg;
141 zfs_dbgmsg("new-style scrub was modified "
142 "by old software; restarting in txg %llu",
143 scn->scn_restart_txg);
144 }
145 }
146
147 spa_scan_stat_init(spa);
148 return (0);
149}
150
151void
152dsl_scan_fini(dsl_pool_t *dp)
153{
154 if (dp->dp_scan) {
155 kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
156 dp->dp_scan = NULL;
157 }
158}
159
160/* ARGSUSED */
161static int
162dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
163{
164 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
165
166 if (scn->scn_phys.scn_state == DSS_SCANNING)
167 return (SET_ERROR(EBUSY));
168
169 return (0);
170}
171
172static void
173dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
174{
175 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
176 pool_scan_func_t *funcp = arg;
177 dmu_object_type_t ot = 0;
178 dsl_pool_t *dp = scn->scn_dp;
179 spa_t *spa = dp->dp_spa;
180
181 ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
182 ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
183 bzero(&scn->scn_phys, sizeof (scn->scn_phys));
184 scn->scn_phys.scn_func = *funcp;
185 scn->scn_phys.scn_state = DSS_SCANNING;
186 scn->scn_phys.scn_min_txg = 0;
187 scn->scn_phys.scn_max_txg = tx->tx_txg;
188 scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
189 scn->scn_phys.scn_start_time = gethrestime_sec();
190 scn->scn_phys.scn_errors = 0;
191 scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
192 scn->scn_restart_txg = 0;
193 spa_scan_stat_init(spa);
194
195 if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
196 scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
197
198 /* rewrite all disk labels */
199 vdev_config_dirty(spa->spa_root_vdev);
200
201 if (vdev_resilver_needed(spa->spa_root_vdev,
202 &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
203 spa_event_notify(spa, NULL, FM_EREPORT_ZFS_RESILVER_START);
204 } else {
205 spa_event_notify(spa, NULL, FM_EREPORT_ZFS_SCRUB_START);
206 }
207
208 spa->spa_scrub_started = B_TRUE;
209 /*
210 * If this is an incremental scrub, limit the DDT scrub phase
211 * to just the auto-ditto class (for correctness); the rest
212 * of the scrub should go faster using top-down pruning.
213 */
214 if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
215 scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
216
217 }
218
219 /* back to the generic stuff */
220
221 if (dp->dp_blkstats == NULL) {
222 dp->dp_blkstats = kmem_alloc(sizeof (zfs_all_blkstats_t),
223 KM_PUSHPAGE | KM_NODEBUG);
224 }
225 bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
226
227 if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
228 ot = DMU_OT_ZAP_OTHER;
229
230 scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
231 ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
232
233 dsl_scan_sync_state(scn, tx);
234
235 spa_history_log_internal(spa, "scan setup", tx,
236 "func=%u mintxg=%llu maxtxg=%llu",
237 *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
238}
239
240/* ARGSUSED */
241static void
242dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
243{
244 static const char *old_names[] = {
245 "scrub_bookmark",
246 "scrub_ddt_bookmark",
247 "scrub_ddt_class_max",
248 "scrub_queue",
249 "scrub_min_txg",
250 "scrub_max_txg",
251 "scrub_func",
252 "scrub_errors",
253 NULL
254 };
255
256 dsl_pool_t *dp = scn->scn_dp;
257 spa_t *spa = dp->dp_spa;
258 int i;
259
260 /* Remove any remnants of an old-style scrub. */
261 for (i = 0; old_names[i]; i++) {
262 (void) zap_remove(dp->dp_meta_objset,
263 DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
264 }
265
266 if (scn->scn_phys.scn_queue_obj != 0) {
267 VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
268 scn->scn_phys.scn_queue_obj, tx));
269 scn->scn_phys.scn_queue_obj = 0;
270 }
271
272 /*
273 * If we were "restarted" from a stopped state, don't bother
274 * with anything else.
275 */
276 if (scn->scn_phys.scn_state != DSS_SCANNING)
277 return;
278
279 if (complete)
280 scn->scn_phys.scn_state = DSS_FINISHED;
281 else
282 scn->scn_phys.scn_state = DSS_CANCELED;
283
284 spa_history_log_internal(spa, "scan done", tx,
285 "complete=%u", complete);
286
287 if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
288 mutex_enter(&spa->spa_scrub_lock);
289 while (spa->spa_scrub_inflight > 0) {
290 cv_wait(&spa->spa_scrub_io_cv,
291 &spa->spa_scrub_lock);
292 }
293 mutex_exit(&spa->spa_scrub_lock);
294 spa->spa_scrub_started = B_FALSE;
295 spa->spa_scrub_active = B_FALSE;
296
297 /*
298 * If the scrub/resilver completed, update all DTLs to
299 * reflect this. Whether it succeeded or not, vacate
300 * all temporary scrub DTLs.
301 */
302 vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
303 complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
304 if (complete) {
305 spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
306 FM_EREPORT_ZFS_RESILVER_FINISH :
307 FM_EREPORT_ZFS_SCRUB_FINISH);
308 }
309 spa_errlog_rotate(spa);
310
311 /*
312 * We may have finished replacing a device.
313 * Let the async thread assess this and handle the detach.
314 */
315 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
316 }
317
318 scn->scn_phys.scn_end_time = gethrestime_sec();
319}
320
321/* ARGSUSED */
322static int
323dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
324{
325 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
326
327 if (scn->scn_phys.scn_state != DSS_SCANNING)
328 return (SET_ERROR(ENOENT));
329 return (0);
330}
331
332/* ARGSUSED */
333static void
334dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
335{
336 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
337
338 dsl_scan_done(scn, B_FALSE, tx);
339 dsl_scan_sync_state(scn, tx);
340}
341
342int
343dsl_scan_cancel(dsl_pool_t *dp)
344{
345 return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
346 dsl_scan_cancel_sync, NULL, 3));
347}
348
349static void dsl_scan_visitbp(blkptr_t *bp,
350 const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf,
351 dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
352 dmu_tx_t *tx);
353inline __attribute__((always_inline)) static void dsl_scan_visitdnode(
354 dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
355 dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx);
356
357void
358dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
359{
360 zio_free(dp->dp_spa, txg, bp);
361}
362
363void
364dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
365{
366 ASSERT(dsl_pool_sync_context(dp));
367 zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
368}
369
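/*
 * A snapshot cannot reference blocks born after its creation txg, so for
 * snapshots the per-dataset scan ceiling is clamped to ds_creation_txg;
 * a head (non-snapshot) dataset is scanned up to the pool-wide scn_max_txg.
 */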
370static uint64_t
371dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
372{
373 uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
374 if (dsl_dataset_is_snapshot(ds))
375 return (MIN(smt, ds->ds_phys->ds_creation_txg));
376 return (smt);
377}
378
379static void
380dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
381{
382 VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
383 DMU_POOL_DIRECTORY_OBJECT,
384 DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
385 &scn->scn_phys, tx));
386}
387
388static boolean_t
389dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
390{
391 uint64_t elapsed_nanosecs;
392 int mintime;
393
394 /* we never skip user/group accounting objects */
395 if (zb && (int64_t)zb->zb_object < 0)
396 return (B_FALSE);
397
398 if (scn->scn_pausing)
399 return (B_TRUE); /* we're already pausing */
400
401 if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
402 return (B_FALSE); /* we're resuming */
403
404 /* We only know how to resume from level-0 blocks. */
405 if (zb && zb->zb_level != 0)
406 return (B_FALSE);
407
408 mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
409 zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
410 elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
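/*
 * Pause when the overall txg deadline (zfs_txg_timeout) has been
 * exceeded, when we have run for at least the minimum scan time and the
 * txg sync is waiting on us, or when the pool is shutting down.
 */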
411 if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
412 (elapsed_nanosecs / MICROSEC > mintime &&
413 txg_sync_waiting(scn->scn_dp)) ||
414 spa_shutting_down(scn->scn_dp->dp_spa)) {
415 if (zb) {
416 dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
417 (longlong_t)zb->zb_objset,
418 (longlong_t)zb->zb_object,
419 (longlong_t)zb->zb_level,
420 (longlong_t)zb->zb_blkid);
421 scn->scn_phys.scn_bookmark = *zb;
422 }
423 dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
424 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
425 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
426 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
427 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
428 scn->scn_pausing = B_TRUE;
429 return (B_TRUE);
430 }
431 return (B_FALSE);
432}
433
434typedef struct zil_scan_arg {
435 dsl_pool_t *zsa_dp;
436 zil_header_t *zsa_zh;
437} zil_scan_arg_t;
438
439/* ARGSUSED */
440static int
441dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
442{
443 zil_scan_arg_t *zsa = arg;
444 dsl_pool_t *dp = zsa->zsa_dp;
445 dsl_scan_t *scn = dp->dp_scan;
446 zil_header_t *zh = zsa->zsa_zh;
447 zbookmark_t zb;
448
449 if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
450 return (0);
451
452 /*
453 * One block (the "stubby") may have been allocated long ago; we
454 * want to visit that one because it has been allocated
455 * (on-disk) even if it hasn't been claimed (even though for
456 * scrub there's nothing to do to it).
457 */
458 if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
459 return (0);
460
461 SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
462 ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
463
464 VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
465 return (0);
466}
467
468/* ARGSUSED */
469static int
470dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
471{
472 if (lrc->lrc_txtype == TX_WRITE) {
473 zil_scan_arg_t *zsa = arg;
474 dsl_pool_t *dp = zsa->zsa_dp;
475 dsl_scan_t *scn = dp->dp_scan;
476 zil_header_t *zh = zsa->zsa_zh;
477 lr_write_t *lr = (lr_write_t *)lrc;
478 blkptr_t *bp = &lr->lr_blkptr;
479 zbookmark_t zb;
480
481 if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
482 return (0);
483
484 /*
485 * birth can be < claim_txg if this record's txg is
486 * already txg sync'ed (but this log block contains
487 * other records that are not synced)
488 */
489 if (claim_txg == 0 || bp->blk_birth < claim_txg)
490 return (0);
491
492 SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
493 lr->lr_foid, ZB_ZIL_LEVEL,
494 lr->lr_offset / BP_GET_LSIZE(bp));
495
496 VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
497 }
498 return (0);
499}
500
501static void
502dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
503{
504 uint64_t claim_txg = zh->zh_claim_txg;
505 zil_scan_arg_t zsa = { dp, zh };
506 zilog_t *zilog;
507
508 /*
509 * We only want to visit blocks that have been claimed but not yet
510 * replayed (or, in read-only mode, blocks that *would* be claimed).
511 */
512 if (claim_txg == 0 && spa_writeable(dp->dp_spa))
513 return;
514
515 zilog = zil_alloc(dp->dp_meta_objset, zh);
516
517 (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
518 claim_txg);
519
520 zil_free(zilog);
521}
522
523/* ARGSUSED */
524static void
525dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
526 uint64_t objset, uint64_t object, uint64_t blkid)
527{
528 zbookmark_t czb;
529 uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
530
531 if (zfs_no_scrub_prefetch)
532 return;
533
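/*
 * Only metadata is prefetched: indirect blocks at any level, and level-0
 * blocks that hold dnodes. Level-0 data blocks are read directly by the
 * scrub callback when they need to be examined.
 */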
534 if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
535 (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
536 return;
537
538 SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
539
540 (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
541 NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
542 ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
543}
544
545static boolean_t
546dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
547 const zbookmark_t *zb)
548{
549 /*
550 * We never skip over user/group accounting objects (obj<0)
551 */
552 if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
553 (int64_t)zb->zb_object >= 0) {
554 /*
555 * If we already visited this bp & everything below (in
556 * a prior txg sync), don't bother doing it again.
557 */
558 if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
559 return (B_TRUE);
560
561 /*
562 * If we found the block we're trying to resume from, or
563 * we went past it to a different object, zero it out to
564 * indicate that it's OK to start checking for pausing
565 * again.
566 */
567 if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
568 zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
569 dprintf("resuming at %llx/%llx/%llx/%llx\n",
570 (longlong_t)zb->zb_objset,
571 (longlong_t)zb->zb_object,
572 (longlong_t)zb->zb_level,
573 (longlong_t)zb->zb_blkid);
574 bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
575 }
576 }
577 return (B_FALSE);
578}
579
580/*
581 * Return nonzero on i/o error.
582 * Return new buf to write out in *bufp.
583 */
584inline __attribute__((always_inline)) static int
585dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
586 dnode_phys_t *dnp, const blkptr_t *bp,
587 const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
588{
589 dsl_pool_t *dp = scn->scn_dp;
590 int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
591 int err;
592
593 if (BP_GET_LEVEL(bp) > 0) {
594 uint32_t flags = ARC_WAIT;
595 int i;
596 blkptr_t *cbp;
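/* epb: number of block pointers held by this indirect block */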
597 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
598
599 err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
600 ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
601 if (err) {
602 scn->scn_phys.scn_errors++;
603 return (err);
604 }
605 for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
606 dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset,
607 zb->zb_object, zb->zb_blkid * epb + i);
608 }
609 for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
610 zbookmark_t czb;
611
612 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
613 zb->zb_level - 1,
614 zb->zb_blkid * epb + i);
615 dsl_scan_visitbp(cbp, &czb, dnp,
616 *bufp, ds, scn, ostype, tx);
617 }
618 } else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
619 uint32_t flags = ARC_WAIT;
620
621 err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
622 ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
623 if (err) {
624 scn->scn_phys.scn_errors++;
625 return (err);
626 }
627 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
628 uint32_t flags = ARC_WAIT;
629 dnode_phys_t *cdnp;
630 int i, j;
631 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
632
633 err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
634 ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
635 if (err) {
636 scn->scn_phys.scn_errors++;
637 return (err);
638 }
639 for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
640 for (j = 0; j < cdnp->dn_nblkptr; j++) {
641 blkptr_t *cbp = &cdnp->dn_blkptr[j];
642 dsl_scan_prefetch(scn, *bufp, cbp,
643 zb->zb_objset, zb->zb_blkid * epb + i, j);
644 }
645 }
646 for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
647 dsl_scan_visitdnode(scn, ds, ostype,
648 cdnp, *bufp, zb->zb_blkid * epb + i, tx);
649 }
650
651 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
652 uint32_t flags = ARC_WAIT;
653 objset_phys_t *osp;
654
655 err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
656 ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
657 if (err) {
658 scn->scn_phys.scn_errors++;
659 return (err);
660 }
661
662 osp = (*bufp)->b_data;
663
664 dsl_scan_visitdnode(scn, ds, osp->os_type,
665 &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx);
666
667 if (OBJSET_BUF_HAS_USERUSED(*bufp)) {
668 /*
669 * We also always visit user/group accounting
670 * objects, and never skip them, even if we are
671 * pausing. This is necessary so that the space
672 * deltas from this txg get integrated.
673 */
674 dsl_scan_visitdnode(scn, ds, osp->os_type,
675 &osp->os_groupused_dnode, *bufp,
676 DMU_GROUPUSED_OBJECT, tx);
677 dsl_scan_visitdnode(scn, ds, osp->os_type,
678 &osp->os_userused_dnode, *bufp,
679 DMU_USERUSED_OBJECT, tx);
680 }
681 }
682
683 return (0);
684}
685
686inline __attribute__((always_inline)) static void
687dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
688 dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf,
689 uint64_t object, dmu_tx_t *tx)
690{
691 int j;
692
693 for (j = 0; j < dnp->dn_nblkptr; j++) {
694 zbookmark_t czb;
695
696 SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
697 dnp->dn_nlevels - 1, j);
698 dsl_scan_visitbp(&dnp->dn_blkptr[j],
699 &czb, dnp, buf, ds, scn, ostype, tx);
700 }
701
702 if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
703 zbookmark_t czb;
704 SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
705 0, DMU_SPILL_BLKID);
706 dsl_scan_visitbp(&dnp->dn_spill,
707 &czb, dnp, buf, ds, scn, ostype, tx);
708 }
709}
710
711/*
712 * The arguments are in this order because mdb can only print the
713 * first 5; we want them to be useful.
714 */
715static void
716dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
717 dnode_phys_t *dnp, arc_buf_t *pbuf,
718 dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
719 dmu_tx_t *tx)
720{
721 dsl_pool_t *dp = scn->scn_dp;
722 arc_buf_t *buf = NULL;
723 blkptr_t *bp_toread;
724
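/*
 * The blkptr is copied into a heap allocation instead of a stack variable
 * because this function recurses deeply; see the stack-conservation note
 * further down.
 */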
725 bp_toread = kmem_alloc(sizeof (blkptr_t), KM_PUSHPAGE);
726 *bp_toread = *bp;
727
728 /* ASSERT(pbuf == NULL || arc_released(pbuf)); */
729
730 if (dsl_scan_check_pause(scn, zb))
731 goto out;
732
733 if (dsl_scan_check_resume(scn, dnp, zb))
734 goto out;
735
736 if (bp->blk_birth == 0)
737 goto out;
738
739 scn->scn_visited_this_txg++;
740
741 /*
742 * This debugging is commented out to conserve stack space. This
743 * function is called recursively and the debugging adds several
744 * bytes to the stack for each call. It can be commented back in
745 * if required to debug an issue in dsl_scan_visitbp().
746 *
747 * dprintf_bp(bp,
748 * "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
749 * ds, ds ? ds->ds_object : 0,
750 * zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
751 * pbuf, bp);
752 */
753
754 if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
755 goto out;
756
757 if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx,
758 &buf) != 0)
759 goto out;
760
761 /*
762 * If dsl_scan_ddt() has already visited this block, it will have
763 * already done any translations or scrubbing, so don't call the
764 * callback again.
765 */
766 if (ddt_class_contains(dp->dp_spa,
767 scn->scn_phys.scn_ddt_class_max, bp)) {
768 ASSERT(buf == NULL);
769 goto out;
770 }
771
772 /*
773 * If this block is from the future (after cur_max_txg), then we
774 * are doing this on behalf of a deleted snapshot, and we will
775 * revisit the future block on the next pass of this dataset.
776 * Don't scan it now unless we need to because something
777 * under it was modified.
778 */
779 if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) {
780 scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
781 }
782 if (buf)
783 (void) arc_buf_remove_ref(buf, &buf);
784out:
785 kmem_free(bp_toread, sizeof(blkptr_t));
786}
787
788static void
789dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
790 dmu_tx_t *tx)
791{
792 zbookmark_t zb;
793
794 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
795 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
796 dsl_scan_visitbp(bp, &zb, NULL, NULL,
797 ds, scn, DMU_OST_NONE, tx);
798
799 dprintf_ds(ds, "finished scan%s", "");
800}
801
802void
803dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
804{
805 dsl_pool_t *dp = ds->ds_dir->dd_pool;
806 dsl_scan_t *scn = dp->dp_scan;
807 uint64_t mintxg;
808
809 if (scn->scn_phys.scn_state != DSS_SCANNING)
810 return;
811
812 if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
813 if (dsl_dataset_is_snapshot(ds)) {
814 /* Note, scn_cur_{min,max}_txg stays the same. */
815 scn->scn_phys.scn_bookmark.zb_objset =
816 ds->ds_phys->ds_next_snap_obj;
817 zfs_dbgmsg("destroying ds %llu; currently traversing; "
818 "reset zb_objset to %llu",
819 (u_longlong_t)ds->ds_object,
820 (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
821 scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
822 } else {
823 SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
824 ZB_DESTROYED_OBJSET, 0, 0, 0);
825 zfs_dbgmsg("destroying ds %llu; currently traversing; "
826 "reset bookmark to -1,0,0,0",
827 (u_longlong_t)ds->ds_object);
828 }
829 } else if (zap_lookup_int_key(dp->dp_meta_objset,
830 scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
831 ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
832 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
833 scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
834 if (dsl_dataset_is_snapshot(ds)) {
835 /*
836 * We keep the same mintxg; it could be >
837 * ds_creation_txg if the previous snapshot was
838 * deleted too.
839 */
840 VERIFY(zap_add_int_key(dp->dp_meta_objset,
841 scn->scn_phys.scn_queue_obj,
842 ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
843 zfs_dbgmsg("destroying ds %llu; in queue; "
844 "replacing with %llu",
845 (u_longlong_t)ds->ds_object,
846 (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
847 } else {
848 zfs_dbgmsg("destroying ds %llu; in queue; removing",
849 (u_longlong_t)ds->ds_object);
850 }
851 } else {
852 zfs_dbgmsg("destroying ds %llu; ignoring",
853 (u_longlong_t)ds->ds_object);
854 }
855
856 /*
857 * dsl_scan_sync() should be called after this, and should sync
858 * out our changed state, but just to be safe, do it here.
859 */
860 dsl_scan_sync_state(scn, tx);
861}
862
863void
864dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
865{
866 dsl_pool_t *dp = ds->ds_dir->dd_pool;
867 dsl_scan_t *scn = dp->dp_scan;
868 uint64_t mintxg;
869
870 if (scn->scn_phys.scn_state != DSS_SCANNING)
871 return;
872
873 ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
874
875 if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
876 scn->scn_phys.scn_bookmark.zb_objset =
877 ds->ds_phys->ds_prev_snap_obj;
878 zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
879 "reset zb_objset to %llu",
880 (u_longlong_t)ds->ds_object,
881 (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
882 } else if (zap_lookup_int_key(dp->dp_meta_objset,
883 scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
884 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
885 scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
886 VERIFY(zap_add_int_key(dp->dp_meta_objset,
887 scn->scn_phys.scn_queue_obj,
888 ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
889 zfs_dbgmsg("snapshotting ds %llu; in queue; "
890 "replacing with %llu",
891 (u_longlong_t)ds->ds_object,
892 (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
893 }
894 dsl_scan_sync_state(scn, tx);
895}
896
897void
898dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
899{
900 dsl_pool_t *dp = ds1->ds_dir->dd_pool;
901 dsl_scan_t *scn = dp->dp_scan;
902 uint64_t mintxg;
903
904 if (scn->scn_phys.scn_state != DSS_SCANNING)
905 return;
906
907 if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
908 scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
909 zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
910 "reset zb_objset to %llu",
911 (u_longlong_t)ds1->ds_object,
912 (u_longlong_t)ds2->ds_object);
913 } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
914 scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
915 zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
916 "reset zb_objset to %llu",
917 (u_longlong_t)ds2->ds_object,
918 (u_longlong_t)ds1->ds_object);
919 }
920
921 if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
922 ds1->ds_object, &mintxg) == 0) {
923 int err;
924
925 ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
926 ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
927 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
928 scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
929 err = zap_add_int_key(dp->dp_meta_objset,
930 scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
931 VERIFY(err == 0 || err == EEXIST);
932 if (err == EEXIST) {
933 /* Both were there to begin with */
934 VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
935 scn->scn_phys.scn_queue_obj,
936 ds1->ds_object, mintxg, tx));
937 }
938 zfs_dbgmsg("clone_swap ds %llu; in queue; "
939 "replacing with %llu",
940 (u_longlong_t)ds1->ds_object,
941 (u_longlong_t)ds2->ds_object);
942 } else if (zap_lookup_int_key(dp->dp_meta_objset,
943 scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
944 ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
945 ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
946 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
947 scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
948 VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
949 scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
950 zfs_dbgmsg("clone_swap ds %llu; in queue; "
951 "replacing with %llu",
952 (u_longlong_t)ds2->ds_object,
953 (u_longlong_t)ds1->ds_object);
954 }
955
956 dsl_scan_sync_state(scn, tx);
957}
958
959struct enqueue_clones_arg {
960 dmu_tx_t *tx;
961 uint64_t originobj;
962};
963
964/* ARGSUSED */
965static int
966enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
967{
968 struct enqueue_clones_arg *eca = arg;
969 dsl_dataset_t *ds;
970 int err;
971 dsl_scan_t *scn = dp->dp_scan;
972
973 if (hds->ds_dir->dd_phys->dd_origin_obj != eca->originobj)
974 return (0);
975
976 err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
977 if (err)
978 return (err);
979
980 while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
981 dsl_dataset_t *prev;
982 err = dsl_dataset_hold_obj(dp,
983 ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
984
985 dsl_dataset_rele(ds, FTAG);
986 if (err)
987 return (err);
988 ds = prev;
989 }
990 VERIFY(zap_add_int_key(dp->dp_meta_objset,
991 scn->scn_phys.scn_queue_obj, ds->ds_object,
992 ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
993 dsl_dataset_rele(ds, FTAG);
994 return (0);
995}
996
997static void
998dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
999{
1000 dsl_pool_t *dp = scn->scn_dp;
1001 dsl_dataset_t *ds;
1002 objset_t *os;
1003 char *dsname;
1004
1005 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
1006
1007 if (dmu_objset_from_ds(ds, &os))
1008 goto out;
1009
1010 /*
1011 * Only the ZIL in the head (non-snapshot) is valid. Even though
1012 * snapshots can have ZIL block pointers (which may be the same
1013 * BP as in the head), they must be ignored. So we traverse the
1014 * ZIL here, rather than in scan_recurse(), because the regular
1015 * snapshot block-sharing rules don't apply to it.
1016 */
1017 if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds))
1018 dsl_scan_zil(dp, &os->os_zil_header);
1019
1020 /*
1021 * Iterate over the bps in this ds.
1022 */
1023 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1024 dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);
1025
1026 dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_PUSHPAGE);
1027 dsl_dataset_name(ds, dsname);
1028 zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
1029 "pausing=%u",
1030 (longlong_t)dsobj, dsname,
1031 (longlong_t)scn->scn_phys.scn_cur_min_txg,
1032 (longlong_t)scn->scn_phys.scn_cur_max_txg,
1033 (int)scn->scn_pausing);
1034 kmem_free(dsname, ZFS_MAXNAMELEN);
1035
1036 if (scn->scn_pausing)
1037 goto out;
1038
1039 /*
1040 * We've finished this pass over this dataset.
1041 */
1042
1043 /*
1044 * If we did not completely visit this dataset, do another pass.
1045 */
1046 if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
1047 zfs_dbgmsg("incomplete pass; visiting again");
1048 scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
1049 VERIFY(zap_add_int_key(dp->dp_meta_objset,
1050 scn->scn_phys.scn_queue_obj, ds->ds_object,
1051 scn->scn_phys.scn_cur_max_txg, tx) == 0);
1052 goto out;
1053 }
1054
1055 /*
1056 * Add descendent datasets to work queue.
1057 */
1058 if (ds->ds_phys->ds_next_snap_obj != 0) {
1059 VERIFY(zap_add_int_key(dp->dp_meta_objset,
1060 scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
1061 ds->ds_phys->ds_creation_txg, tx) == 0);
1062 }
1063 if (ds->ds_phys->ds_num_children > 1) {
1064 boolean_t usenext = B_FALSE;
1065 if (ds->ds_phys->ds_next_clones_obj != 0) {
1066 uint64_t count;
1067 /*
1068 * A bug in a previous version of the code could
1069 * cause upgrade_clones_cb() to not set
1070 * ds_next_snap_obj when it should, leading to a
1071 * missing entry. Therefore we can only use the
1072 * next_clones_obj when its count is correct.
1073 */
1074 int err = zap_count(dp->dp_meta_objset,
1075 ds->ds_phys->ds_next_clones_obj, &count);
1076 if (err == 0 &&
1077 count == ds->ds_phys->ds_num_children - 1)
1078 usenext = B_TRUE;
1079 }
1080
1081 if (usenext) {
1082 VERIFY0(zap_join_key(dp->dp_meta_objset,
1083 ds->ds_phys->ds_next_clones_obj,
1084 scn->scn_phys.scn_queue_obj,
1085 ds->ds_phys->ds_creation_txg, tx));
1086 } else {
1087 struct enqueue_clones_arg eca;
1088 eca.tx = tx;
1089 eca.originobj = ds->ds_object;
1090
1091 VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
1092 enqueue_clones_cb, &eca, DS_FIND_CHILDREN));
1093 }
1094 }
1095
1096out:
1097 dsl_dataset_rele(ds, FTAG);
1098}
1099
1100/* ARGSUSED */
1101static int
1102enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
1103{
1104 dmu_tx_t *tx = arg;
1105 dsl_dataset_t *ds;
1106 int err;
1107 dsl_scan_t *scn = dp->dp_scan;
1108
1109 err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
1110 if (err)
1111 return (err);
1112
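/*
 * Walk back to the oldest snapshot of this filesystem and enqueue that
 * one; clones are skipped here since they are enqueued when their origin
 * snapshot is visited.
 */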
1113 while (ds->ds_phys->ds_prev_snap_obj != 0) {
1114 dsl_dataset_t *prev;
1115 err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
1116 FTAG, &prev);
1117 if (err) {
1118 dsl_dataset_rele(ds, FTAG);
1119 return (err);
1120 }
1121
1122 /*
1123 * If this is a clone, we don't need to worry about it for now.
1124 */
1125 if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
1126 dsl_dataset_rele(ds, FTAG);
1127 dsl_dataset_rele(prev, FTAG);
1128 return (0);
1129 }
1130 dsl_dataset_rele(ds, FTAG);
1131 ds = prev;
1132 }
1133
1134 VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
1135 ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
1136 dsl_dataset_rele(ds, FTAG);
1137 return (0);
1138}
1139
1140/*
1141 * Scrub/dedup interaction.
1142 *
1143 * If there are N references to a deduped block, we don't want to scrub it
1144 * N times -- ideally, we should scrub it exactly once.
1145 *
1146 * We leverage the fact that the dde's replication class (enum ddt_class)
1147 * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
1148 * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
1149 *
1150 * To prevent excess scrubbing, the scrub begins by walking the DDT
1151 * to find all blocks with refcnt > 1, and scrubs each of these once.
1152 * Since there are two replication classes which contain blocks with
1153 * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
1154 * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
1155 *
1156 * There would be nothing more to say if a block's refcnt couldn't change
1157 * during a scrub, but of course it can so we must account for changes
1158 * in a block's replication class.
1159 *
1160 * Here's an example of what can occur:
1161 *
1162 * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
1163 * when visited during the top-down scrub phase, it will be scrubbed twice.
1164 * This negates our scrub optimization, but is otherwise harmless.
1165 *
1166 * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
1167 * on each visit during the top-down scrub phase, it will never be scrubbed.
1168 * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
1169 * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to
1170 * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
1171 * while a scrub is in progress, it scrubs the block right then.
1172 */
1173static void
1174dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
1175{
1176 ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
1177 ddt_entry_t dde;
1178 int error;
1179 uint64_t n = 0;
1180
1181 bzero(&dde, sizeof (ddt_entry_t));
1182
1183 while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
1184 ddt_t *ddt;
1185
1186 if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
1187 break;
1188 dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
1189 (longlong_t)ddb->ddb_class,
1190 (longlong_t)ddb->ddb_type,
1191 (longlong_t)ddb->ddb_checksum,
1192 (longlong_t)ddb->ddb_cursor);
1193
1194 /* There should be no pending changes to the dedup table */
1195 ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
1196 ASSERT(avl_first(&ddt->ddt_tree) == NULL);
1197
1198 dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
1199 n++;
1200
1201 if (dsl_scan_check_pause(scn, NULL))
1202 break;
1203 }
1204
1205 zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
1206 (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
1207 (int)scn->scn_pausing);
1208
1209 ASSERT(error == 0 || error == ENOENT);
1210 ASSERT(error != ENOENT ||
1211 ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
1212}
1213
1214/* ARGSUSED */
1215void
1216dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
1217 ddt_entry_t *dde, dmu_tx_t *tx)
1218{
1219 const ddt_key_t *ddk = &dde->dde_key;
1220 ddt_phys_t *ddp = dde->dde_phys;
1221 blkptr_t bp;
1222 zbookmark_t zb = { 0 };
1223 int p;
1224
1225 if (scn->scn_phys.scn_state != DSS_SCANNING)
1226 return;
1227
1228 for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1229 if (ddp->ddp_phys_birth == 0 ||
1230 ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg)
1231 continue;
1232 ddt_bp_create(checksum, ddk, ddp, &bp);
1233
1234 scn->scn_visited_this_txg++;
1235 scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
1236 }
1237}
1238
1239static void
1240dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
1241{
1242 dsl_pool_t *dp = scn->scn_dp;
1243 zap_cursor_t *zc;
1244 zap_attribute_t *za;
1245
1246 if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
1247 scn->scn_phys.scn_ddt_class_max) {
1248 scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
1249 scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
1250 dsl_scan_ddt(scn, tx);
1251 if (scn->scn_pausing)
1252 return;
1253 }
1254
1255 if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
1256 /* First do the MOS & ORIGIN */
1257
1258 scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
1259 scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
1260 dsl_scan_visit_rootbp(scn, NULL,
1261 &dp->dp_meta_rootbp, tx);
1262 spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
1263 if (scn->scn_pausing)
1264 return;
1265
1266 if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
1267 VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
1268 enqueue_cb, tx, DS_FIND_CHILDREN));
1269 } else {
1270 dsl_scan_visitds(scn,
1271 dp->dp_origin_snap->ds_object, tx);
1272 }
1273 ASSERT(!scn->scn_pausing);
1274 } else if (scn->scn_phys.scn_bookmark.zb_objset !=
1275 ZB_DESTROYED_OBJSET) {
1276 /*
1277 * If we were paused, continue from here. Note if the
1278 * ds we were paused on was deleted, the zb_objset may
1279 * be -1, so we will skip this and find a new objset
1280 * below.
1281 */
1282 dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
1283 if (scn->scn_pausing)
1284 return;
1285 }
1286
1287 /*
1288 * In case we were paused right at the end of the ds, zero the
1289 * bookmark so we don't think that we're still trying to resume.
1290 */
1291 bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));
1292 zc = kmem_alloc(sizeof(zap_cursor_t), KM_PUSHPAGE);
1293 za = kmem_alloc(sizeof(zap_attribute_t), KM_PUSHPAGE);
1294
1295 /* keep pulling things out of the zap-object-as-queue */
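/*
 * The cursor is re-initialized on every iteration because visiting a
 * dataset adds and removes queue entries, which would invalidate an open
 * cursor.
 */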
1296 while (zap_cursor_init(zc, dp->dp_meta_objset,
1297 scn->scn_phys.scn_queue_obj),
1298 zap_cursor_retrieve(zc, za) == 0) {
1299 dsl_dataset_t *ds;
1300 uint64_t dsobj;
1301
1302 dsobj = strtonum(za->za_name, NULL);
1303 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
1304 scn->scn_phys.scn_queue_obj, dsobj, tx));
1305
1306 /* Set up min/max txg */
1307 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
1308 if (za->za_first_integer != 0) {
1309 scn->scn_phys.scn_cur_min_txg =
1310 MAX(scn->scn_phys.scn_min_txg,
1311 za->za_first_integer);
1312 } else {
1313 scn->scn_phys.scn_cur_min_txg =
1314 MAX(scn->scn_phys.scn_min_txg,
1315 ds->ds_phys->ds_prev_snap_txg);
1316 }
1317 scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
1318 dsl_dataset_rele(ds, FTAG);
1319
1320 dsl_scan_visitds(scn, dsobj, tx);
1321 zap_cursor_fini(zc);
1322 if (scn->scn_pausing)
1323 goto out;
1324 }
1325 zap_cursor_fini(zc);
1326out:
1327 kmem_free(za, sizeof(zap_attribute_t));
1328 kmem_free(zc, sizeof(zap_cursor_t));
1329}
1330
1331static boolean_t
1332dsl_scan_free_should_pause(dsl_scan_t *scn)
1333{
1334 uint64_t elapsed_nanosecs;
1335
1336 elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
1337 return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
1338 (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
1339 txg_sync_waiting(scn->scn_dp)) ||
1340 spa_shutting_down(scn->scn_dp->dp_spa));
1341}
1342
1343static int
1344dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1345{
1346 dsl_scan_t *scn = arg;
1347
1348 if (!scn->scn_is_bptree ||
1349 (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
1350 if (dsl_scan_free_should_pause(scn))
1351 return (SET_ERROR(ERESTART));
1352 }
1353
1354 zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
1355 dmu_tx_get_txg(tx), bp, 0));
1356 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
1357 -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
1358 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
1359 scn->scn_visited_this_txg++;
1360 return (0);
1361}
1362
1363boolean_t
1364dsl_scan_active(dsl_scan_t *scn)
1365{
1366 spa_t *spa = scn->scn_dp->dp_spa;
1367 uint64_t used = 0, comp, uncomp;
1368
1369 if (spa->spa_load_state != SPA_LOAD_NONE)
1370 return (B_FALSE);
1371 if (spa_shutting_down(spa))
1372 return (B_FALSE);
1373
1374 if (scn->scn_phys.scn_state == DSS_SCANNING ||
1375 scn->scn_async_destroying)
1376 return (B_TRUE);
1377
1378 if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1379 (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
1380 &used, &comp, &uncomp);
1381 }
1382 return (used != 0);
1383}
1384
1385void
1386dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
1387{
1388 dsl_scan_t *scn = dp->dp_scan;
1389 spa_t *spa = dp->dp_spa;
1390 int err;
1391
1392 /*
1393 * Check for scn_restart_txg before checking spa_load_state, so
1394 * that we can restart an old-style scan while the pool is being
1395 * imported (see dsl_scan_init).
1396 */
1397 if (scn->scn_restart_txg != 0 &&
1398 scn->scn_restart_txg <= tx->tx_txg) {
1399 pool_scan_func_t func = POOL_SCAN_SCRUB;
1400 dsl_scan_done(scn, B_FALSE, tx);
1401 if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
1402 func = POOL_SCAN_RESILVER;
1403 zfs_dbgmsg("restarting scan func=%u txg=%llu",
1404 func, tx->tx_txg);
1405 dsl_scan_setup_sync(&func, tx);
1406 }
1407
1408 if (!dsl_scan_active(scn) ||
1409 spa_sync_pass(dp->dp_spa) > 1)
1410 return;
1411
1412 scn->scn_visited_this_txg = 0;
1413 scn->scn_pausing = B_FALSE;
1414 scn->scn_sync_start_time = gethrtime();
1415 spa->spa_scrub_active = B_TRUE;
1416
1417 /*
1418 * First process the free list. If we pause the free, don't do
1419 * any scanning. This ensures that there is no free list when
1420 * we are scanning, so the scan code doesn't have to worry about
1421 * traversing it.
1422 */
1423 if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1424 scn->scn_is_bptree = B_FALSE;
1425 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1426 NULL, ZIO_FLAG_MUSTSUCCEED);
1427 err = bpobj_iterate(&dp->dp_free_bpobj,
1428 dsl_scan_free_block_cb, scn, tx);
1429 VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
1430
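/*
 * When the async_destroy feature is active, blocks belonging to datasets
 * destroyed in the background are tracked in a bptree; iterate it as well
 * so those blocks are also freed.
 */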
1431 if (err == 0 && spa_feature_is_active(spa,
1432 &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
1433 ASSERT(scn->scn_async_destroying);
1434 scn->scn_is_bptree = B_TRUE;
1435 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1436 NULL, ZIO_FLAG_MUSTSUCCEED);
1437 err = bptree_iterate(dp->dp_meta_objset,
1438 dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
1439 scn, tx);
1440 VERIFY0(zio_wait(scn->scn_zio_root));
1441
1442 if (err == 0) {
1443 zfeature_info_t *feat = &spa_feature_table
1444 [SPA_FEATURE_ASYNC_DESTROY];
1445 /* finished; deactivate async destroy feature */
1446 spa_feature_decr(spa, feat, tx);
1447 ASSERT(!spa_feature_is_active(spa, feat));
1448 VERIFY0(zap_remove(dp->dp_meta_objset,
1449 DMU_POOL_DIRECTORY_OBJECT,
1450 DMU_POOL_BPTREE_OBJ, tx));
1451 VERIFY0(bptree_free(dp->dp_meta_objset,
1452 dp->dp_bptree_obj, tx));
1453 dp->dp_bptree_obj = 0;
1454 scn->scn_async_destroying = B_FALSE;
1455 }
1456 }
1457 if (scn->scn_visited_this_txg) {
1458 zfs_dbgmsg("freed %llu blocks in %llums from "
1459 "free_bpobj/bptree txg %llu",
1460 (longlong_t)scn->scn_visited_this_txg,
1461 (longlong_t)
1462 (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
1463 (longlong_t)tx->tx_txg);
1464 scn->scn_visited_this_txg = 0;
1465 /*
1466 * Re-sync the ddt so that we can further modify
1467 * it when doing bprewrite.
1468 */
1469 ddt_sync(spa, tx->tx_txg);
1470 }
1471 if (err == ERESTART)
1472 return;
1473 }
1474
1475 if (scn->scn_phys.scn_state != DSS_SCANNING)
1476 return;
1477
1478 if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
1479 scn->scn_phys.scn_ddt_class_max) {
1480 zfs_dbgmsg("doing scan sync txg %llu; "
1481 "ddt bm=%llu/%llu/%llu/%llx",
1482 (longlong_t)tx->tx_txg,
1483 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
1484 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
1485 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
1486 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
1487 ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
1488 ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
1489 ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
1490 ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
1491 } else {
1492 zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
1493 (longlong_t)tx->tx_txg,
1494 (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
1495 (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
1496 (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
1497 (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
1498 }
1499
1500 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1501 NULL, ZIO_FLAG_CANFAIL);
1502 dsl_pool_config_enter(dp, FTAG);
1503 dsl_scan_visit(scn, tx);
1504 dsl_pool_config_exit(dp, FTAG);
1505 (void) zio_wait(scn->scn_zio_root);
1506 scn->scn_zio_root = NULL;
1507
1508 zfs_dbgmsg("visited %llu blocks in %llums",
1509 (longlong_t)scn->scn_visited_this_txg,
1510 (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
1511
1512 if (!scn->scn_pausing) {
1513 /* finished with scan. */
1514 zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg);
1515 dsl_scan_done(scn, B_TRUE, tx);
1516 }
1517
1518 if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
1519 mutex_enter(&spa->spa_scrub_lock);
1520 while (spa->spa_scrub_inflight > 0) {
1521 cv_wait(&spa->spa_scrub_io_cv,
1522 &spa->spa_scrub_lock);
1523 }
1524 mutex_exit(&spa->spa_scrub_lock);
1525 }
1526
1527 dsl_scan_sync_state(scn, tx);
1528}
1529
1530/*
1531 * This will start a new scan, or restart an existing one.
1532 */
1533void
1534dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
1535{
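/*
 * When no txg is given, assign a throwaway transaction just to learn the
 * currently open txg and schedule the restart there.
 */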
1536 if (txg == 0) {
1537 dmu_tx_t *tx;
1538 tx = dmu_tx_create_dd(dp->dp_mos_dir);
1539 VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
1540
1541 txg = dmu_tx_get_txg(tx);
1542 dp->dp_scan->scn_restart_txg = txg;
1543 dmu_tx_commit(tx);
1544 } else {
1545 dp->dp_scan->scn_restart_txg = txg;
1546 }
1547 zfs_dbgmsg("restarting resilver txg=%llu", txg);
1548}
1549
1550boolean_t
1551dsl_scan_resilvering(dsl_pool_t *dp)
1552{
1553 return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
1554 dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
1555}
1556
1557/*
1558 * scrub consumers
1559 */
1560
1561static void
1562count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
1563{
1564 int i;
1565
1566 /*
1567 * If we resume after a reboot, zab will be NULL; don't record
1568 * incomplete stats in that case.
1569 */
1570 if (zab == NULL)
1571 return;
1572
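/*
 * Each block is tallied in four buckets: (its level, its type),
 * (its level, DMU_OT_TOTAL), (DN_MAX_LEVELS, its type), and the grand
 * total (DN_MAX_LEVELS, DMU_OT_TOTAL).
 */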
1573 for (i = 0; i < 4; i++) {
1574 int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
1575 int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
1576 int equal;
1577 zfs_blkstat_t *zb;
1578
1579 if (t & DMU_OT_NEWTYPE)
1580 t = DMU_OT_OTHER;
1581
1582 zb = &zab->zab_type[l][t];
1583 zb->zb_count++;
1584 zb->zb_asize += BP_GET_ASIZE(bp);
1585 zb->zb_lsize += BP_GET_LSIZE(bp);
1586 zb->zb_psize += BP_GET_PSIZE(bp);
1587 zb->zb_gangs += BP_COUNT_GANG(bp);
1588
1589 switch (BP_GET_NDVAS(bp)) {
1590 case 2:
1591 if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1592 DVA_GET_VDEV(&bp->blk_dva[1]))
1593 zb->zb_ditto_2_of_2_samevdev++;
1594 break;
1595 case 3:
1596 equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1597 DVA_GET_VDEV(&bp->blk_dva[1])) +
1598 (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1599 DVA_GET_VDEV(&bp->blk_dva[2])) +
1600 (DVA_GET_VDEV(&bp->blk_dva[1]) ==
1601 DVA_GET_VDEV(&bp->blk_dva[2]));
1602 if (equal == 1)
1603 zb->zb_ditto_2_of_3_samevdev++;
1604 else if (equal == 3)
1605 zb->zb_ditto_3_of_3_samevdev++;
1606 break;
1607 }
1608 }
1609}
1610
1611static void
1612dsl_scan_scrub_done(zio_t *zio)
1613{
1614 spa_t *spa = zio->io_spa;
1615
1616 zio_data_buf_free(zio->io_data, zio->io_size);
1617
1618 mutex_enter(&spa->spa_scrub_lock);
1619 spa->spa_scrub_inflight--;
1620 cv_broadcast(&spa->spa_scrub_io_cv);
1621
1622 if (zio->io_error && (zio->io_error != ECKSUM ||
1623 !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
1624 spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
1625 }
1626 mutex_exit(&spa->spa_scrub_lock);
1627}
1628
1629static int
1630dsl_scan_scrub_cb(dsl_pool_t *dp,
1631 const blkptr_t *bp, const zbookmark_t *zb)
1632{
1633 dsl_scan_t *scn = dp->dp_scan;
1634 size_t size = BP_GET_PSIZE(bp);
1635 spa_t *spa = dp->dp_spa;
1636 uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
1637 boolean_t needs_io = B_FALSE;
1638 int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
1639 int zio_priority = 0;
1640 int scan_delay = 0;
1641 int d;
1642
1643 if (phys_birth <= scn->scn_phys.scn_min_txg ||
1644 phys_birth >= scn->scn_phys.scn_max_txg)
1645 return (0);
1646
1647 count_block(dp->dp_blkstats, bp);
1648
1649 ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
1650 if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
1651 zio_flags |= ZIO_FLAG_SCRUB;
1652 zio_priority = ZIO_PRIORITY_SCRUB;
1653 needs_io = B_TRUE;
1654 scan_delay = zfs_scrub_delay;
1655 } else {
1656 ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
1657 zio_flags |= ZIO_FLAG_RESILVER;
1658 zio_priority = ZIO_PRIORITY_RESILVER;
1659 needs_io = B_FALSE;
1660 scan_delay = zfs_resilver_delay;
1661 }
1662
1663 /* If it's an intent log block, failure is expected. */
1664 if (zb->zb_level == ZB_ZIL_LEVEL)
1665 zio_flags |= ZIO_FLAG_SPECULATIVE;
1666
1667 for (d = 0; d < BP_GET_NDVAS(bp); d++) {
1668 vdev_t *vd = vdev_lookup_top(spa,
1669 DVA_GET_VDEV(&bp->blk_dva[d]));
1670
1671 /*
1672 * Keep track of how much data we've examined so that
1673 * zpool(1M) status can make useful progress reports.
1674 */
1675 scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
1676 spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
1677
1678 /* if it's a resilver, this may not be in the target range */
1679 if (!needs_io) {
1680 if (DVA_GET_GANG(&bp->blk_dva[d])) {
1681 /*
1682 * Gang members may be spread across multiple
1683 * vdevs, so the best estimate we have is the
1684 * scrub range, which has already been checked.
1685 * XXX -- it would be better to change our
1686 * allocation policy to ensure that all
1687 * gang members reside on the same vdev.
1688 */
1689 needs_io = B_TRUE;
1690 } else {
1691 needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
1692 phys_birth, 1);
1693 }
1694 }
1695 }
1696
1697 if (needs_io && !zfs_no_scrub_io) {
1698 vdev_t *rvd = spa->spa_root_vdev;
1699 uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight;
1700 void *data = zio_data_buf_alloc(size);
1701
1702 mutex_enter(&spa->spa_scrub_lock);
1703 while (spa->spa_scrub_inflight >= maxinflight)
1704 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
1705 spa->spa_scrub_inflight++;
1706 mutex_exit(&spa->spa_scrub_lock);
1707
1708 /*
1709 * If we're seeing recent (zfs_scan_idle) "important" I/Os
1710 * then throttle our workload to limit the impact of a scan.
1711 */
1712 if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
1713 delay(scan_delay);
1714
1715 zio_nowait(zio_read(NULL, spa, bp, data, size,
1716 dsl_scan_scrub_done, NULL, zio_priority,
1717 zio_flags, zb));
1718 }
1719
1720 /* do not relocate this block */
1721 return (0);
1722}
1723
1724int
1725dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
1726{
1727 spa_t *spa = dp->dp_spa;
1728
1729 /*
1730 * Purge all vdev caches and probe all devices. We do this here
1731 * rather than in sync context because this requires a writer lock
1732 * on the spa_config lock, which we can't do from sync context. The
1733 * spa_scrub_reopen flag indicates that vdev_open() should not
1734 * attempt to start another scrub.
1735 */
1736 spa_vdev_state_enter(spa, SCL_NONE);
1737 spa->spa_scrub_reopen = B_TRUE;
1738 vdev_reopen(spa->spa_root_vdev);
1739 spa->spa_scrub_reopen = B_FALSE;
1740 (void) spa_vdev_state_exit(spa, NULL, 0);
1741
1742 return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
1743 dsl_scan_setup_sync, &func, 0));
1744}
1745
1746#if defined(_KERNEL) && defined(HAVE_SPL)
1747module_param(zfs_top_maxinflight, int, 0644);
1748MODULE_PARM_DESC(zfs_top_maxinflight, "Max I/Os per top-level");
1749
1750module_param(zfs_resilver_delay, int, 0644);
1751MODULE_PARM_DESC(zfs_resilver_delay, "Number of ticks to delay resilver");
1752
1753module_param(zfs_scrub_delay, int, 0644);
1754MODULE_PARM_DESC(zfs_scrub_delay, "Number of ticks to delay scrub");
1755
1756module_param(zfs_scan_idle, int, 0644);
1757MODULE_PARM_DESC(zfs_scan_idle, "Idle window in clock ticks");
1758
1759module_param(zfs_scan_min_time_ms, int, 0644);
1760MODULE_PARM_DESC(zfs_scan_min_time_ms, "Min millisecs to scrub per txg");
1761
1762module_param(zfs_free_min_time_ms, int, 0644);
1763MODULE_PARM_DESC(zfs_free_min_time_ms, "Min millisecs to free per txg");
1764
1765module_param(zfs_resilver_min_time_ms, int, 0644);
1766MODULE_PARM_DESC(zfs_resilver_min_time_ms, "Min millisecs to resilver per txg");
1767
1768module_param(zfs_no_scrub_io, int, 0644);
1769MODULE_PARM_DESC(zfs_no_scrub_io, "Set to disable scrub I/O");
1770
1771module_param(zfs_no_scrub_prefetch, int, 0644);
1772MODULE_PARM_DESC(zfs_no_scrub_prefetch, "Set to disable scrub prefetching");
1773#endif