/* max number of blocks to free in a single TXG */
unsigned long zfs_async_block_max_blocks = 100000;
+int zfs_resilver_disable_defer = 0; /* nonzero: dsl_scan_sync() restarts the scan whenever a resilver is deferred, instead of leaving it deferred */
+
/*
* We wait a few txgs after importing a pool to begin scanning so that
* the import / mounting code isn't held up by scrub / resilver IO.
/* private data for dsl_scan_prefetch_cb() */
typedef struct scan_prefetch_ctx {
- refcount_t spc_refcnt; /* refcount for memory management */
+ zfs_refcount_t spc_refcnt; /* refcount for memory management */
dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */
boolean_t spc_root; /* is this prefetch for an objset? */
uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */
scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
dsl_scan_count_leaves(spa->spa_root_vdev), 1ULL << 20);
- bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
offsetof(scan_ds_t, sds_node));
avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
}
}
+ bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+
/* reload the queue into the in-core state */
if (scn->scn_phys.scn_queue_obj != 0) {
zap_cursor_t zc;
spa->spa_scrub_reopen = B_FALSE;
(void) spa_vdev_state_exit(spa, NULL, 0);
+ if (func == POOL_SCAN_RESILVER) {
+ dsl_resilver_restart(spa->spa_dsl_pool, 0);
+ return (0);
+ }
+
if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
/* got scrub start cmd, resume paused scrub */
int err = dsl_scrub_set_pause_resume(scn->scn_dp,
}
return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
- dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
+ dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+/*
+ * Sets the resilver defer flag to B_FALSE on all leaf devs under vd. Returns
+ * B_TRUE if we have devices that need to be resilvered and are available to
+ * accept resilver I/Os.
+ *
+ * The walk is recursive: every child of vd is visited first and the child
+ * results are OR-ed together.
+ *
+ * When vd is the pool's root vdev and SPA_FEATURE_RESILVER_DEFER is active,
+ * the feature refcount is decremented in tx, the root vdev's config is marked
+ * dirty, spa_resilver_deferred is cleared, and the accumulated child result
+ * is returned.
+ *
+ * A vdev that is not concrete, has vdev_aux set, or is not a leaf only
+ * passes its children's result upward. For a concrete, non-aux leaf the
+ * vdev_resilver_deferred flag is cleared and the return value is B_TRUE only
+ * if the leaf is not dead, not offline, and vdev_resilver_needed() reports
+ * that it needs resilvering.
+ */
+static boolean_t
+dsl_scan_clear_deferred(vdev_t *vd, dmu_tx_t *tx)
+{
+ boolean_t resilver_needed = B_FALSE;
+ spa_t *spa = vd->vdev_spa;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ resilver_needed |=
+ dsl_scan_clear_deferred(vd->vdev_child[c], tx);
+ }
+
+ if (vd == spa->spa_root_vdev &&
+ spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
+ spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
+ vdev_config_dirty(vd);
+ spa->spa_resilver_deferred = B_FALSE;
+ return (resilver_needed);
+ }
+
+ if (!vdev_is_concrete(vd) || vd->vdev_aux ||
+ !vd->vdev_ops->vdev_op_leaf)
+ return (resilver_needed);
+
+ if (vd->vdev_resilver_deferred)
+ vd->vdev_resilver_deferred = B_FALSE;
+
+ return (!vdev_is_dead(vd) && !vd->vdev_offline &&
+ vdev_resilver_needed(vd, NULL, NULL));
}
/* ARGSUSED */
* If the scrub/resilver completed, update all DTLs to
* reflect this. Whether it succeeded or not, vacate
* all temporary scrub DTLs.
+ *
+ * As the scrub does not currently support traversing
+ * data that have been freed but are part of a checkpoint,
+ * we don't mark the scrub as done in the DTLs as faults
+ * may still exist in those vdevs.
*/
- vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
- complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
- if (complete) {
+ if (complete &&
+ !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+ scn->scn_phys.scn_max_txg, B_TRUE);
+
spa_event_notify(spa, NULL, NULL,
scn->scn_phys.scn_min_txg ?
ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
+ } else {
+ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+ 0, B_TRUE);
}
spa_errlog_rotate(spa);
* Let the async thread assess this and handle the detach.
*/
spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+
+ /*
+ * Clear any deferred_resilver flags in the config.
+ * If there are drives that need resilvering, kick
+ * off an asynchronous request to start resilver.
+ * dsl_scan_clear_deferred() may update the config
+ * before the resilver can restart. In the event of
+ * a crash during this period, the spa loading code
+ * will find the drives that need to be resilvered
+ * when the machine reboots and start the resilver then.
+ */
+ boolean_t resilver_needed =
+ dsl_scan_clear_deferred(spa->spa_root_vdev, tx);
+ if (resilver_needed) {
+ spa_history_log_internal(spa,
+ "starting deferred resilver", tx,
+ "errors=%llu", spa_get_errlog_size(spa));
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+ }
}
scn->scn_phys.scn_end_time = gethrestime_sec();
/* can't pause a scrub when there is no in-progress scrub */
spa->spa_scan_pass_scrub_pause = gethrestime_sec();
scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
+ scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED;
dsl_scan_sync_state(scn, tx, SYNC_CACHED);
spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED);
} else {
gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
spa->spa_scan_pass_scrub_pause = 0;
scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
+ scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED;
dsl_scan_sync_state(scn, tx, SYNC_CACHED);
}
}
* (on-disk) even if it hasn't been claimed (even though for
* scrub there's nothing to do to it).
*/
- if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
+ if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa))
return (0);
SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
zil_scan_arg_t zsa = { dp, zh };
zilog_t *zilog;
+ ASSERT(spa_writeable(dp->dp_spa));
+
/*
* We only want to visit blocks that have been claimed but not yet
* replayed (or, in read-only mode, blocks that *would* be claimed).
*/
- if (claim_txg == 0 && spa_writeable(dp->dp_spa))
+ if (claim_txg == 0)
return;
zilog = zil_alloc(dp->dp_meta_objset, zh);
static void
scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag)
{
- if (refcount_remove(&spc->spc_refcnt, tag) == 0) {
- refcount_destroy(&spc->spc_refcnt);
+ if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) { /* drop tag's hold on spc */
+ zfs_refcount_destroy(&spc->spc_refcnt); /* remove returned 0: destroy the refcount, then free spc */
kmem_free(spc, sizeof (scan_prefetch_ctx_t));
}
}
scan_prefetch_ctx_t *spc;
spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP);
- refcount_create(&spc->spc_refcnt);
- refcount_add(&spc->spc_refcnt, tag);
+ zfs_refcount_create(&spc->spc_refcnt);
+ zfs_refcount_add(&spc->spc_refcnt, tag);
spc->spc_scn = scn;
if (dnp != NULL) {
spc->spc_datablkszsec = dnp->dn_datablkszsec;
static void
scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag)
{
- refcount_add(&spc->spc_refcnt, tag);
+ zfs_refcount_add(&spc->spc_refcnt, tag); /* record an additional hold on spc under tag */
}
static boolean_t
if (!dsl_scan_is_running(scn))
return;
+ /*
+ * This function is special because it is the only thing
+ * that can add scan_io_t's to the vdev scan queues from
+ * outside dsl_scan_sync(). For the most part this is ok
+ * as long as it is called from within syncing context.
+ * However, dsl_scan_sync() expects that no new sio's will
+ * be added between when all the work for a scan is done
+ * and the next txg when the scan is actually marked as
+ * completed. This check ensures we do not issue new sio's
+ * during this period.
+ */
+ if (scn->scn_done_txg != 0)
+ return;
+
for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0 ||
ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
return (used != 0);
}
+/*
+ * Walks vd and everything beneath it and reports whether any concrete,
+ * non-aux leaf vdev has vdev_resilver_deferred clear. Returns B_TRUE when
+ * at least one such leaf exists and B_FALSE otherwise.
+ */
+static boolean_t
+dsl_scan_check_deferred(vdev_t *vd)
+{
+	boolean_t need_resilver = B_FALSE;
+	int c;
+
+	/* Visit every child; a hit anywhere below makes the subtree a hit. */
+	for (c = 0; c < vd->vdev_children; c++) {
+		if (dsl_scan_check_deferred(vd->vdev_child[c]))
+			need_resilver = B_TRUE;
+	}
+
+	/* Only a concrete, non-aux leaf contributes its own deferred flag. */
+	if (vdev_is_concrete(vd) && !vd->vdev_aux &&
+	    vd->vdev_ops->vdev_op_leaf && !vd->vdev_resilver_deferred)
+		need_resilver = B_TRUE;
+
+	return (need_resilver);
+}
+
static boolean_t
dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
uint64_t phys_birth)
if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
return (B_FALSE);
+ /*
+ * Check that this top-level vdev has a device under it which
+ * is resilvering and is not deferred.
+ */
+ if (!dsl_scan_check_deferred(vd))
+ return (B_FALSE);
+
return (B_TRUE);
}
-/*
- * This is the primary entry point for scans that is called from syncing
- * context. Scans must happen entirely during syncing context so that we
- * cna guarantee that blocks we are currently scanning will not change out
- * from under us. While a scan is active, this function controls how quickly
- * transaction groups proceed, instead of the normal handling provided by
- * txg_sync_thread().
- */
-void
-dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+static int
+dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
{
- int err = 0;
dsl_scan_t *scn = dp->dp_scan;
spa_t *spa = dp->dp_spa;
- state_sync_type_t sync_type = SYNC_OPTIONAL;
-
- /*
- * Check for scn_restart_txg before checking spa_load_state, so
- * that we can restart an old-style scan while the pool is being
- * imported (see dsl_scan_init).
- */
- if (dsl_scan_restarting(scn, tx)) {
- pool_scan_func_t func = POOL_SCAN_SCRUB;
- dsl_scan_done(scn, B_FALSE, tx);
- if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
- func = POOL_SCAN_RESILVER;
- zfs_dbgmsg("restarting scan func=%u txg=%llu",
- func, (longlong_t)tx->tx_txg);
- dsl_scan_setup_sync(&func, tx);
- }
-
- /*
- * Only process scans in sync pass 1.
- */
- if (spa_sync_pass(spa) > 1)
- return;
-
- /*
- * If the spa is shutting down, then stop scanning. This will
- * ensure that the scan does not dirty any new data during the
- * shutdown phase.
- */
- if (spa_shutting_down(spa))
- return;
-
- /*
- * If the scan is inactive due to a stalled async destroy, try again.
- */
- if (!scn->scn_async_stalled && !dsl_scan_active(scn))
- return;
+ int err = 0;
- /* reset scan statistics */
- scn->scn_visited_this_txg = 0;
- scn->scn_holes_this_txg = 0;
- scn->scn_lt_min_this_txg = 0;
- scn->scn_gt_max_this_txg = 0;
- scn->scn_ddt_contained_this_txg = 0;
- scn->scn_objsets_visited_this_txg = 0;
- scn->scn_avg_seg_size_this_txg = 0;
- scn->scn_segs_this_txg = 0;
- scn->scn_avg_zio_size_this_txg = 0;
- scn->scn_zios_this_txg = 0;
- scn->scn_suspending = B_FALSE;
- scn->scn_sync_start_time = gethrtime();
- spa->spa_scrub_active = B_TRUE;
+ if (spa_suspend_async_destroy(spa))
+ return (0);
- /*
- * First process the async destroys. If we suspend, don't do
- * any scrubbing or resilvering. This ensures that there are no
- * async destroys while we are scanning, so the scan code doesn't
- * have to worry about traversing it. It is also faster to free the
- * blocks than to scrub them.
- */
if (zfs_free_bpobj_enabled &&
spa_version(spa) >= SPA_VERSION_DEADLISTS) {
scn->scn_is_bptree = B_FALSE;
ddt_sync(spa, tx->tx_txg);
}
if (err != 0)
- return;
+ return (err);
if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
zfs_free_leak_on_eio &&
(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
if (bpobj_is_empty(&dp->dp_obsolete_bpobj))
dsl_pool_destroy_obsolete_bpobj(dp, tx);
}
+ return (0);
+}
+
+/*
+ * This is the primary entry point for scans that is called from syncing
+ * context. Scans must happen entirely during syncing context so that we
+ * can guarantee that blocks we are currently scanning will not change out
+ * from under us. While a scan is active, this function controls how quickly
+ * transaction groups proceed, instead of the normal handling provided by
+ * txg_sync_thread().
+ */
+void
+dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ int err = 0;
+ dsl_scan_t *scn = dp->dp_scan;
+ spa_t *spa = dp->dp_spa;
+ state_sync_type_t sync_type = SYNC_OPTIONAL;
+
+ if (spa->spa_resilver_deferred &&
+ !spa_feature_is_active(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))
+ spa_feature_incr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
+
+ /*
+ * Check for scn_restart_txg before checking spa_load_state, so
+ * that we can restart an old-style scan while the pool is being
+ * imported (see dsl_scan_init). We also restart scans if there
+ * is a deferred resilver and the user has manually disabled
+ * deferred resilvers via the tunable.
+ */
+ if (dsl_scan_restarting(scn, tx) ||
+ (spa->spa_resilver_deferred && zfs_resilver_disable_defer)) {
+ pool_scan_func_t func = POOL_SCAN_SCRUB;
+ dsl_scan_done(scn, B_FALSE, tx);
+ if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+ func = POOL_SCAN_RESILVER;
+ zfs_dbgmsg("restarting scan func=%u txg=%llu",
+ func, (longlong_t)tx->tx_txg);
+ dsl_scan_setup_sync(&func, tx);
+ }
+
+ /*
+ * Only process scans in sync pass 1.
+ */
+ if (spa_sync_pass(spa) > 1)
+ return;
+
+ /*
+ * If the spa is shutting down, then stop scanning. This will
+ * ensure that the scan does not dirty any new data during the
+ * shutdown phase.
+ */
+ if (spa_shutting_down(spa))
+ return;
+
+ /*
+ * If the scan is inactive due to a stalled async destroy, try again.
+ */
+ if (!scn->scn_async_stalled && !dsl_scan_active(scn))
+ return;
+
+ /* reset scan statistics */
+ scn->scn_visited_this_txg = 0;
+ scn->scn_holes_this_txg = 0;
+ scn->scn_lt_min_this_txg = 0;
+ scn->scn_gt_max_this_txg = 0;
+ scn->scn_ddt_contained_this_txg = 0;
+ scn->scn_objsets_visited_this_txg = 0;
+ scn->scn_avg_seg_size_this_txg = 0;
+ scn->scn_segs_this_txg = 0;
+ scn->scn_avg_zio_size_this_txg = 0;
+ scn->scn_zios_this_txg = 0;
+ scn->scn_suspending = B_FALSE;
+ scn->scn_sync_start_time = gethrtime();
+ spa->spa_scrub_active = B_TRUE;
+
+ /*
+ * First process the async destroys. If we suspend, don't do
+ * any scrubbing or resilvering. This ensures that there are no
+ * async destroys while we are scanning, so the scan code doesn't
+ * have to worry about traversing it. It is also faster to free the
+ * blocks than to scrub them.
+ */
+ err = dsl_process_async_destroys(dp, tx);
+ if (err != 0)
+ return;
if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
return;
(longlong_t)tx->tx_txg);
}
} else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) {
+ ASSERT(scn->scn_clearing);
+
/* need to issue scrubbing IOs from per-vdev queues */
scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
NULL, ZIO_FLAG_CANFAIL);
boolean_t needs_io = B_FALSE;
int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
- if (phys_birth <= scn->scn_phys.scn_min_txg ||
- phys_birth >= scn->scn_phys.scn_max_txg)
- return (0);
- if (BP_IS_EMBEDDED(bp)) {
+ if (phys_birth <= scn->scn_phys.scn_min_txg ||
+ phys_birth >= scn->scn_phys.scn_max_txg) {
count_block(scn, dp->dp_blkstats, bp);
return (0);
}
+ /* Embedded BP's have phys_birth==0, so we reject them above. */
+ ASSERT(!BP_IS_EMBEDDED(bp));
+
ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
zio_flags |= ZIO_FLAG_SCRUB;
dsl_scan_freed_dva(spa, bp, i);
}
-#if defined(_KERNEL) && defined(HAVE_SPL)
+#if defined(_KERNEL)
/* CSTYLED */
module_param(zfs_scan_vdev_limit, ulong, 0644);
MODULE_PARM_DESC(zfs_scan_vdev_limit,
module_param(zfs_scan_fill_weight, int, 0644);
MODULE_PARM_DESC(zfs_scan_fill_weight,
"Tunable to adjust bias towards more filled segments during scans");
+
+module_param(zfs_resilver_disable_defer, int, 0644);
+MODULE_PARM_DESC(zfs_resilver_disable_defer,
+ "Process all resilvers immediately");
#endif