#include <sys/vdev_removal.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/vdev_indirect_births.h>
+#include <sys/vdev_initialize.h>
#include <sys/vdev_disk.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
dp = spa_get_dsl(spa);
dsl_pool_config_enter(dp, FTAG);
- if ((err = dsl_dataset_hold_obj(dp,
- za.za_first_integer, FTAG, &ds))) {
+ err = dsl_dataset_hold_obj(dp,
+ za.za_first_integer, FTAG, &ds);
+ if (err != 0) {
dsl_pool_config_exit(dp, FTAG);
break;
}
}
error = dmu_objset_hold(strval, FTAG, &os);
- if (error)
+ if (error != 0)
break;
/*
spa_create_zio_taskqs(spa);
}
- for (size_t i = 0; i < TXG_SIZE; i++)
- spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0);
+ for (size_t i = 0; i < TXG_SIZE; i++) {
+ spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+ }
list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_config_dirty_node));
*/
spa_async_suspend(spa);
+ if (spa->spa_root_vdev) {
+ vdev_initialize_stop_all(spa->spa_root_vdev,
+ VDEV_INITIALIZE_ACTIVE);
+ }
+
/*
* Stop syncing.
*/
* calling taskq_wait(mg_taskq).
*/
if (spa->spa_root_vdev != NULL) {
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
- spa_config_exit(spa, SCL_ALL, FTAG);
+ spa_config_exit(spa, SCL_ALL, spa);
}
if (spa->spa_mmp.mmp_thread)
}
if (spa->spa_condense_zthr != NULL) {
- ASSERT(!zthr_isrunning(spa->spa_condense_zthr));
zthr_destroy(spa->spa_condense_zthr);
spa->spa_condense_zthr = NULL;
}
if (spa->spa_checkpoint_discard_zthr != NULL) {
- ASSERT(!zthr_isrunning(spa->spa_checkpoint_discard_zthr));
zthr_destroy(spa->spa_checkpoint_discard_zthr);
spa->spa_checkpoint_discard_zthr = NULL;
}
bpobj_close(&spa->spa_deferred_bpobj);
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
/*
* Close all vdevs.
spa->spa_comment = NULL;
}
- spa_config_exit(spa, SCL_ALL, FTAG);
+ spa_config_exit(spa, SCL_ALL, spa);
}
/*
*/
spa_history_log_version(spa, "open", NULL);
+ spa_restart_removal(spa);
+ spa_spawn_aux_threads(spa);
+
/*
* Delete any inconsistent datasets.
+ *
+ * Note:
+ * Since we may be issuing deletes for clones here,
+ * we make sure to do so after we've spawned all the
+ * auxiliary threads above (from which the livelist
+ * deletion zthr is part of).
*/
(void) dmu_objset_find(spa_name(spa),
dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
*/
dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
- spa_restart_removal(spa);
-
- spa_spawn_aux_threads(spa);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_initialize_restart(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
}
spa_load_note(spa, "LOADED");
uint64_t txg = TXG_INITIAL;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
- uint64_t version, obj, root_dsobj = 0;
+ uint64_t version, obj;
boolean_t has_features;
boolean_t has_encryption;
spa_feature_t feat;
dmu_tx_commit(tx);
- /*
- * If the root dataset is encrypted we will need to create key mappings
- * for the zio layer before we start to write any data to disk and hold
- * them until after the first txg has been synced. Waiting for the first
- * transaction to complete also ensures that our bean counters are
- * appropriately updated.
- */
- if (dp->dp_root_dir->dd_crypto_obj != 0) {
- root_dsobj = dsl_dir_phys(dp->dp_root_dir)->dd_head_dataset_obj;
- VERIFY0(spa_keystore_create_mapping_impl(spa, root_dsobj,
- dp->dp_root_dir, FTAG));
- }
-
spa->spa_sync_on = B_TRUE;
txg_sync_start(dp);
mmp_thread_start(spa);
txg_wait_synced(dp, txg);
- if (dp->dp_root_dir->dd_crypto_obj != 0)
- VERIFY0(spa_keystore_remove_mapping(spa, root_dsobj, FTAG));
-
spa_spawn_aux_threads(spa);
spa_write_cachefile(spa, B_FALSE, B_TRUE);
return (SET_ERROR(EXDEV));
}
+ /*
+ * We're about to export or destroy this pool. Make sure
+ * we stop all initializtion activity here before we
+ * set the spa_final_txg. This will ensure that all
+ * dirty data resulting from the initialization is
+ * committed to disk before we unload the pool.
+ */
+ if (spa->spa_root_vdev != NULL) {
+ vdev_initialize_stop_all(spa->spa_root_vdev,
+ VDEV_INITIALIZE_ACTIVE);
+ }
+
/*
* We want this to be reflected on every label,
* so mark them all dirty. spa_unload() will do the
/*
* Schedule the resilver to restart in the future. We do this to
* ensure that dmu_sync-ed blocks have been stitched into the
- * respective datasets.
+ * respective datasets. We do not do this if resilvers have been
+ * deferred.
*/
- dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+ if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
+ vdev_set_deferred_resilver(spa, newvd);
+ else
+ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
if (spa->spa_bootfs)
spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
return (error);
}
+static int
+spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
+ list_t *vd_list)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+
+ /* Look up vdev and ensure it's a leaf. */
+ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_detached) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(ENODEV));
+ } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(EINVAL));
+ } else if (!vdev_writeable(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(EROFS));
+ }
+ mutex_enter(&vd->vdev_initialize_lock);
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+ /*
+ * When we activate an initialize action we check to see
+ * if the vdev_initialize_thread is NULL. We do this instead
+ * of using the vdev_initialize_state since there might be
+ * a previous initialization process which has completed but
+ * the thread is not exited.
+ */
+ if (cmd_type == POOL_INITIALIZE_DO &&
+ (vd->vdev_initialize_thread != NULL ||
+ vd->vdev_top->vdev_removing)) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ return (SET_ERROR(EBUSY));
+ } else if (cmd_type == POOL_INITIALIZE_CANCEL &&
+ (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
+ vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ return (SET_ERROR(ESRCH));
+ } else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
+ vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ return (SET_ERROR(ESRCH));
+ }
+
+ switch (cmd_type) {
+ case POOL_INITIALIZE_DO:
+ vdev_initialize(vd);
+ break;
+ case POOL_INITIALIZE_CANCEL:
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list);
+ break;
+ case POOL_INITIALIZE_SUSPEND:
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);
+ break;
+ default:
+ panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ return (0);
+}
+
+int
+spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
+ nvlist_t *vdev_errlist)
+{
+ int total_errors = 0;
+ list_t vd_list;
+
+ list_create(&vd_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_initialize_node));
+
+ /*
+ * We hold the namespace lock through the whole function
+ * to prevent any changes to the pool while we're starting or
+ * stopping initialization. The config and state locks are held so that
+ * we can properly assess the vdev state before we commit to
+ * the initializing operation.
+ */
+ mutex_enter(&spa_namespace_lock);
+
+ for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
+ uint64_t vdev_guid = fnvpair_value_uint64(pair);
+
+ int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type,
+ &vd_list);
+ if (error != 0) {
+ char guid_as_str[MAXNAMELEN];
+
+ (void) snprintf(guid_as_str, sizeof (guid_as_str),
+ "%llu", (unsigned long long)vdev_guid);
+ fnvlist_add_int64(vdev_errlist, guid_as_str, error);
+ total_errors++;
+ }
+ }
+
+ /* Wait for all initialize threads to stop. */
+ vdev_initialize_stop_wait(spa, &vd_list);
+
+ /* Sync out the initializing state */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ mutex_exit(&spa_namespace_lock);
+
+ list_destroy(&vd_list);
+
+ return (total_errors);
+}
+
/*
* Split a set of devices from their mirrors, and create a new pool from them.
*/
spa_activate(newspa, spa_mode_global);
spa_async_suspend(newspa);
+ /*
+ * Temporarily stop the initializing activity. We set the state to
+ * ACTIVE so that we know to resume the initializing once the split
+ * has completed.
+ */
+ list_t vd_list;
+ list_create(&vd_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_initialize_node));
+
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL) {
+ mutex_enter(&vml[c]->vdev_initialize_lock);
+ vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE,
+ &vd_list);
+ mutex_exit(&vml[c]->vdev_initialize_lock);
+ }
+ }
+ vdev_initialize_stop_wait(spa, &vd_list);
+ list_destroy(&vd_list);
+
newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
/* create the new pool from the disks of the original pool */
if (vml[c] != NULL)
vml[c]->vdev_offline = B_FALSE;
}
+
+ /* restart initializing disks as necessary */
+ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+
vdev_reopen(spa->spa_root_vdev);
nvlist_free(spa->spa_config_splitting);
spa_async_thread(void *arg)
{
spa_t *spa = (spa_t *)arg;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
int tasks;
ASSERT(spa->spa_sync_on);
/*
* Kick off a resilver.
*/
- if (tasks & SPA_ASYNC_RESILVER)
- dsl_resilver_restart(spa->spa_dsl_pool, 0);
+ if (tasks & SPA_ASYNC_RESILVER &&
+ (!dsl_scan_resilvering(dp) ||
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
+ dsl_resilver_restart(dp, 0);
+
+ if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_initialize_restart(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
/*
* Let the world know that we're done.
spa_vdev_remove_suspend(spa);
zthr_t *condense_thread = spa->spa_condense_zthr;
- if (condense_thread != NULL && zthr_isrunning(condense_thread))
- VERIFY0(zthr_cancel(condense_thread));
+ if (condense_thread != NULL)
+ zthr_cancel(condense_thread);
zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
- if (discard_thread != NULL && zthr_isrunning(discard_thread))
- VERIFY0(zthr_cancel(discard_thread));
+ if (discard_thread != NULL)
+ zthr_cancel(discard_thread);
}
void
spa_restart_removal(spa);
zthr_t *condense_thread = spa->spa_condense_zthr;
- if (condense_thread != NULL && !zthr_isrunning(condense_thread))
+ if (condense_thread != NULL)
zthr_resume(condense_thread);
zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
- if (discard_thread != NULL && !zthr_isrunning(discard_thread))
+ if (discard_thread != NULL)
zthr_resume(discard_thread);
}
static void
spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
{
+ if (spa_sync_pass(spa) != 1)
+ return;
+
zio_t *zio = zio_root(spa, NULL, NULL, 0);
VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
spa_free_sync_cb, zio, tx), ==, 0);
static void
spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
{
- dsl_pool_t *dp = spa->spa_dsl_pool;
-
- ASSERT(spa->spa_sync_pass == 1);
+ if (spa_sync_pass(spa) != 1)
+ return;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
ASSERT(vib != NULL);
}
- if (vdev_obsolete_sm_object(vd) != 0) {
+ uint64_t obsolete_sm_object = 0;
+ ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+ if (obsolete_sm_object != 0) {
ASSERT(vd->vdev_obsolete_sm != NULL);
ASSERT(vd->vdev_removing ||
vd->vdev_ops == &vdev_indirect_ops);
ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
-
- ASSERT3U(vdev_obsolete_sm_object(vd), ==,
+ ASSERT3U(obsolete_sm_object, ==,
space_map_object(vd->vdev_obsolete_sm));
ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
space_map_allocated(vd->vdev_obsolete_sm));
}
/*
- * Sync the specified transaction group. New blocks may be dirtied as
- * part of the process, so we iterate until it converges.
+ * Set the top-level vdev's max queue depth. Evaluate each top-level's
+ * async write queue depth in case it changed. The max queue depth will
+ * not change in the middle of syncing out this txg.
*/
-void
-spa_sync(spa_t *spa, uint64_t txg)
+static void
+spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
{
- dsl_pool_t *dp = spa->spa_dsl_pool;
- objset_t *mos = spa->spa_meta_objset;
- bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
- metaslab_class_t *normal = spa_normal_class(spa);
- metaslab_class_t *special = spa_special_class(spa);
- metaslab_class_t *dedup = spa_dedup_class(spa);
+ ASSERT(spa_writeable(spa));
+
vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *vd;
- dmu_tx_t *tx;
- int error;
uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
zfs_vdev_queue_depth_pct / 100;
+ metaslab_class_t *normal = spa_normal_class(spa);
+ metaslab_class_t *special = spa_special_class(spa);
+ metaslab_class_t *dedup = spa_dedup_class(spa);
- VERIFY(spa_writeable(spa));
-
- /*
- * Wait for i/os issued in open context that need to complete
- * before this txg syncs.
- */
- VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK]));
- spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0);
-
- /*
- * Lock out configuration changes.
- */
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-
- spa->spa_syncing_txg = txg;
- spa->spa_sync_pass = 0;
-
- for (int i = 0; i < spa->spa_alloc_count; i++) {
- mutex_enter(&spa->spa_alloc_locks[i]);
- VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
- mutex_exit(&spa->spa_alloc_locks[i]);
- }
-
- /*
- * If there are any pending vdev state changes, convert them
- * into config changes that go out with this transaction group.
- */
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
- while (list_head(&spa->spa_state_dirty_list) != NULL) {
- /*
- * We need the write lock here because, for aux vdevs,
- * calling vdev_config_dirty() modifies sav_config.
- * This is ugly and will become unnecessary when we
- * eliminate the aux vdev wart by integrating all vdevs
- * into the root vdev tree.
- */
- spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
- spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
- while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
- vdev_state_clean(vd);
- vdev_config_dirty(vd);
- }
- spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
- spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
- }
- spa_config_exit(spa, SCL_STATE, FTAG);
-
- tx = dmu_tx_create_assigned(dp, txg);
-
- spa->spa_sync_starttime = gethrtime();
- taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
- spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
- spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
- NSEC_TO_TICK(spa->spa_deadman_synctime));
-
- /*
- * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
- * set spa_deflate if we have no raid-z vdevs.
- */
- if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
- spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
- int i;
-
- for (i = 0; i < rvd->vdev_children; i++) {
- vd = rvd->vdev_child[i];
- if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
- break;
- }
- if (i == rvd->vdev_children) {
- spa->spa_deflate = TRUE;
- VERIFY(0 == zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
- sizeof (uint64_t), 1, &spa->spa_deflate, tx));
- }
- }
-
- /*
- * Set the top-level vdev's max queue depth. Evaluate each
- * top-level's async write queue depth in case it changed.
- * The max queue depth will not change in the middle of syncing
- * out this txg.
- */
uint64_t slots_per_allocator = 0;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
- metaslab_group_t *mg = tvd->vdev_mg;
- metaslab_class_t *mc;
+ metaslab_group_t *mg = tvd->vdev_mg;
if (mg == NULL || !metaslab_group_initialized(mg))
continue;
- mc = mg->mg_class;
+ metaslab_class_t *mc = mg->mg_class;
if (mc != normal && mc != special && mc != dedup)
continue;
normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+}
+static void
+spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx)
+{
+ ASSERT(spa_writeable(spa));
+
+ vdev_t *rvd = spa->spa_root_vdev;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
vdev_indirect_state_sync_verify(vd);
break;
}
}
+}
+
+static void
+spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ uint64_t txg = tx->tx_txg;
+ bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
- /*
- * Iterate to convergence.
- */
do {
int pass = ++spa->spa_sync_pass;
ddt_sync(spa, txg);
dsl_scan_sync(dp, tx);
+ svr_sync(spa, tx);
+ spa_sync_upgrades(spa, tx);
- if (spa->spa_vdev_removal != NULL)
- svr_sync(spa, tx);
-
+ vdev_t *vd = NULL;
while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
!= NULL)
vdev_sync(vd, txg);
- if (pass == 1) {
- spa_sync_upgrades(spa, tx);
- ASSERT3U(txg, >=,
- spa->spa_uberblock.ub_rootbp.blk_birth);
+ /*
+ * Note: We need to check if the MOS is dirty because we could
+ * have marked the MOS dirty without updating the uberblock
+ * (e.g. if we have sync tasks but no dirty user data). We need
+ * to check the uberblock's rootbp because it is updated if we
+ * have synced out dirty data (though in this case the MOS will
+ * most likely also be dirty due to second order effects, we
+ * don't want to rely on that here).
+ */
+ if (pass == 1 &&
+ spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+ !dmu_objset_is_dirty(mos, txg)) {
/*
- * Note: We need to check if the MOS is dirty
- * because we could have marked the MOS dirty
- * without updating the uberblock (e.g. if we
- * have sync tasks but no dirty user data). We
- * need to check the uberblock's rootbp because
- * it is updated if we have synced out dirty
- * data (though in this case the MOS will most
- * likely also be dirty due to second order
- * effects, we don't want to rely on that here).
+ * Nothing changed on the first pass, therefore this
+ * TXG is a no-op. Avoid syncing deferred frees, so
+ * that we can keep this TXG as a no-op.
*/
- if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
- !dmu_objset_is_dirty(mos, txg)) {
- /*
- * Nothing changed on the first pass,
- * therefore this TXG is a no-op. Avoid
- * syncing deferred frees, so that we
- * can keep this TXG as a no-op.
- */
- ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
- txg));
- ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
- ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
- ASSERT(txg_list_empty(&dp->dp_early_sync_tasks,
- txg));
- break;
- }
- spa_sync_deferred_frees(spa, tx);
+ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
+ ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+ ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
+ ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg));
+ break;
}
+ spa_sync_deferred_frees(spa, tx);
} while (dmu_objset_is_dirty(mos, txg));
+}
-#ifdef ZFS_DEBUG
- if (!list_is_empty(&spa->spa_config_dirty_list)) {
- /*
- * Make sure that the number of ZAPs for all the vdevs matches
- * the number of ZAPs in the per-vdev ZAP list. This only gets
- * called if the config is dirty; otherwise there may be
- * outstanding AVZ operations that weren't completed in
- * spa_sync_config_object.
- */
- uint64_t all_vdev_zap_entry_count;
- ASSERT0(zap_count(spa->spa_meta_objset,
- spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
- ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
- all_vdev_zap_entry_count);
- }
-#endif
-
- if (spa->spa_vdev_removal != NULL) {
- ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
- }
+/*
+ * Rewrite the vdev configuration (which includes the uberblock) to
+ * commit the transaction group.
+ *
+ * If there are no dirty vdevs, we sync the uberblock to a few random
+ * top-level vdevs that are known to be visible in the config cache
+ * (see spa_vdev_add() for a complete description). If there *are* dirty
+ * vdevs, sync the uberblock to all vdevs.
+ */
+static void
+spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t txg = tx->tx_txg;
- /*
- * Rewrite the vdev configuration (which includes the uberblock)
- * to commit the transaction group.
- *
- * If there are no dirty vdevs, we sync the uberblock to a few
- * random top-level vdevs that are known to be visible in the
- * config cache (see spa_vdev_add() for a complete description).
- * If there *are* dirty vdevs, sync the uberblock to all vdevs.
- */
for (;;) {
+ int error = 0;
+
/*
* We hold SCL_STATE to prevent vdev open/close/etc.
* while we're attempting to write the vdev labels.
int c0 = spa_get_random(children);
for (int c = 0; c < children; c++) {
- vd = rvd->vdev_child[(c0 + c) % children];
+ vdev_t *vd =
+ rvd->vdev_child[(c0 + c) % children];
/* Stop when revisiting the first vdev */
if (c > 0 && svd[0] == vd)
break;
- if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
+ if (vd->vdev_ms_array == 0 ||
+ vd->vdev_islog ||
!vdev_is_concrete(vd))
continue;
zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
zio_resume_wait(spa);
}
+}
+
+/*
+ * Sync the specified transaction group. New blocks may be dirtied as
+ * part of the process, so we iterate until it converges.
+ */
+void
+spa_sync(spa_t *spa, uint64_t txg)
+{
+ vdev_t *vd = NULL;
+
+ VERIFY(spa_writeable(spa));
+
+ /*
+ * Wait for i/os issued in open context that need to complete
+ * before this txg syncs.
+ */
+ (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
+ spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+
+ /*
+ * Lock out configuration changes.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ spa->spa_syncing_txg = txg;
+ spa->spa_sync_pass = 0;
+
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_enter(&spa->spa_alloc_locks[i]);
+ VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+ mutex_exit(&spa->spa_alloc_locks[i]);
+ }
+
+ /*
+ * If there are any pending vdev state changes, convert them
+ * into config changes that go out with this transaction group.
+ */
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ while (list_head(&spa->spa_state_dirty_list) != NULL) {
+ /*
+ * We need the write lock here because, for aux vdevs,
+ * calling vdev_config_dirty() modifies sav_config.
+ * This is ugly and will become unnecessary when we
+ * eliminate the aux vdev wart by integrating all vdevs
+ * into the root vdev tree.
+ */
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
+ while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
+ vdev_state_clean(vd);
+ vdev_config_dirty(vd);
+ }
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+ }
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+
+ spa->spa_sync_starttime = gethrtime();
+ taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
+ spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
+ spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
+ NSEC_TO_TICK(spa->spa_deadman_synctime));
+
+ /*
+ * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
+ * set spa_deflate if we have no raid-z vdevs.
+ */
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ int i;
+ for (i = 0; i < rvd->vdev_children; i++) {
+ vd = rvd->vdev_child[i];
+ if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
+ break;
+ }
+ if (i == rvd->vdev_children) {
+ spa->spa_deflate = TRUE;
+ VERIFY0(zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+ sizeof (uint64_t), 1, &spa->spa_deflate, tx));
+ }
+ }
+
+ spa_sync_adjust_vdev_max_queue_depth(spa);
+
+ spa_sync_condense_indirect(spa, tx);
+
+ spa_sync_iterate_to_convergence(spa, tx);
+
+#ifdef ZFS_DEBUG
+ if (!list_is_empty(&spa->spa_config_dirty_list)) {
+ /*
+ * Make sure that the number of ZAPs for all the vdevs matches
+ * the number of ZAPs in the per-vdev ZAP list. This only gets
+ * called if the config is dirty; otherwise there may be
+ * outstanding AVZ operations that weren't completed in
+ * spa_sync_config_object.
+ */
+ uint64_t all_vdev_zap_entry_count;
+ ASSERT0(zap_count(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
+ ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
+ all_vdev_zap_entry_count);
+ }
+#endif
+
+ if (spa->spa_vdev_removal != NULL) {
+ ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
+ }
+
+ spa_sync_rewrite_vdev_config(spa, tx);
dmu_tx_commit(tx);
taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
/*
* Update usable space statistics.
*/
- while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))))
+ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
+ != NULL)
vdev_sync_done(vd, txg);
spa_update_dspace(spa);