From: Serapheim Dimitropoulos Date: Fri, 18 Jan 2019 17:50:16 +0000 (-0800) Subject: Simplify spa_sync by breaking it up to smaller functions X-Git-Tag: zfs-0.8.0~217 X-Git-Url: https://git.proxmox.com/?p=mirror_zfs.git;a=commitdiff_plain;h=8dc2197b7b1e4d7ebc1420ea30e51c6541f1d834 Simplify spa_sync by breaking it up to smaller functions The point of this refactoring is to break the high-level conceptual steps of spa_sync() to their own helper functions. In general large functions can enhance readability if structured well, but in this case the amount of conceptual steps taken could use the help of helper functions. Reviewed-by: Matt Ahrens Reviewed-by: Brian Behlendorf Signed-off-by: Serapheim Dimitropoulos Closes #8293 --- diff --git a/module/zfs/spa.c b/module/zfs/spa.c index c98daab49..bbe2f8962 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -7323,6 +7323,9 @@ spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) static void spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) { + if (spa_sync_pass(spa) != 1) + return; + zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, spa_free_sync_cb, zio, tx), ==, 0); @@ -7718,10 +7721,10 @@ spa_sync_props(void *arg, dmu_tx_t *tx) static void spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) { - dsl_pool_t *dp = spa->spa_dsl_pool; - - ASSERT(spa->spa_sync_pass == 1); + if (spa_sync_pass(spa) != 1) + return; + dsl_pool_t *dp = spa->spa_dsl_pool; rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && @@ -7817,118 +7820,31 @@ vdev_indirect_state_sync_verify(vdev_t *vd) } /* - * Sync the specified transaction group. New blocks may be dirtied as - * part of the process, so we iterate until it converges. + * Set the top-level vdev's max queue depth. Evaluate each top-level's + * async write queue depth in case it changed. The max queue depth will + * not change in the middle of syncing out this txg. */ -void -spa_sync(spa_t *spa, uint64_t txg) +static void +spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) { - dsl_pool_t *dp = spa->spa_dsl_pool; - objset_t *mos = spa->spa_meta_objset; - bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; - metaslab_class_t *normal = spa_normal_class(spa); - metaslab_class_t *special = spa_special_class(spa); - metaslab_class_t *dedup = spa_dedup_class(spa); + ASSERT(spa_writeable(spa)); + vdev_t *rvd = spa->spa_root_vdev; - vdev_t *vd; - dmu_tx_t *tx; - int error; uint32_t max_queue_depth = zfs_vdev_async_write_max_active * zfs_vdev_queue_depth_pct / 100; + metaslab_class_t *normal = spa_normal_class(spa); + metaslab_class_t *special = spa_special_class(spa); + metaslab_class_t *dedup = spa_dedup_class(spa); - VERIFY(spa_writeable(spa)); - - /* - * Wait for i/os issued in open context that need to complete - * before this txg syncs. - */ - (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); - spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL); - - /* - * Lock out configuration changes. - */ - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - - spa->spa_syncing_txg = txg; - spa->spa_sync_pass = 0; - - for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_enter(&spa->spa_alloc_locks[i]); - VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); - mutex_exit(&spa->spa_alloc_locks[i]); - } - - /* - * If there are any pending vdev state changes, convert them - * into config changes that go out with this transaction group. - */ - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - while (list_head(&spa->spa_state_dirty_list) != NULL) { - /* - * We need the write lock here because, for aux vdevs, - * calling vdev_config_dirty() modifies sav_config. - * This is ugly and will become unnecessary when we - * eliminate the aux vdev wart by integrating all vdevs - * into the root vdev tree. - */ - spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); - spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); - while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { - vdev_state_clean(vd); - vdev_config_dirty(vd); - } - spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); - spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); - } - spa_config_exit(spa, SCL_STATE, FTAG); - - tx = dmu_tx_create_assigned(dp, txg); - - spa->spa_sync_starttime = gethrtime(); - taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); - spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, - spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + - NSEC_TO_TICK(spa->spa_deadman_synctime)); - - /* - * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, - * set spa_deflate if we have no raid-z vdevs. - */ - if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && - spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { - int i; - - for (i = 0; i < rvd->vdev_children; i++) { - vd = rvd->vdev_child[i]; - if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) - break; - } - if (i == rvd->vdev_children) { - spa->spa_deflate = TRUE; - VERIFY(0 == zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, - sizeof (uint64_t), 1, &spa->spa_deflate, tx)); - } - } - - /* - * Set the top-level vdev's max queue depth. Evaluate each - * top-level's async write queue depth in case it changed. - * The max queue depth will not change in the middle of syncing - * out this txg. - */ uint64_t slots_per_allocator = 0; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; - metaslab_class_t *mc; + metaslab_group_t *mg = tvd->vdev_mg; if (mg == NULL || !metaslab_group_initialized(mg)) continue; - mc = mg->mg_class; + metaslab_class_t *mc = mg->mg_class; if (mc != normal && mc != special && mc != dedup) continue; @@ -7960,7 +7876,14 @@ spa_sync(spa_t *spa, uint64_t txg) normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; +} + +static void +spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) +{ + ASSERT(spa_writeable(spa)); + vdev_t *rvd = spa->spa_root_vdev; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_indirect_state_sync_verify(vd); @@ -7970,10 +7893,16 @@ spa_sync(spa_t *spa, uint64_t txg) break; } } +} + +static void +spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) +{ + objset_t *mos = spa->spa_meta_objset; + dsl_pool_t *dp = spa->spa_dsl_pool; + uint64_t txg = tx->tx_txg; + bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; - /* - * Iterate to convergence. - */ do { int pass = ++spa->spa_sync_pass; @@ -7999,81 +7928,60 @@ spa_sync(spa_t *spa, uint64_t txg) ddt_sync(spa, txg); dsl_scan_sync(dp, tx); + svr_sync(spa, tx); + spa_sync_upgrades(spa, tx); - if (spa->spa_vdev_removal != NULL) - svr_sync(spa, tx); - + vdev_t *vd = NULL; while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) != NULL) vdev_sync(vd, txg); - if (pass == 1) { - spa_sync_upgrades(spa, tx); - ASSERT3U(txg, >=, - spa->spa_uberblock.ub_rootbp.blk_birth); + /* + * Note: We need to check if the MOS is dirty because we could + * have marked the MOS dirty without updating the uberblock + * (e.g. if we have sync tasks but no dirty user data). We need + * to check the uberblock's rootbp because it is updated if we + * have synced out dirty data (though in this case the MOS will + * most likely also be dirty due to second order effects, we + * don't want to rely on that here). + */ + if (pass == 1 && + spa->spa_uberblock.ub_rootbp.blk_birth < txg && + !dmu_objset_is_dirty(mos, txg)) { /* - * Note: We need to check if the MOS is dirty - * because we could have marked the MOS dirty - * without updating the uberblock (e.g. if we - * have sync tasks but no dirty user data). We - * need to check the uberblock's rootbp because - * it is updated if we have synced out dirty - * data (though in this case the MOS will most - * likely also be dirty due to second order - * effects, we don't want to rely on that here). + * Nothing changed on the first pass, therefore this + * TXG is a no-op. Avoid syncing deferred frees, so + * that we can keep this TXG as a no-op. */ - if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && - !dmu_objset_is_dirty(mos, txg)) { - /* - * Nothing changed on the first pass, - * therefore this TXG is a no-op. Avoid - * syncing deferred frees, so that we - * can keep this TXG as a no-op. - */ - ASSERT(txg_list_empty(&dp->dp_dirty_datasets, - txg)); - ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); - ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); - ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, - txg)); - break; - } - spa_sync_deferred_frees(spa, tx); + ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); + ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); + ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); + ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); + break; } + spa_sync_deferred_frees(spa, tx); } while (dmu_objset_is_dirty(mos, txg)); +} -#ifdef ZFS_DEBUG - if (!list_is_empty(&spa->spa_config_dirty_list)) { - /* - * Make sure that the number of ZAPs for all the vdevs matches - * the number of ZAPs in the per-vdev ZAP list. This only gets - * called if the config is dirty; otherwise there may be - * outstanding AVZ operations that weren't completed in - * spa_sync_config_object. - */ - uint64_t all_vdev_zap_entry_count; - ASSERT0(zap_count(spa->spa_meta_objset, - spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); - ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, - all_vdev_zap_entry_count); - } -#endif - - if (spa->spa_vdev_removal != NULL) { - ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); - } +/* + * Rewrite the vdev configuration (which includes the uberblock) to + * commit the transaction group. + * + * If there are no dirty vdevs, we sync the uberblock to a few random + * top-level vdevs that are known to be visible in the config cache + * (see spa_vdev_add() for a complete description). If there *are* dirty + * vdevs, sync the uberblock to all vdevs. + */ +static void +spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t txg = tx->tx_txg; - /* - * Rewrite the vdev configuration (which includes the uberblock) - * to commit the transaction group. - * - * If there are no dirty vdevs, we sync the uberblock to a few - * random top-level vdevs that are known to be visible in the - * config cache (see spa_vdev_add() for a complete description). - * If there *are* dirty vdevs, sync the uberblock to all vdevs. - */ for (;;) { + int error = 0; + /* * We hold SCL_STATE to prevent vdev open/close/etc. * while we're attempting to write the vdev labels. @@ -8087,13 +7995,15 @@ spa_sync(spa_t *spa, uint64_t txg) int c0 = spa_get_random(children); for (int c = 0; c < children; c++) { - vd = rvd->vdev_child[(c0 + c) % children]; + vdev_t *vd = + rvd->vdev_child[(c0 + c) % children]; /* Stop when revisiting the first vdev */ if (c > 0 && svd[0] == vd) break; - if (vd->vdev_ms_array == 0 || vd->vdev_islog || + if (vd->vdev_ms_array == 0 || + vd->vdev_islog || !vdev_is_concrete(vd)) continue; @@ -8117,6 +8027,124 @@ spa_sync(spa_t *spa, uint64_t txg) zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); zio_resume_wait(spa); } +} + +/* + * Sync the specified transaction group. New blocks may be dirtied as + * part of the process, so we iterate until it converges. + */ +void +spa_sync(spa_t *spa, uint64_t txg) +{ + vdev_t *vd = NULL; + + VERIFY(spa_writeable(spa)); + + /* + * Wait for i/os issued in open context that need to complete + * before this txg syncs. + */ + (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); + spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + + /* + * Lock out configuration changes. + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + spa->spa_syncing_txg = txg; + spa->spa_sync_pass = 0; + + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_enter(&spa->spa_alloc_locks[i]); + VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); + mutex_exit(&spa->spa_alloc_locks[i]); + } + + /* + * If there are any pending vdev state changes, convert them + * into config changes that go out with this transaction group. + */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + while (list_head(&spa->spa_state_dirty_list) != NULL) { + /* + * We need the write lock here because, for aux vdevs, + * calling vdev_config_dirty() modifies sav_config. + * This is ugly and will become unnecessary when we + * eliminate the aux vdev wart by integrating all vdevs + * into the root vdev tree. + */ + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); + while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { + vdev_state_clean(vd); + vdev_config_dirty(vd); + } + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + } + spa_config_exit(spa, SCL_STATE, FTAG); + + dsl_pool_t *dp = spa->spa_dsl_pool; + dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); + + spa->spa_sync_starttime = gethrtime(); + taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); + spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, + spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + + NSEC_TO_TICK(spa->spa_deadman_synctime)); + + /* + * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, + * set spa_deflate if we have no raid-z vdevs. + */ + if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && + spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { + vdev_t *rvd = spa->spa_root_vdev; + + int i; + for (i = 0; i < rvd->vdev_children; i++) { + vd = rvd->vdev_child[i]; + if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) + break; + } + if (i == rvd->vdev_children) { + spa->spa_deflate = TRUE; + VERIFY0(zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, + sizeof (uint64_t), 1, &spa->spa_deflate, tx)); + } + } + + spa_sync_adjust_vdev_max_queue_depth(spa); + + spa_sync_condense_indirect(spa, tx); + + spa_sync_iterate_to_convergence(spa, tx); + +#ifdef ZFS_DEBUG + if (!list_is_empty(&spa->spa_config_dirty_list)) { + /* + * Make sure that the number of ZAPs for all the vdevs matches + * the number of ZAPs in the per-vdev ZAP list. This only gets + * called if the config is dirty; otherwise there may be + * outstanding AVZ operations that weren't completed in + * spa_sync_config_object. + */ + uint64_t all_vdev_zap_entry_count; + ASSERT0(zap_count(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); + ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, + all_vdev_zap_entry_count); + } +#endif + + if (spa->spa_vdev_removal != NULL) { + ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); + } + + spa_sync_rewrite_vdev_config(spa, tx); dmu_tx_commit(tx); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 1896e824f..8d8900787 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -1798,6 +1798,9 @@ svr_sync(spa_t *spa, dmu_tx_t *tx) spa_vdev_removal_t *svr = spa->spa_vdev_removal; int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + if (svr == NULL) + return; + /* * This check is necessary so that we do not dirty the * DIRECTORY_OBJECT via spa_sync_removing_state() when there