]> git.proxmox.com Git - mirror_zfs.git/commitdiff
Simplify spa_sync by breaking it up to smaller functions
authorSerapheim Dimitropoulos <serapheimd@gmail.com>
Fri, 18 Jan 2019 17:50:16 +0000 (09:50 -0800)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Fri, 18 Jan 2019 17:50:16 +0000 (09:50 -0800)
The point of this refactoring is to break the high-level conceptual
steps of spa_sync() to their own helper functions. In general large
functions can enhance readability if structured well, but in this
case the amount of conceptual steps taken could use the help of
helper functions.

Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Closes #8293

module/zfs/spa.c
module/zfs/vdev_removal.c

index c98daab49af1da9b16667a34e704fc3ee76ba5b4..bbe2f89629a5ab1a905522dbf396e53728f3e8a9 100644 (file)
@@ -7323,6 +7323,9 @@ spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
 static void
 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
 {
 static void
 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
 {
+       if (spa_sync_pass(spa) != 1)
+               return;
+
        zio_t *zio = zio_root(spa, NULL, NULL, 0);
        VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
            spa_free_sync_cb, zio, tx), ==, 0);
        zio_t *zio = zio_root(spa, NULL, NULL, 0);
        VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
            spa_free_sync_cb, zio, tx), ==, 0);
@@ -7718,10 +7721,10 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
 static void
 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
 {
 static void
 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
 {
-       dsl_pool_t *dp = spa->spa_dsl_pool;
-
-       ASSERT(spa->spa_sync_pass == 1);
+       if (spa_sync_pass(spa) != 1)
+               return;
 
 
+       dsl_pool_t *dp = spa->spa_dsl_pool;
        rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 
        if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
        rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 
        if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
@@ -7817,118 +7820,31 @@ vdev_indirect_state_sync_verify(vdev_t *vd)
 }
 
 /*
 }
 
 /*
- * Sync the specified transaction group.  New blocks may be dirtied as
- * part of the process, so we iterate until it converges.
+ * Set the top-level vdev's max queue depth. Evaluate each top-level's
+ * async write queue depth in case it changed. The max queue depth will
+ * not change in the middle of syncing out this txg.
  */
  */
-void
-spa_sync(spa_t *spa, uint64_t txg)
+static void
+spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
 {
 {
-       dsl_pool_t *dp = spa->spa_dsl_pool;
-       objset_t *mos = spa->spa_meta_objset;
-       bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
-       metaslab_class_t *normal = spa_normal_class(spa);
-       metaslab_class_t *special = spa_special_class(spa);
-       metaslab_class_t *dedup = spa_dedup_class(spa);
+       ASSERT(spa_writeable(spa));
+
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *rvd = spa->spa_root_vdev;
-       vdev_t *vd;
-       dmu_tx_t *tx;
-       int error;
        uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
            zfs_vdev_queue_depth_pct / 100;
        uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
            zfs_vdev_queue_depth_pct / 100;
+       metaslab_class_t *normal = spa_normal_class(spa);
+       metaslab_class_t *special = spa_special_class(spa);
+       metaslab_class_t *dedup = spa_dedup_class(spa);
 
 
-       VERIFY(spa_writeable(spa));
-
-       /*
-        * Wait for i/os issued in open context that need to complete
-        * before this txg syncs.
-        */
-       (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
-       spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
-           ZIO_FLAG_CANFAIL);
-
-       /*
-        * Lock out configuration changes.
-        */
-       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-
-       spa->spa_syncing_txg = txg;
-       spa->spa_sync_pass = 0;
-
-       for (int i = 0; i < spa->spa_alloc_count; i++) {
-               mutex_enter(&spa->spa_alloc_locks[i]);
-               VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
-               mutex_exit(&spa->spa_alloc_locks[i]);
-       }
-
-       /*
-        * If there are any pending vdev state changes, convert them
-        * into config changes that go out with this transaction group.
-        */
-       spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
-       while (list_head(&spa->spa_state_dirty_list) != NULL) {
-               /*
-                * We need the write lock here because, for aux vdevs,
-                * calling vdev_config_dirty() modifies sav_config.
-                * This is ugly and will become unnecessary when we
-                * eliminate the aux vdev wart by integrating all vdevs
-                * into the root vdev tree.
-                */
-               spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
-               spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
-               while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
-                       vdev_state_clean(vd);
-                       vdev_config_dirty(vd);
-               }
-               spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
-               spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
-       }
-       spa_config_exit(spa, SCL_STATE, FTAG);
-
-       tx = dmu_tx_create_assigned(dp, txg);
-
-       spa->spa_sync_starttime = gethrtime();
-       taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
-       spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
-           spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
-           NSEC_TO_TICK(spa->spa_deadman_synctime));
-
-       /*
-        * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
-        * set spa_deflate if we have no raid-z vdevs.
-        */
-       if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
-           spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
-               int i;
-
-               for (i = 0; i < rvd->vdev_children; i++) {
-                       vd = rvd->vdev_child[i];
-                       if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
-                               break;
-               }
-               if (i == rvd->vdev_children) {
-                       spa->spa_deflate = TRUE;
-                       VERIFY(0 == zap_add(spa->spa_meta_objset,
-                           DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
-                           sizeof (uint64_t), 1, &spa->spa_deflate, tx));
-               }
-       }
-
-       /*
-        * Set the top-level vdev's max queue depth. Evaluate each
-        * top-level's async write queue depth in case it changed.
-        * The max queue depth will not change in the middle of syncing
-        * out this txg.
-        */
        uint64_t slots_per_allocator = 0;
        for (int c = 0; c < rvd->vdev_children; c++) {
                vdev_t *tvd = rvd->vdev_child[c];
        uint64_t slots_per_allocator = 0;
        for (int c = 0; c < rvd->vdev_children; c++) {
                vdev_t *tvd = rvd->vdev_child[c];
-               metaslab_group_t *mg = tvd->vdev_mg;
-               metaslab_class_t *mc;
 
 
+               metaslab_group_t *mg = tvd->vdev_mg;
                if (mg == NULL || !metaslab_group_initialized(mg))
                        continue;
 
                if (mg == NULL || !metaslab_group_initialized(mg))
                        continue;
 
-               mc = mg->mg_class;
+               metaslab_class_t *mc = mg->mg_class;
                if (mc != normal && mc != special && mc != dedup)
                        continue;
 
                if (mc != normal && mc != special && mc != dedup)
                        continue;
 
@@ -7960,7 +7876,14 @@ spa_sync(spa_t *spa, uint64_t txg)
        normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
        special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
        dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
        normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
        special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
        dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+}
+
+static void
+spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx)
+{
+       ASSERT(spa_writeable(spa));
 
 
+       vdev_t *rvd = spa->spa_root_vdev;
        for (int c = 0; c < rvd->vdev_children; c++) {
                vdev_t *vd = rvd->vdev_child[c];
                vdev_indirect_state_sync_verify(vd);
        for (int c = 0; c < rvd->vdev_children; c++) {
                vdev_t *vd = rvd->vdev_child[c];
                vdev_indirect_state_sync_verify(vd);
@@ -7970,10 +7893,16 @@ spa_sync(spa_t *spa, uint64_t txg)
                        break;
                }
        }
                        break;
                }
        }
+}
+
+static void
+spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
+{
+       objset_t *mos = spa->spa_meta_objset;
+       dsl_pool_t *dp = spa->spa_dsl_pool;
+       uint64_t txg = tx->tx_txg;
+       bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
 
 
-       /*
-        * Iterate to convergence.
-        */
        do {
                int pass = ++spa->spa_sync_pass;
 
        do {
                int pass = ++spa->spa_sync_pass;
 
@@ -7999,81 +7928,60 @@ spa_sync(spa_t *spa, uint64_t txg)
 
                ddt_sync(spa, txg);
                dsl_scan_sync(dp, tx);
 
                ddt_sync(spa, txg);
                dsl_scan_sync(dp, tx);
+               svr_sync(spa, tx);
+               spa_sync_upgrades(spa, tx);
 
 
-               if (spa->spa_vdev_removal != NULL)
-                       svr_sync(spa, tx);
-
+               vdev_t *vd = NULL;
                while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
                    != NULL)
                        vdev_sync(vd, txg);
 
                while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
                    != NULL)
                        vdev_sync(vd, txg);
 
-               if (pass == 1) {
-                       spa_sync_upgrades(spa, tx);
-                       ASSERT3U(txg, >=,
-                           spa->spa_uberblock.ub_rootbp.blk_birth);
+               /*
+                * Note: We need to check if the MOS is dirty because we could
+                * have marked the MOS dirty without updating the uberblock
+                * (e.g. if we have sync tasks but no dirty user data). We need
+                * to check the uberblock's rootbp because it is updated if we
+                * have synced out dirty data (though in this case the MOS will
+                * most likely also be dirty due to second order effects, we
+                * don't want to rely on that here).
+                */
+               if (pass == 1 &&
+                   spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+                   !dmu_objset_is_dirty(mos, txg)) {
                        /*
                        /*
-                        * Note: We need to check if the MOS is dirty
-                        * because we could have marked the MOS dirty
-                        * without updating the uberblock (e.g. if we
-                        * have sync tasks but no dirty user data).  We
-                        * need to check the uberblock's rootbp because
-                        * it is updated if we have synced out dirty
-                        * data (though in this case the MOS will most
-                        * likely also be dirty due to second order
-                        * effects, we don't want to rely on that here).
+                        * Nothing changed on the first pass, therefore this
+                        * TXG is a no-op. Avoid syncing deferred frees, so
+                        * that we can keep this TXG as a no-op.
                         */
                         */
-                       if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
-                           !dmu_objset_is_dirty(mos, txg)) {
-                               /*
-                                * Nothing changed on the first pass,
-                                * therefore this TXG is a no-op.  Avoid
-                                * syncing deferred frees, so that we
-                                * can keep this TXG as a no-op.
-                                */
-                               ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
-                                   txg));
-                               ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
-                               ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
-                               ASSERT(txg_list_empty(&dp->dp_early_sync_tasks,
-                                   txg));
-                               break;
-                       }
-                       spa_sync_deferred_frees(spa, tx);
+                       ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
+                       ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+                       ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
+                       ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg));
+                       break;
                }
 
                }
 
+               spa_sync_deferred_frees(spa, tx);
        } while (dmu_objset_is_dirty(mos, txg));
        } while (dmu_objset_is_dirty(mos, txg));
+}
 
 
-#ifdef ZFS_DEBUG
-       if (!list_is_empty(&spa->spa_config_dirty_list)) {
-               /*
-                * Make sure that the number of ZAPs for all the vdevs matches
-                * the number of ZAPs in the per-vdev ZAP list. This only gets
-                * called if the config is dirty; otherwise there may be
-                * outstanding AVZ operations that weren't completed in
-                * spa_sync_config_object.
-                */
-               uint64_t all_vdev_zap_entry_count;
-               ASSERT0(zap_count(spa->spa_meta_objset,
-                   spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
-               ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
-                   all_vdev_zap_entry_count);
-       }
-#endif
-
-       if (spa->spa_vdev_removal != NULL) {
-               ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
-       }
+/*
+ * Rewrite the vdev configuration (which includes the uberblock) to
+ * commit the transaction group.
+ *
+ * If there are no dirty vdevs, we sync the uberblock to a few random
+ * top-level vdevs that are known to be visible in the config cache
+ * (see spa_vdev_add() for a complete description). If there *are* dirty
+ * vdevs, sync the uberblock to all vdevs.
+ */
+static void
+spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+       uint64_t txg = tx->tx_txg;
 
 
-       /*
-        * Rewrite the vdev configuration (which includes the uberblock)
-        * to commit the transaction group.
-        *
-        * If there are no dirty vdevs, we sync the uberblock to a few
-        * random top-level vdevs that are known to be visible in the
-        * config cache (see spa_vdev_add() for a complete description).
-        * If there *are* dirty vdevs, sync the uberblock to all vdevs.
-        */
        for (;;) {
        for (;;) {
+               int error = 0;
+
                /*
                 * We hold SCL_STATE to prevent vdev open/close/etc.
                 * while we're attempting to write the vdev labels.
                /*
                 * We hold SCL_STATE to prevent vdev open/close/etc.
                 * while we're attempting to write the vdev labels.
@@ -8087,13 +7995,15 @@ spa_sync(spa_t *spa, uint64_t txg)
                        int c0 = spa_get_random(children);
 
                        for (int c = 0; c < children; c++) {
                        int c0 = spa_get_random(children);
 
                        for (int c = 0; c < children; c++) {
-                               vd = rvd->vdev_child[(c0 + c) % children];
+                               vdev_t *vd =
+                                   rvd->vdev_child[(c0 + c) % children];
 
                                /* Stop when revisiting the first vdev */
                                if (c > 0 && svd[0] == vd)
                                        break;
 
 
                                /* Stop when revisiting the first vdev */
                                if (c > 0 && svd[0] == vd)
                                        break;
 
-                               if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
+                               if (vd->vdev_ms_array == 0 ||
+                                   vd->vdev_islog ||
                                    !vdev_is_concrete(vd))
                                        continue;
 
                                    !vdev_is_concrete(vd))
                                        continue;
 
@@ -8117,6 +8027,124 @@ spa_sync(spa_t *spa, uint64_t txg)
                zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
                zio_resume_wait(spa);
        }
                zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
                zio_resume_wait(spa);
        }
+}
+
+/*
+ * Sync the specified transaction group.  New blocks may be dirtied as
+ * part of the process, so we iterate until it converges.
+ */
+void
+spa_sync(spa_t *spa, uint64_t txg)
+{
+       vdev_t *vd = NULL;
+
+       VERIFY(spa_writeable(spa));
+
+       /*
+        * Wait for i/os issued in open context that need to complete
+        * before this txg syncs.
+        */
+       (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
+       spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
+           ZIO_FLAG_CANFAIL);
+
+       /*
+        * Lock out configuration changes.
+        */
+       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+       spa->spa_syncing_txg = txg;
+       spa->spa_sync_pass = 0;
+
+       for (int i = 0; i < spa->spa_alloc_count; i++) {
+               mutex_enter(&spa->spa_alloc_locks[i]);
+               VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+               mutex_exit(&spa->spa_alloc_locks[i]);
+       }
+
+       /*
+        * If there are any pending vdev state changes, convert them
+        * into config changes that go out with this transaction group.
+        */
+       spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+       while (list_head(&spa->spa_state_dirty_list) != NULL) {
+               /*
+                * We need the write lock here because, for aux vdevs,
+                * calling vdev_config_dirty() modifies sav_config.
+                * This is ugly and will become unnecessary when we
+                * eliminate the aux vdev wart by integrating all vdevs
+                * into the root vdev tree.
+                */
+               spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+               spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
+               while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
+                       vdev_state_clean(vd);
+                       vdev_config_dirty(vd);
+               }
+               spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+               spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+       }
+       spa_config_exit(spa, SCL_STATE, FTAG);
+
+       dsl_pool_t *dp = spa->spa_dsl_pool;
+       dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+
+       spa->spa_sync_starttime = gethrtime();
+       taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
+       spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
+           spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
+           NSEC_TO_TICK(spa->spa_deadman_synctime));
+
+       /*
+        * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
+        * set spa_deflate if we have no raid-z vdevs.
+        */
+       if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
+           spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
+               vdev_t *rvd = spa->spa_root_vdev;
+
+               int i;
+               for (i = 0; i < rvd->vdev_children; i++) {
+                       vd = rvd->vdev_child[i];
+                       if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
+                               break;
+               }
+               if (i == rvd->vdev_children) {
+                       spa->spa_deflate = TRUE;
+                       VERIFY0(zap_add(spa->spa_meta_objset,
+                           DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+                           sizeof (uint64_t), 1, &spa->spa_deflate, tx));
+               }
+       }
+
+       spa_sync_adjust_vdev_max_queue_depth(spa);
+
+       spa_sync_condense_indirect(spa, tx);
+
+       spa_sync_iterate_to_convergence(spa, tx);
+
+#ifdef ZFS_DEBUG
+       if (!list_is_empty(&spa->spa_config_dirty_list)) {
+       /*
+        * Make sure that the number of ZAPs for all the vdevs matches
+        * the number of ZAPs in the per-vdev ZAP list. This only gets
+        * called if the config is dirty; otherwise there may be
+        * outstanding AVZ operations that weren't completed in
+        * spa_sync_config_object.
+        */
+               uint64_t all_vdev_zap_entry_count;
+               ASSERT0(zap_count(spa->spa_meta_objset,
+                   spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
+               ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
+                   all_vdev_zap_entry_count);
+       }
+#endif
+
+       if (spa->spa_vdev_removal != NULL) {
+               ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
+       }
+
+       spa_sync_rewrite_vdev_config(spa, tx);
        dmu_tx_commit(tx);
 
        taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
        dmu_tx_commit(tx);
 
        taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
index 1896e824f8e0dc4b392307c69fb6bc6687ae81c2..8d89007872f8a912b1299621ffd4dac60a476d5c 100644 (file)
@@ -1798,6 +1798,9 @@ svr_sync(spa_t *spa, dmu_tx_t *tx)
        spa_vdev_removal_t *svr = spa->spa_vdev_removal;
        int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 
        spa_vdev_removal_t *svr = spa->spa_vdev_removal;
        int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 
+       if (svr == NULL)
+               return;
+
        /*
         * This check is necessary so that we do not dirty the
         * DIRECTORY_OBJECT via spa_sync_removing_state() when there
        /*
         * This check is necessary so that we do not dirty the
         * DIRECTORY_OBJECT via spa_sync_removing_state() when there