git.proxmox.com Git - mirror_zfs.git/commitdiff
Improve log spacemap load time
author Alexander Motin <mav@FreeBSD.org>
Tue, 26 Apr 2022 17:44:21 +0000 (13:44 -0400)
committer GitHub <noreply@github.com>
Tue, 26 Apr 2022 17:44:21 +0000 (10:44 -0700)
The previous flushing algorithm limited only the total number of log
blocks, to the minimum of 256K and 4x the number of metaslabs in the
pool.  As a result, a system with 1500 disks of 1000 metaslabs each,
touching several new metaslabs each TXG, could grow the spacemap log to
a huge size without much benefit.  We've observed one such system taking
about 45 minutes to import its pool.

This patch improves the situation in five ways:
 - By limiting the maximum period within which each metaslab must be
flushed to 1000 TXGs, which effectively limits the maximum number of
per-TXG spacemap logs to load to the same number.
 - By making flushing smoother by accounting for the number of metaslabs
that were touched after the last flush and actually need another flush,
not just an ms_unflushed_txg bump.
 - By applying zfs_unflushed_log_block_pct to the number of metaslabs
that were touched after the last flush, not to all metaslabs in the pool.
 - By aggressively prefetching per-TXG spacemap logs up to 16 TXGs in
advance, making the log spacemap load process for a wide HDD pool
CPU-bound and accelerating it many times over.
 - By reducing zfs_unflushed_log_block_max from 256K to 128K, reducing
the inherently single-threaded log processing time from ~10 to ~5 minutes.

As a further optimization we could skip bumping ms_unflushed_txg for
metaslabs not touched since the last flush, but that would be an
incompatible change, requiring a new pool feature.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12789

include/sys/dmu.h
include/sys/metaslab.h
include/sys/metaslab_impl.h
include/sys/spa_log_spacemap.h
man/man4/zfs.4
module/zfs/dmu.c
module/zfs/metaslab.c
module/zfs/spa.c
module/zfs/spa_log_spacemap.c
module/zfs/vdev.c
module/zfs/vdev_removal.c

index 1ddff0d4e4e7c47d915307ecec1c4b634e5778ce..03513f9f2c6c44c71ed483f51cf06c143a652517 100644 (file)
@@ -1067,6 +1067,8 @@ int dmu_diff(const char *tosnap_name, const char *fromsnap_name,
 #define        ZFS_CRC64_POLY  0xC96C5795D7870F42ULL   /* ECMA-182, reflected form */
 extern uint64_t zfs_crc64_table[256];
 
+extern int dmu_prefetch_max;
+
 #ifdef __cplusplus
 }
 #endif
index 129a68be41c53e7f13b5e874c0cd5438ee681819..b777a3cae439ac6782a50b729f8b19f48c64deeb 100644 (file)
@@ -49,11 +49,14 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
     metaslab_t **);
 void metaslab_fini(metaslab_t *);
 
+void metaslab_set_unflushed_dirty(metaslab_t *, boolean_t);
 void metaslab_set_unflushed_txg(metaslab_t *, uint64_t, dmu_tx_t *);
 void metaslab_set_estimated_condensed_size(metaslab_t *, uint64_t, dmu_tx_t *);
+boolean_t metaslab_unflushed_dirty(metaslab_t *);
 uint64_t metaslab_unflushed_txg(metaslab_t *);
 uint64_t metaslab_estimated_condensed_size(metaslab_t *);
 int metaslab_sort_by_flushed(const void *, const void *);
+void metaslab_unflushed_bump(metaslab_t *, dmu_tx_t *, boolean_t);
 uint64_t metaslab_unflushed_changes_memused(metaslab_t *);
 
 int metaslab_load(metaslab_t *);
index 3dbee4c17fef2a724826488fee23bd43c1467b3b..820c61a252e2bd78c80344d68408a20dc5655cbf 100644 (file)
@@ -553,6 +553,7 @@ struct metaslab {
         * log space maps.
         */
        uint64_t        ms_unflushed_txg;
+       boolean_t       ms_unflushed_dirty;
 
        /* updated every time we are done syncing the metaslab's space map */
        uint64_t        ms_synced_length;
index b2ed77fac3e4068b5b88a3858475febb0b83571f..72229df6cd1690505529e70d2f927419e972cac4 100644 (file)
 
 typedef struct log_summary_entry {
        uint64_t lse_start;     /* start TXG */
+       uint64_t lse_end;       /* last TXG */
+       uint64_t lse_txgcount;  /* # of TXGs */
        uint64_t lse_mscount;   /* # of metaslabs needed to be flushed */
+       uint64_t lse_msdcount;  /* # of dirty metaslabs needed to be flushed */
        uint64_t lse_blkcount;  /* blocks held by this entry  */
        list_node_t lse_node;
 } log_summary_entry_t;
@@ -50,6 +53,7 @@ typedef struct spa_log_sm {
        uint64_t sls_nblocks;   /* number of blocks in this log */
        uint64_t sls_mscount;   /* # of metaslabs flushed in the log's txg */
        avl_node_t sls_node;    /* node in spa_sm_logs_by_txg */
+       space_map_t *sls_sm;    /* space map pointer, if open */
 } spa_log_sm_t;
 
 int spa_ld_log_spacemaps(spa_t *);
@@ -68,8 +72,9 @@ uint64_t spa_log_sm_memused(spa_t *);
 void spa_log_sm_decrement_mscount(spa_t *, uint64_t);
 void spa_log_sm_increment_current_mscount(spa_t *);
 
-void spa_log_summary_add_flushed_metaslab(spa_t *);
-void spa_log_summary_decrement_mscount(spa_t *, uint64_t);
+void spa_log_summary_add_flushed_metaslab(spa_t *, boolean_t);
+void spa_log_summary_dirty_flushed_metaslab(spa_t *, uint64_t);
+void spa_log_summary_decrement_mscount(spa_t *, uint64_t, boolean_t);
 void spa_log_summary_decrement_blkcount(spa_t *, uint64_t);
 
 boolean_t spa_flush_all_logs_requested(spa_t *);
index 546ed78d9c69c881cd0490e0914c046fe898d6eb..a18917eb1e420183d4bd52966c42a0bce79ccf2b 100644 (file)
@@ -982,13 +982,13 @@ log spacemap in memory, in bytes.
 Part of overall system memory that ZFS allows to be used
 for unflushed metadata changes by the log spacemap, in millionths.
 .
-.It Sy zfs_unflushed_log_block_max Ns = Ns Sy 262144 Po 256k Pc Pq ulong
+.It Sy zfs_unflushed_log_block_max Ns = Ns Sy 131072 Po 128k Pc Pq ulong
 Describes the maximum number of log spacemap blocks allowed for each pool.
 The default value means that the space in all the log spacemaps
 can add up to no more than
-.Sy 262144
+.Sy 131072
 blocks (which means
-.Em 32GB
+.Em 16GB
 of logical space before compression and ditto blocks,
 assuming that blocksize is
 .Em 128kB ) .
@@ -1018,7 +1018,12 @@ Thus we always allow at least this many log blocks.
 .It Sy zfs_unflushed_log_block_pct Ns = Ns Sy 400 Ns % Pq ulong
 Tunable used to determine the number of blocks that can be used for
 the spacemap log, expressed as a percentage of the total number of
-metaslabs in the pool.
+unflushed metaslabs in the pool.
+.
+.It Sy zfs_unflushed_log_txg_max Ns = Ns Sy 1000 Pq ulong
+Tunable limiting maximum time in TXGs any metaslab may remain unflushed.
+It effectively limits maximum number of unflushed per-TXG spacemap logs
+that need to be read after unclean pool export.
 .
 .It Sy zfs_unlink_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 When enabled, files will not be asynchronously removed from the list of pending
index 461feeffb6a3f32a1c9818d00277c24dcc1ea606..7d8b2c96bd74893e9ae36989dd1acf3956e180cb 100644 (file)
@@ -86,7 +86,7 @@ static int zfs_dmu_offset_next_sync = 1;
  * helps to limit the amount of memory that can be used by prefetching.
  * Larger objects should be prefetched a bit at a time.
  */
-static int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
+int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
 
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
        {DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "unallocated"           },
index 7ed83b305db7917f135d5296cf2dd38ca311a0d8..f9c16f00d51ab37b22cd9e5b099fce69339379ac 100644 (file)
@@ -2750,7 +2750,8 @@ metaslab_fini_flush_data(metaslab_t *msp)
        mutex_exit(&spa->spa_flushed_ms_lock);
 
        spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
-       spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp));
+       spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp),
+           metaslab_unflushed_dirty(msp));
 }
 
 uint64_t
@@ -3728,50 +3729,45 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
        metaslab_flush_update(msp, tx);
 }
 
-/*
- * Called when the metaslab has been flushed (its own spacemap now reflects
- * all the contents of the pool-wide spacemap log). Updates the metaslab's
- * metadata and any pool-wide related log space map data (e.g. summary,
- * obsolete logs, etc..) to reflect that.
- */
 static void
-metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
+metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx)
 {
-       metaslab_group_t *mg = msp->ms_group;
-       spa_t *spa = mg->mg_vd->vdev_spa;
-
-       ASSERT(MUTEX_HELD(&msp->ms_lock));
-
-       ASSERT3U(spa_sync_pass(spa), ==, 1);
+       spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+       ASSERT(spa_syncing_log_sm(spa) != NULL);
+       ASSERT(msp->ms_sm != NULL);
        ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
        ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
 
-       /*
-        * Just because a metaslab got flushed, that doesn't mean that
-        * it will pass through metaslab_sync_done(). Thus, make sure to
-        * update ms_synced_length here in case it doesn't.
-        */
-       msp->ms_synced_length = space_map_length(msp->ms_sm);
+       mutex_enter(&spa->spa_flushed_ms_lock);
+       metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
+       metaslab_set_unflushed_dirty(msp, B_TRUE);
+       avl_add(&spa->spa_metaslabs_by_flushed, msp);
+       mutex_exit(&spa->spa_flushed_ms_lock);
 
-       /*
-        * We may end up here from metaslab_condense() without the
-        * feature being active. In that case this is a no-op.
-        */
-       if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
-               return;
+       spa_log_sm_increment_current_mscount(spa);
+       spa_log_summary_add_flushed_metaslab(spa, B_TRUE);
+}
 
+void
+metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty)
+{
+       spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
        ASSERT(spa_syncing_log_sm(spa) != NULL);
        ASSERT(msp->ms_sm != NULL);
        ASSERT(metaslab_unflushed_txg(msp) != 0);
        ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
+       ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
+       ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
 
        VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
 
        /* update metaslab's position in our flushing tree */
        uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
+       boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp);
        mutex_enter(&spa->spa_flushed_ms_lock);
        avl_remove(&spa->spa_metaslabs_by_flushed, msp);
        metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
+       metaslab_set_unflushed_dirty(msp, dirty);
        avl_add(&spa->spa_metaslabs_by_flushed, msp);
        mutex_exit(&spa->spa_flushed_ms_lock);
 
@@ -3779,17 +3775,47 @@ metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
        spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
        spa_log_sm_increment_current_mscount(spa);
 
+       /* update log space map summary */
+       spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg,
+           ms_prev_flushed_dirty);
+       spa_log_summary_add_flushed_metaslab(spa, dirty);
+
        /* cleanup obsolete logs if any */
-       uint64_t log_blocks_before = spa_log_sm_nblocks(spa);
        spa_cleanup_old_sm_logs(spa, tx);
-       uint64_t log_blocks_after = spa_log_sm_nblocks(spa);
-       VERIFY3U(log_blocks_after, <=, log_blocks_before);
+}
 
-       /* update log space map summary */
-       uint64_t blocks_gone = log_blocks_before - log_blocks_after;
-       spa_log_summary_add_flushed_metaslab(spa);
-       spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg);
-       spa_log_summary_decrement_blkcount(spa, blocks_gone);
+/*
+ * Called when the metaslab has been flushed (its own spacemap now reflects
+ * all the contents of the pool-wide spacemap log). Updates the metaslab's
+ * metadata and any pool-wide related log space map data (e.g. summary,
+ * obsolete logs, etc..) to reflect that.
+ */
+static void
+metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
+{
+       metaslab_group_t *mg = msp->ms_group;
+       spa_t *spa = mg->mg_vd->vdev_spa;
+
+       ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+       ASSERT3U(spa_sync_pass(spa), ==, 1);
+
+       /*
+        * Just because a metaslab got flushed, that doesn't mean that
+        * it will pass through metaslab_sync_done(). Thus, make sure to
+        * update ms_synced_length here in case it doesn't.
+        */
+       msp->ms_synced_length = space_map_length(msp->ms_sm);
+
+       /*
+        * We may end up here from metaslab_condense() without the
+        * feature being active. In that case this is a no-op.
+        */
+       if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) ||
+           metaslab_unflushed_txg(msp) == 0)
+               return;
+
+       metaslab_unflushed_bump(msp, tx, B_FALSE);
 }
 
 boolean_t
@@ -4005,23 +4031,6 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
                ASSERT0(metaslab_allocated_space(msp));
        }
 
-       if (metaslab_unflushed_txg(msp) == 0 &&
-           spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
-               ASSERT(spa_syncing_log_sm(spa) != NULL);
-
-               metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
-               spa_log_sm_increment_current_mscount(spa);
-               spa_log_summary_add_flushed_metaslab(spa);
-
-               ASSERT(msp->ms_sm != NULL);
-               mutex_enter(&spa->spa_flushed_ms_lock);
-               avl_add(&spa->spa_metaslabs_by_flushed, msp);
-               mutex_exit(&spa->spa_flushed_ms_lock);
-
-               ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
-               ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
-       }
-
        if (!range_tree_is_empty(msp->ms_checkpointing) &&
            vd->vdev_checkpoint_sm == NULL) {
                ASSERT(spa_has_checkpoint(spa));
@@ -4069,6 +4078,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
        space_map_t *log_sm = spa_syncing_log_sm(spa);
        if (log_sm != NULL) {
                ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
+               if (metaslab_unflushed_txg(msp) == 0)
+                       metaslab_unflushed_add(msp, tx);
+               else if (!metaslab_unflushed_dirty(msp))
+                       metaslab_unflushed_bump(msp, tx, B_TRUE);
 
                space_map_write(log_sm, alloctree, SM_ALLOC,
                    vd->vdev_id, tx);
@@ -6131,6 +6144,12 @@ metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
        mutex_exit(&mg->mg_ms_disabled_lock);
 }
 
+void
+metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty)
+{
+       ms->ms_unflushed_dirty = dirty;
+}
+
 static void
 metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
 {
@@ -6167,15 +6186,16 @@ metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
 void
 metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
 {
-       spa_t *spa = ms->ms_group->mg_vd->vdev_spa;
-
-       if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
-               return;
-
        ms->ms_unflushed_txg = txg;
        metaslab_update_ondisk_flush_data(ms, tx);
 }
 
+boolean_t
+metaslab_unflushed_dirty(metaslab_t *ms)
+{
+       return (ms->ms_unflushed_dirty);
+}
+
 uint64_t
 metaslab_unflushed_txg(metaslab_t *ms)
 {
index e69cb5527be80fad82b6ea82386476dbc9cb2388..01114dedef48b433e941958ab013217954860a8e 100644 (file)
@@ -4355,7 +4355,7 @@ spa_ld_load_vdev_metadata(spa_t *spa)
 
        error = spa_ld_log_spacemaps(spa);
        if (error != 0) {
-               spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]",
+               spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]",
                    error);
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
        }
index 110a4eab99f9bc9c2821a0e6831c61c0ef4c5370..f831509a4247448466326e9a1ea52b75cbd32609 100644 (file)
@@ -257,7 +257,12 @@ static unsigned long zfs_unflushed_log_block_min = 1000;
  * terms of performance. Thus we have a hard limit in the size of the log in
  * terms of blocks.
  */
-static unsigned long zfs_unflushed_log_block_max = (1ULL << 18);
+static unsigned long zfs_unflushed_log_block_max = (1ULL << 17);
+
+/*
+ * Also we have a hard limit in the size of the log in terms of dirty TXGs.
+ */
+static unsigned long zfs_unflushed_log_txg_max = 1000;
 
 /*
  * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
@@ -333,9 +338,13 @@ spa_log_sm_set_blocklimit(spa_t *spa)
                return;
        }
 
-       uint64_t calculated_limit =
-           (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100;
-       spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit,
+       uint64_t msdcount = 0;
+       for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+           e; e = list_next(&spa->spa_log_summary, e))
+               msdcount += e->lse_msdcount;
+
+       uint64_t limit = msdcount * zfs_unflushed_log_block_pct / 100;
+       spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(limit,
            zfs_unflushed_log_block_min), zfs_unflushed_log_block_max);
 }
 
@@ -380,8 +389,13 @@ spa_log_summary_verify_counts(spa_t *spa)
 }
 
 static boolean_t
-summary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
+summary_entry_is_full(spa_t *spa, log_summary_entry_t *e, uint64_t txg)
 {
+       if (e->lse_end == txg)
+               return (0);
+       if (e->lse_txgcount >= DIV_ROUND_UP(zfs_unflushed_log_txg_max,
+           zfs_max_logsm_summary_length))
+               return (1);
        uint64_t blocks_per_row = MAX(1,
            DIV_ROUND_UP(spa_log_sm_blocklimit(spa),
            zfs_max_logsm_summary_length));
@@ -401,7 +415,7 @@ summary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
  * the metaslab.
  */
 void
-spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
+spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg, boolean_t dirty)
 {
        /*
         * We don't track summary data for read-only pools and this function
@@ -429,6 +443,8 @@ spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
        }
 
        target->lse_mscount--;
+       if (dirty)
+               target->lse_msdcount--;
 }
 
 /*
@@ -490,8 +506,10 @@ spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
 void
 spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
 {
-       for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
-           e != NULL; e = list_head(&spa->spa_log_summary)) {
+       log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+       if (e->lse_txgcount > 0)
+               e->lse_txgcount--;
+       for (; e != NULL; e = list_head(&spa->spa_log_summary)) {
                if (e->lse_blkcount > blocks_gone) {
                        /*
                         * Assert that we stopped at an entry that is not
@@ -560,31 +578,52 @@ spa_log_sm_increment_current_mscount(spa_t *spa)
 
 static void
 summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
-    uint64_t nblocks)
+    uint64_t metaslabs_dirty, uint64_t nblocks)
 {
        log_summary_entry_t *e = list_tail(&spa->spa_log_summary);
 
-       if (e == NULL || summary_entry_is_full(spa, e)) {
+       if (e == NULL || summary_entry_is_full(spa, e, txg)) {
                e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP);
-               e->lse_start = txg;
+               e->lse_start = e->lse_end = txg;
+               e->lse_txgcount = 1;
                list_insert_tail(&spa->spa_log_summary, e);
        }
 
        ASSERT3U(e->lse_start, <=, txg);
+       if (e->lse_end < txg) {
+               e->lse_end = txg;
+               e->lse_txgcount++;
+       }
        e->lse_mscount += metaslabs_flushed;
+       e->lse_msdcount += metaslabs_dirty;
        e->lse_blkcount += nblocks;
 }
 
 static void
 spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
 {
-       summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks);
+       summary_add_data(spa, spa_syncing_txg(spa), 0, 0, nblocks);
 }
 
 void
-spa_log_summary_add_flushed_metaslab(spa_t *spa)
+spa_log_summary_add_flushed_metaslab(spa_t *spa, boolean_t dirty)
 {
-       summary_add_data(spa, spa_syncing_txg(spa), 1, 0);
+       summary_add_data(spa, spa_syncing_txg(spa), 1, dirty ? 1 : 0, 0);
+}
+
+void
+spa_log_summary_dirty_flushed_metaslab(spa_t *spa, uint64_t txg)
+{
+       log_summary_entry_t *target = NULL;
+       for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+           e != NULL; e = list_next(&spa->spa_log_summary, e)) {
+               if (e->lse_start > txg)
+                       break;
+               target = e;
+       }
+       ASSERT3P(target, !=, NULL);
+       ASSERT3U(target->lse_mscount, !=, 0);
+       target->lse_msdcount++;
 }
 
 /*
@@ -630,6 +669,11 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
        int64_t available_blocks =
            spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;
 
+       int64_t available_txgs = zfs_unflushed_log_txg_max;
+       for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+           e; e = list_next(&spa->spa_log_summary, e))
+               available_txgs -= e->lse_txgcount;
+
        /*
         * This variable tells us the total number of flushes needed to
         * keep the log size within the limit when we reach txgs_in_future.
@@ -637,9 +681,7 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
        uint64_t total_flushes = 0;
 
        /* Holds the current maximum of our estimates so far. */
-       uint64_t max_flushes_pertxg =
-           MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed),
-           zfs_min_metaslabs_to_flush);
+       uint64_t max_flushes_pertxg = zfs_min_metaslabs_to_flush;
 
        /*
         * For our estimations we only look as far in the future
@@ -653,11 +695,14 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
                 * then keep skipping TXGs accumulating more blocks
                 * based on the incoming rate until we exceed it.
                 */
-               if (available_blocks >= 0) {
-                       uint64_t skip_txgs = (available_blocks / incoming) + 1;
+               if (available_blocks >= 0 && available_txgs >= 0) {
+                       uint64_t skip_txgs = MIN(available_txgs + 1,
+                           (available_blocks / incoming) + 1);
                        available_blocks -= (skip_txgs * incoming);
+                       available_txgs -= skip_txgs;
                        txgs_in_future += skip_txgs;
                        ASSERT3S(available_blocks, >=, -incoming);
+                       ASSERT3S(available_txgs, >=, -1);
                }
 
                /*
@@ -666,9 +711,10 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
                 * based on the current entry in the summary, updating
                 * our available_blocks.
                 */
-               ASSERT3S(available_blocks, <, 0);
+               ASSERT(available_blocks < 0 || available_txgs < 0);
                available_blocks += e->lse_blkcount;
-               total_flushes += e->lse_mscount;
+               available_txgs += e->lse_txgcount;
+               total_flushes += e->lse_msdcount;
 
                /*
                 * Keep the running maximum of the total_flushes that
@@ -680,8 +726,6 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
                 */
                max_flushes_pertxg = MAX(max_flushes_pertxg,
                    DIV_ROUND_UP(total_flushes, txgs_in_future));
-               ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
-                   max_flushes_pertxg);
        }
        return (max_flushes_pertxg);
 }
@@ -771,14 +815,11 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
        uint64_t want_to_flush;
        if (spa_flush_all_logs_requested(spa)) {
                ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
-               want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed);
+               want_to_flush = UINT64_MAX;
        } else {
                want_to_flush = spa_estimate_metaslabs_to_flush(spa);
        }
 
-       ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
-           want_to_flush);
-
        /* Used purely for verification purposes */
        uint64_t visited = 0;
 
@@ -809,31 +850,22 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
                if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
                        break;
 
-               mutex_enter(&curr->ms_sync_lock);
-               mutex_enter(&curr->ms_lock);
-               boolean_t flushed = metaslab_flush(curr, tx);
-               mutex_exit(&curr->ms_lock);
-               mutex_exit(&curr->ms_sync_lock);
-
-               /*
-                * If we failed to flush a metaslab (because it was loading),
-                * then we are done with the block heuristic as it's not
-                * possible to destroy any log space maps once you've skipped
-                * a metaslab. In that case we just set our counter to 0 but
-                * we continue looping in case there is still memory pressure
-                * due to unflushed changes. Note that, flushing a metaslab
-                * that is not the oldest flushed in the pool, will never
-                * destroy any log space maps [see spa_cleanup_old_sm_logs()].
-                */
-               if (!flushed) {
-                       want_to_flush = 0;
-               } else if (want_to_flush > 0) {
-                       want_to_flush--;
-               }
+               if (metaslab_unflushed_dirty(curr)) {
+                       mutex_enter(&curr->ms_sync_lock);
+                       mutex_enter(&curr->ms_lock);
+                       metaslab_flush(curr, tx);
+                       mutex_exit(&curr->ms_lock);
+                       mutex_exit(&curr->ms_sync_lock);
+                       if (want_to_flush > 0)
+                               want_to_flush--;
+               } else
+                       metaslab_unflushed_bump(curr, tx, B_FALSE);
 
                visited++;
        }
        ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
+
+       spa_log_sm_set_blocklimit(spa);
 }
 
 /*
@@ -904,6 +936,7 @@ spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx)
                avl_remove(&spa->spa_sm_logs_by_txg, sls);
                space_map_free_obj(mos, sls->sls_sm_obj, tx);
                VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx));
+               spa_log_summary_decrement_blkcount(spa, sls->sls_nblocks);
                spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks;
                kmem_free(sls, sizeof (spa_log_sm_t));
        }
@@ -963,12 +996,7 @@ spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx)
        VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj,
            0, UINT64_MAX, SPA_MINBLOCKSHIFT));
 
-       /*
-        * If the log space map feature was just enabled, the blocklimit
-        * has not yet been set.
-        */
-       if (spa_log_sm_blocklimit(spa) == 0)
-               spa_log_sm_set_blocklimit(spa);
+       spa_log_sm_set_blocklimit(spa);
 }
 
 /*
@@ -1094,12 +1122,18 @@ spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg)
                panic("invalid maptype_t");
                break;
        }
+       if (!metaslab_unflushed_dirty(ms)) {
+               metaslab_set_unflushed_dirty(ms, B_TRUE);
+               spa_log_summary_dirty_flushed_metaslab(spa,
+                   metaslab_unflushed_txg(ms));
+       }
        return (0);
 }
 
 static int
 spa_ld_log_sm_data(spa_t *spa)
 {
+       spa_log_sm_t *sls, *psls;
        int error = 0;
 
        /*
@@ -1113,41 +1147,71 @@ spa_ld_log_sm_data(spa_t *spa)
        ASSERT0(spa->spa_unflushed_stats.sus_memused);
 
        hrtime_t read_logs_starttime = gethrtime();
-       /* this is a no-op when we don't have space map logs */
-       for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
-           sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
-               space_map_t *sm = NULL;
-               error = space_map_open(&sm, spa_meta_objset(spa),
-                   sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT);
-               if (error != 0) {
-                       spa_load_failed(spa, "spa_ld_log_sm_data(): failed at "
-                           "space_map_open(obj=%llu) [error %d]",
-                           (u_longlong_t)sls->sls_sm_obj, error);
-                       goto out;
+
+       /* Prefetch log spacemaps dnodes. */
+       for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls;
+           sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+               dmu_prefetch(spa_meta_objset(spa), sls->sls_sm_obj,
+                   0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+       }
+
+       uint_t pn = 0;
+       uint64_t ps = 0;
+       psls = sls = avl_first(&spa->spa_sm_logs_by_txg);
+       while (sls != NULL) {
+               /* Prefetch log spacemaps up to 16 TXGs or MBs ahead. */
+               if (psls != NULL && pn < 16 &&
+                   (pn < 2 || ps < 2 * dmu_prefetch_max)) {
+                       error = space_map_open(&psls->sls_sm,
+                           spa_meta_objset(spa), psls->sls_sm_obj, 0,
+                           UINT64_MAX, SPA_MINBLOCKSHIFT);
+                       if (error != 0) {
+                               spa_load_failed(spa, "spa_ld_log_sm_data(): "
+                                   "failed at space_map_open(obj=%llu) "
+                                   "[error %d]",
+                                   (u_longlong_t)sls->sls_sm_obj, error);
+                               goto out;
+                       }
+                       dmu_prefetch(spa_meta_objset(spa), psls->sls_sm_obj,
+                           0, 0, space_map_length(psls->sls_sm),
+                           ZIO_PRIORITY_ASYNC_READ);
+                       pn++;
+                       ps += space_map_length(psls->sls_sm);
+                       psls = AVL_NEXT(&spa->spa_sm_logs_by_txg, psls);
+                       continue;
                }
 
+               /* Load TXG log spacemap into ms_unflushed_allocs/frees. */
+               cond_resched();
+               ASSERT0(sls->sls_nblocks);
+               sls->sls_nblocks = space_map_nblocks(sls->sls_sm);
+               spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
+               summary_add_data(spa, sls->sls_txg,
+                   sls->sls_mscount, 0, sls->sls_nblocks);
+
                struct spa_ld_log_sm_arg vla = {
                        .slls_spa = spa,
                        .slls_txg = sls->sls_txg
                };
-               error = space_map_iterate(sm, space_map_length(sm),
-                   spa_ld_log_sm_cb, &vla);
+               error = space_map_iterate(sls->sls_sm,
+                   space_map_length(sls->sls_sm), spa_ld_log_sm_cb, &vla);
                if (error != 0) {
-                       space_map_close(sm);
                        spa_load_failed(spa, "spa_ld_log_sm_data(): failed "
                            "at space_map_iterate(obj=%llu) [error %d]",
                            (u_longlong_t)sls->sls_sm_obj, error);
                        goto out;
                }
 
-               ASSERT0(sls->sls_nblocks);
-               sls->sls_nblocks = space_map_nblocks(sm);
-               spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
-               summary_add_data(spa, sls->sls_txg,
-                   sls->sls_mscount, sls->sls_nblocks);
+               pn--;
+               ps -= space_map_length(sls->sls_sm);
+               space_map_close(sls->sls_sm);
+               sls->sls_sm = NULL;
+               sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls);
 
-               space_map_close(sm);
+               /* Update log block limits considering just loaded. */
+               spa_log_sm_set_blocklimit(spa);
        }
+
        hrtime_t read_logs_endtime = gethrtime();
        spa_load_note(spa,
            "read %llu log space maps (%llu total blocks - blksz = %llu bytes) "
@@ -1157,6 +1221,18 @@ spa_ld_log_sm_data(spa_t *spa)
            (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000));
 
 out:
+       if (error != 0) {
+               for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
+                   sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+                       if (sls->sls_sm) {
+                               space_map_close(sls->sls_sm);
+                               sls->sls_sm = NULL;
+                       }
+               }
+       } else {
+               ASSERT0(pn);
+               ASSERT0(ps);
+       }
        /*
         * Now that the metaslabs contain their unflushed changes:
         * [1] recalculate their actual allocated space
@@ -1237,6 +1313,9 @@ spa_ld_unflushed_txgs(vdev_t *vd)
                }
 
                ms->ms_unflushed_txg = entry.msp_unflushed_txg;
+               ms->ms_unflushed_dirty = B_FALSE;
+               ASSERT(range_tree_is_empty(ms->ms_unflushed_allocs));
+               ASSERT(range_tree_is_empty(ms->ms_unflushed_frees));
                if (ms->ms_unflushed_txg != 0) {
                        mutex_enter(&spa->spa_flushed_ms_lock);
                        avl_add(&spa->spa_metaslabs_by_flushed, ms);
@@ -1300,6 +1379,10 @@ ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, ULONG, ZMOD_RW,
        "Lower-bound limit for the maximum amount of blocks allowed in "
        "log spacemap (see zfs_unflushed_log_block_max)");
 
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, ULONG, ZMOD_RW,
+    "Hard limit (upper-bound) in the size of the space map log "
+    "in terms of dirty TXGs.");
+
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW,
        "Tunable used to determine the number of blocks that can be used for "
        "the spacemap log, expressed as a percentage of the total number of "
index db2d2c5e44fb76e21af26010d46798d5b285f317..ce7f020a0d86f701953aabc8e5456ca702954f9a 100644 (file)
@@ -1523,13 +1523,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
        if (txg == 0)
                spa_config_exit(spa, SCL_ALLOC, FTAG);
 
-       /*
-        * Regardless whether this vdev was just added or it is being
-        * expanded, the metaslab count has changed. Recalculate the
-        * block limit.
-        */
-       spa_log_sm_set_blocklimit(spa);
-
        return (0);
 }
 
index f988ca22fa4a2f19063bcccd9901844a55b6dba0..5508d273758d6b766ebd8042c4bc650385400591 100644 (file)
@@ -1386,7 +1386,6 @@ vdev_remove_complete(spa_t *spa)
                vdev_metaslab_fini(vd);
                metaslab_group_destroy(vd->vdev_mg);
                vd->vdev_mg = NULL;
-               spa_log_sm_set_blocklimit(spa);
        }
        if (vd->vdev_log_mg != NULL) {
                ASSERT0(vd->vdev_ms_count);
@@ -2131,7 +2130,6 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
         * metaslab_class_histogram_verify()
         */
        vdev_metaslab_fini(vd);
-       spa_log_sm_set_blocklimit(spa);
 
        spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
        *txg = spa_vdev_config_enter(spa);