Improve log spacemap load time
diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c
index 110a4eab99f9bc9c2821a0e6831c61c0ef4c5370..f831509a4247448466326e9a1ea52b75cbd32609 100644
--- a/module/zfs/spa_log_spacemap.c
+++ b/module/zfs/spa_log_spacemap.c
@@ -257,7 +257,12 @@ static unsigned long zfs_unflushed_log_block_min = 1000;
  * terms of performance. Thus we have a hard limit in the size of the log in
  * terms of blocks.
  */
-static unsigned long zfs_unflushed_log_block_max = (1ULL << 18);
+static unsigned long zfs_unflushed_log_block_max = (1ULL << 17);
+
+/*
+ * We also have a hard limit on the size of the log in terms of dirty TXGs.
+ */
+static unsigned long zfs_unflushed_log_txg_max = 1000;
 
 /*
  * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
@@ -333,9 +338,13 @@ spa_log_sm_set_blocklimit(spa_t *spa)
                return;
        }
 
-       uint64_t calculated_limit =
-           (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100;
-       spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit,
+       uint64_t msdcount = 0;
+       for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+           e; e = list_next(&spa->spa_log_summary, e))
+               msdcount += e->lse_msdcount;
+
+       uint64_t limit = msdcount * zfs_unflushed_log_block_pct / 100;
+       spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(limit,
            zfs_unflushed_log_block_min), zfs_unflushed_log_block_max);
 }
 
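
Note: the block limit is now scaled from the number of dirty metaslabs recorded in the log summary (the lse_msdcount sum above) rather than from the pool's total metaslab count. A minimal userspace sketch of the clamp, assuming default tunable values (the 400 for zfs_unflushed_log_block_pct is an assumption, not shown in this hunk):

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b)       ((a) < (b) ? (a) : (b))
#define MAX(a, b)       ((a) > (b) ? (a) : (b))

/* Defaults as in this file; this patch lowers block_max to 1 << 17. */
static uint64_t zfs_unflushed_log_block_min = 1000;
static uint64_t zfs_unflushed_log_block_max = (1ULL << 17);
static uint64_t zfs_unflushed_log_block_pct = 400;      /* assumed */

/* The clamp spa_log_sm_set_blocklimit() applies to the msdcount sum. */
static uint64_t
blocklimit(uint64_t msdcount)
{
        uint64_t limit = msdcount * zfs_unflushed_log_block_pct / 100;
        return (MIN(MAX(limit, zfs_unflushed_log_block_min),
            zfs_unflushed_log_block_max));
}

int
main(void)
{
        /* Few dirty metaslabs: the floor wins; many: the cap wins. */
        printf("%llu\n", (unsigned long long)blocklimit(10));     /* 1000 */
        printf("%llu\n", (unsigned long long)blocklimit(10000));  /* 40000 */
        printf("%llu\n", (unsigned long long)blocklimit(50000));  /* 131072 */
        return (0);
}
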
@@ -380,8 +389,13 @@ spa_log_summary_verify_counts(spa_t *spa)
 }
 
 static boolean_t
-summary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
+summary_entry_is_full(spa_t *spa, log_summary_entry_t *e, uint64_t txg)
 {
+       if (e->lse_end == txg)
+               return (B_FALSE);
+       if (e->lse_txgcount >= DIV_ROUND_UP(zfs_unflushed_log_txg_max,
+           zfs_max_logsm_summary_length))
+               return (B_TRUE);
        uint64_t blocks_per_row = MAX(1,
            DIV_ROUND_UP(spa_log_sm_blocklimit(spa),
            zfs_max_logsm_summary_length));
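
A summary entry now also fills up once it covers its share of the TXG budget, which bounds the whole summary to roughly zfs_unflushed_log_txg_max dirty TXGs. A sketch of the per-row arithmetic; the default zfs_max_logsm_summary_length of 10 is assumed here:

#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

/* Module parameters in the kernel; defaults assumed. */
static uint64_t zfs_unflushed_log_txg_max = 1000;
static uint64_t zfs_max_logsm_summary_length = 10;

int
main(void)
{
        /*
         * summary_entry_is_full() closes a row once it spans this many
         * TXGs, so a summary of the configured length never tracks much
         * more than zfs_unflushed_log_txg_max TXGs in total.
         */
        uint64_t txgs_per_row = DIV_ROUND_UP(zfs_unflushed_log_txg_max,
            zfs_max_logsm_summary_length);
        printf("TXG budget per summary row: %llu\n",
            (unsigned long long)txgs_per_row);
        return (0);
}
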
@@ -401,7 +415,7 @@ summary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
  * the metaslab.
  */
 void
-spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
+spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg, boolean_t dirty)
 {
        /*
         * We don't track summary data for read-only pools and this function
@@ -429,6 +443,8 @@ spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
        }
 
        target->lse_mscount--;
+       if (dirty)
+               target->lse_msdcount--;
 }
 
 /*
@@ -490,8 +506,10 @@ spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
 void
 spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
 {
-       for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
-           e != NULL; e = list_head(&spa->spa_log_summary)) {
+       log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+       if (e->lse_txgcount > 0)
+               e->lse_txgcount--;
+       for (; e != NULL; e = list_head(&spa->spa_log_summary)) {
                if (e->lse_blkcount > blocks_gone) {
                        /*
                         * Assert that we stopped at an entry that is not
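
For context, spa_log_summary_decrement_blkcount() drains freed log blocks from the head of the summary, discarding entries whose block counts are fully consumed; the hunk above additionally retires one TXG from the head entry before the walk. A simplified userspace sketch of the draining walk (list plumbing inlined, asserts and the open-entry special case omitted):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for log_summary_entry_t. */
typedef struct entry {
        uint64_t blkcount;
        struct entry *next;
} entry_t;

static void
decrement_blkcount(entry_t **head, uint64_t blocks_gone)
{
        while (*head != NULL && blocks_gone > 0) {
                entry_t *e = *head;
                if (e->blkcount > blocks_gone) {
                        e->blkcount -= blocks_gone;     /* partial drain */
                        return;
                }
                blocks_gone -= e->blkcount;     /* entry fully drained */
                *head = e->next;
                free(e);
        }
}

int
main(void)
{
        /* Build a three-entry summary: 5, 3, and 8 blocks. */
        uint64_t counts[] = { 5, 3, 8 };
        entry_t *head = NULL, **tail = &head;
        for (int i = 0; i < 3; i++) {
                entry_t *e = calloc(1, sizeof (*e));
                e->blkcount = counts[i];
                *tail = e;
                tail = &e->next;
        }

        decrement_blkcount(&head, 6);   /* frees entry 0, dents entry 1 */
        for (entry_t *e = head; e != NULL; e = e->next)
                printf("blkcount=%llu\n", (unsigned long long)e->blkcount);
        return (0);
}
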
@@ -560,31 +578,52 @@ spa_log_sm_increment_current_mscount(spa_t *spa)
 
 static void
 summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
-    uint64_t nblocks)
+    uint64_t metaslabs_dirty, uint64_t nblocks)
 {
        log_summary_entry_t *e = list_tail(&spa->spa_log_summary);
 
-       if (e == NULL || summary_entry_is_full(spa, e)) {
+       if (e == NULL || summary_entry_is_full(spa, e, txg)) {
                e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP);
-               e->lse_start = txg;
+               e->lse_start = e->lse_end = txg;
+               e->lse_txgcount = 1;
                list_insert_tail(&spa->spa_log_summary, e);
        }
 
        ASSERT3U(e->lse_start, <=, txg);
+       if (e->lse_end < txg) {
+               e->lse_end = txg;
+               e->lse_txgcount++;
+       }
        e->lse_mscount += metaslabs_flushed;
+       e->lse_msdcount += metaslabs_dirty;
        e->lse_blkcount += nblocks;
 }
 
 static void
 spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
 {
-       summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks);
+       summary_add_data(spa, spa_syncing_txg(spa), 0, 0, nblocks);
 }
 
 void
-spa_log_summary_add_flushed_metaslab(spa_t *spa)
+spa_log_summary_add_flushed_metaslab(spa_t *spa, boolean_t dirty)
 {
-       summary_add_data(spa, spa_syncing_txg(spa), 1, 0);
+       summary_add_data(spa, spa_syncing_txg(spa), 1, dirty ? 1 : 0, 0);
+}
+
+void
+spa_log_summary_dirty_flushed_metaslab(spa_t *spa, uint64_t txg)
+{
+       log_summary_entry_t *target = NULL;
+       for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+           e != NULL; e = list_next(&spa->spa_log_summary, e)) {
+               if (e->lse_start > txg)
+                       break;
+               target = e;
+       }
+       ASSERT3P(target, !=, NULL);
+       ASSERT3U(target->lse_mscount, !=, 0);
+       target->lse_msdcount++;
 }
 
 /*
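
summary_add_data() now records each entry's TXG span. Note that lse_txgcount counts only TXGs that actually logged data, so it can be smaller than lse_end - lse_start + 1. A small sketch of that bookkeeping (span_add_txg is a hypothetical name used for illustration):

#include <stdint.h>
#include <stdio.h>

/* The TXG-span fields this patch adds to log_summary_entry_t. */
typedef struct {
        uint64_t lse_start;     /* first TXG in the entry */
        uint64_t lse_end;       /* last TXG in the entry */
        uint64_t lse_txgcount;  /* TXGs that contributed data */
} span_t;

/* Mirror of the lse_end/lse_txgcount updates in summary_add_data(). */
static void
span_add_txg(span_t *e, uint64_t txg)
{
        if (e->lse_txgcount == 0) {             /* fresh entry */
                e->lse_start = e->lse_end = txg;
                e->lse_txgcount = 1;
        } else if (e->lse_end < txg) {          /* new, later TXG */
                e->lse_end = txg;
                e->lse_txgcount++;
        }                                       /* same TXG: no-op */
}

int
main(void)
{
        span_t e = { 0 };
        uint64_t txgs[] = { 100, 100, 101, 103, 103 };
        for (int i = 0; i < 5; i++)
                span_add_txg(&e, txgs[i]);
        /* Spans 100..103, but only 3 distinct TXGs logged data. */
        printf("start=%llu end=%llu txgcount=%llu\n",
            (unsigned long long)e.lse_start,
            (unsigned long long)e.lse_end,
            (unsigned long long)e.lse_txgcount);
        return (0);
}
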
@@ -630,6 +669,11 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
        int64_t available_blocks =
            spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;
 
+       int64_t available_txgs = zfs_unflushed_log_txg_max;
+       for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+           e; e = list_next(&spa->spa_log_summary, e))
+               available_txgs -= e->lse_txgcount;
+
        /*
         * This variable tells us the total number of flushes needed to
         * keep the log size within the limit when we reach txgs_in_future.
@@ -637,9 +681,7 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
        uint64_t total_flushes = 0;
 
        /* Holds the current maximum of our estimates so far. */
-       uint64_t max_flushes_pertxg =
-           MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed),
-           zfs_min_metaslabs_to_flush);
+       uint64_t max_flushes_pertxg = zfs_min_metaslabs_to_flush;
 
        /*
         * For our estimations we only look as far in the future
@@ -653,11 +695,14 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
                 * then keep skipping TXGs accumulating more blocks
                 * based on the incoming rate until we exceed it.
                 */
-               if (available_blocks >= 0) {
-                       uint64_t skip_txgs = (available_blocks / incoming) + 1;
+               if (available_blocks >= 0 && available_txgs >= 0) {
+                       uint64_t skip_txgs = MIN(available_txgs + 1,
+                           (available_blocks / incoming) + 1);
                        available_blocks -= (skip_txgs * incoming);
+                       available_txgs -= skip_txgs;
                        txgs_in_future += skip_txgs;
                        ASSERT3S(available_blocks, >=, -incoming);
+                       ASSERT3S(available_txgs, >=, -1);
                }
 
                /*
@@ -666,9 +711,10 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
                 * based on the current entry in the summary, updating
                 * our available_blocks.
                 */
-               ASSERT3S(available_blocks, <, 0);
+               ASSERT(available_blocks < 0 || available_txgs < 0);
                available_blocks += e->lse_blkcount;
-               total_flushes += e->lse_mscount;
+               available_txgs += e->lse_txgcount;
+               total_flushes += e->lse_msdcount;
 
                /*
                 * Keep the running maximum of the total_flushes that
@@ -680,8 +726,6 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
                 */
                max_flushes_pertxg = MAX(max_flushes_pertxg,
                    DIV_ROUND_UP(total_flushes, txgs_in_future));
-               ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
-                   max_flushes_pertxg);
        }
        return (max_flushes_pertxg);
 }
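
The estimator now walks the summary against two budgets at once, remaining log blocks and remaining dirty TXGs, and counts only dirty-metaslab flushes (lse_msdcount). A self-contained sketch with made-up inputs; the txgs_in_future guard before the division is added for the sketch's safety and is not part of the kernel code:

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b)               ((a) < (b) ? (a) : (b))
#define MAX(a, b)               ((a) > (b) ? (a) : (b))
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

/* Per-row summary data the estimate consumes. */
typedef struct {
        int64_t blkcount;       /* log blocks attributed to the row */
        int64_t txgcount;       /* TXGs the row spans */
        uint64_t msdcount;      /* dirty metaslabs flushed in the row */
} row_t;

/* incoming must be > 0, as in the kernel caller. */
static uint64_t
estimate(const row_t *rows, int nrows, int64_t blocklimit, int64_t nblocks,
    int64_t incoming, int64_t txg_max, uint64_t min_flushes)
{
        int64_t available_blocks = blocklimit - nblocks - incoming;
        int64_t available_txgs = txg_max;
        for (int i = 0; i < nrows; i++)
                available_txgs -= rows[i].txgcount;

        uint64_t total_flushes = 0, txgs_in_future = 0;
        uint64_t max_flushes_pertxg = min_flushes;

        for (int i = 0; i < nrows; i++) {
                /* Skip TXGs while both budgets last. */
                if (available_blocks >= 0 && available_txgs >= 0) {
                        int64_t skip = MIN(available_txgs + 1,
                            available_blocks / incoming + 1);
                        available_blocks -= skip * incoming;
                        available_txgs -= skip;
                        txgs_in_future += skip;
                }
                /* Reclaim this row's budgets by flushing its metaslabs. */
                available_blocks += rows[i].blkcount;
                available_txgs += rows[i].txgcount;
                total_flushes += rows[i].msdcount;
                if (txgs_in_future > 0)         /* sketch-only guard */
                        max_flushes_pertxg = MAX(max_flushes_pertxg,
                            DIV_ROUND_UP(total_flushes, txgs_in_future));
        }
        return (max_flushes_pertxg);
}

int
main(void)
{
        row_t rows[] = {
                { 2000, 100, 50 }, { 1500, 100, 40 }, { 2500, 100, 60 },
        };
        printf("flush >= %llu metaslabs this TXG\n", (unsigned long long)
            estimate(rows, 3, 4000, 3500, 20, 1000, 1));
        return (0);
}
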
@@ -771,14 +815,11 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
        uint64_t want_to_flush;
        if (spa_flush_all_logs_requested(spa)) {
                ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
-               want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed);
+               want_to_flush = UINT64_MAX;
        } else {
                want_to_flush = spa_estimate_metaslabs_to_flush(spa);
        }
 
-       ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
-           want_to_flush);
-
        /* Used purely for verification purposes */
        uint64_t visited = 0;
 
@@ -809,31 +850,22 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
                if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
                        break;
 
-               mutex_enter(&curr->ms_sync_lock);
-               mutex_enter(&curr->ms_lock);
-               boolean_t flushed = metaslab_flush(curr, tx);
-               mutex_exit(&curr->ms_lock);
-               mutex_exit(&curr->ms_sync_lock);
-
-               /*
-                * If we failed to flush a metaslab (because it was loading),
-                * then we are done with the block heuristic as it's not
-                * possible to destroy any log space maps once you've skipped
-                * a metaslab. In that case we just set our counter to 0 but
-                * we continue looping in case there is still memory pressure
-                * due to unflushed changes. Note that, flushing a metaslab
-                * that is not the oldest flushed in the pool, will never
-                * destroy any log space maps [see spa_cleanup_old_sm_logs()].
-                */
-               if (!flushed) {
-                       want_to_flush = 0;
-               } else if (want_to_flush > 0) {
-                       want_to_flush--;
-               }
+               if (metaslab_unflushed_dirty(curr)) {
+                       mutex_enter(&curr->ms_sync_lock);
+                       mutex_enter(&curr->ms_lock);
+                       metaslab_flush(curr, tx);
+                       mutex_exit(&curr->ms_lock);
+                       mutex_exit(&curr->ms_sync_lock);
+                       if (want_to_flush > 0)
+                               want_to_flush--;
+               } else
+                       metaslab_unflushed_bump(curr, tx, B_FALSE);
 
                visited++;
        }
        ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
+
+       spa_log_sm_set_blocklimit(spa);
 }
 
 /*
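
The rewritten loop flushes dirty metaslabs and charges them against the budget, while clean metaslabs only get their unflushed TXG bumped so older log blocks become reclaimable; UINT64_MAX is the new "flush everything" budget used at export. A toy rendering of that control flow, with locking and the memory-pressure check omitted:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy metaslab: a dirty flag plus the TXG of its oldest unflushed data. */
typedef struct {
        bool dirty;
        uint64_t unflushed_txg;
} ms_t;

static void
flush_pass(ms_t *ms, int n, uint64_t want_to_flush, uint64_t cur_txg)
{
        for (int i = 0; i < n; i++) {
                if (want_to_flush == 0)
                        break;
                if (ms[i].dirty) {
                        /* metaslab_flush(): write out unflushed changes. */
                        ms[i].dirty = false;
                        ms[i].unflushed_txg = cur_txg;
                        if (want_to_flush > 0)
                                want_to_flush--;
                } else {
                        /* metaslab_unflushed_bump(): advance TXG only. */
                        ms[i].unflushed_txg = cur_txg;
                }
        }
}

int
main(void)
{
        ms_t ms[4] = {
                { true, 90 }, { false, 91 }, { true, 92 }, { false, 93 },
        };
        flush_pass(ms, 4, 2, 100);      /* budget of two flushes */
        for (int i = 0; i < 4; i++)
                printf("ms%d dirty=%d unflushed_txg=%llu\n", i, ms[i].dirty,
                    (unsigned long long)ms[i].unflushed_txg);
        return (0);
}
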
@@ -904,6 +936,7 @@ spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx)
                avl_remove(&spa->spa_sm_logs_by_txg, sls);
                space_map_free_obj(mos, sls->sls_sm_obj, tx);
                VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx));
+               spa_log_summary_decrement_blkcount(spa, sls->sls_nblocks);
                spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks;
                kmem_free(sls, sizeof (spa_log_sm_t));
        }
@@ -963,12 +996,7 @@ spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx)
        VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj,
            0, UINT64_MAX, SPA_MINBLOCKSHIFT));
 
-       /*
-        * If the log space map feature was just enabled, the blocklimit
-        * has not yet been set.
-        */
-       if (spa_log_sm_blocklimit(spa) == 0)
-               spa_log_sm_set_blocklimit(spa);
+       spa_log_sm_set_blocklimit(spa);
 }
 
 /*
@@ -1094,12 +1122,18 @@ spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg)
                panic("invalid maptype_t");
                break;
        }
+       if (!metaslab_unflushed_dirty(ms)) {
+               metaslab_set_unflushed_dirty(ms, B_TRUE);
+               spa_log_summary_dirty_flushed_metaslab(spa,
+                   metaslab_unflushed_txg(ms));
+       }
        return (0);
 }
 
 static int
 spa_ld_log_sm_data(spa_t *spa)
 {
+       spa_log_sm_t *sls, *psls;
        int error = 0;
 
        /*
@@ -1113,41 +1147,71 @@ spa_ld_log_sm_data(spa_t *spa)
        ASSERT0(spa->spa_unflushed_stats.sus_memused);
 
        hrtime_t read_logs_starttime = gethrtime();
-       /* this is a no-op when we don't have space map logs */
-       for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
-           sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
-               space_map_t *sm = NULL;
-               error = space_map_open(&sm, spa_meta_objset(spa),
-                   sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT);
-               if (error != 0) {
-                       spa_load_failed(spa, "spa_ld_log_sm_data(): failed at "
-                           "space_map_open(obj=%llu) [error %d]",
-                           (u_longlong_t)sls->sls_sm_obj, error);
-                       goto out;
+
+       /* Prefetch log spacemaps dnodes. */
+       for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls;
+           sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+               dmu_prefetch(spa_meta_objset(spa), sls->sls_sm_obj,
+                   0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+       }
+
+       uint_t pn = 0;          /* log spacemaps prefetched ahead */
+       uint64_t ps = 0;        /* bytes of spacemaps prefetched ahead */
+       psls = sls = avl_first(&spa->spa_sm_logs_by_txg);
+       while (sls != NULL) {
+               /* Prefetch up to 16 spacemaps or 2 * dmu_prefetch_max bytes ahead. */
+               if (psls != NULL && pn < 16 &&
+                   (pn < 2 || ps < 2 * dmu_prefetch_max)) {
+                       error = space_map_open(&psls->sls_sm,
+                           spa_meta_objset(spa), psls->sls_sm_obj, 0,
+                           UINT64_MAX, SPA_MINBLOCKSHIFT);
+                       if (error != 0) {
+                               spa_load_failed(spa, "spa_ld_log_sm_data(): "
+                                   "failed at space_map_open(obj=%llu) "
+                                   "[error %d]",
+                                   (u_longlong_t)psls->sls_sm_obj, error);
+                               goto out;
+                       }
+                       dmu_prefetch(spa_meta_objset(spa), psls->sls_sm_obj,
+                           0, 0, space_map_length(psls->sls_sm),
+                           ZIO_PRIORITY_ASYNC_READ);
+                       pn++;
+                       ps += space_map_length(psls->sls_sm);
+                       psls = AVL_NEXT(&spa->spa_sm_logs_by_txg, psls);
+                       continue;
                }
 
+               /* Load TXG log spacemap into ms_unflushed_allocs/frees. */
+               cond_resched();
+               ASSERT0(sls->sls_nblocks);
+               sls->sls_nblocks = space_map_nblocks(sls->sls_sm);
+               spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
+               summary_add_data(spa, sls->sls_txg,
+                   sls->sls_mscount, 0, sls->sls_nblocks);
+
                struct spa_ld_log_sm_arg vla = {
                        .slls_spa = spa,
                        .slls_txg = sls->sls_txg
                };
-               error = space_map_iterate(sm, space_map_length(sm),
-                   spa_ld_log_sm_cb, &vla);
+               error = space_map_iterate(sls->sls_sm,
+                   space_map_length(sls->sls_sm), spa_ld_log_sm_cb, &vla);
                if (error != 0) {
-                       space_map_close(sm);
                        spa_load_failed(spa, "spa_ld_log_sm_data(): failed "
                            "at space_map_iterate(obj=%llu) [error %d]",
                            (u_longlong_t)sls->sls_sm_obj, error);
                        goto out;
                }
 
-               ASSERT0(sls->sls_nblocks);
-               sls->sls_nblocks = space_map_nblocks(sm);
-               spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
-               summary_add_data(spa, sls->sls_txg,
-                   sls->sls_mscount, sls->sls_nblocks);
+               pn--;
+               ps -= space_map_length(sls->sls_sm);
+               space_map_close(sls->sls_sm);
+               sls->sls_sm = NULL;
+               sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls);
 
-               space_map_close(sm);
+               /* Update the log block limit to reflect the just-loaded map. */
+               spa_log_sm_set_blocklimit(spa);
        }
+
        hrtime_t read_logs_endtime = gethrtime();
        spa_load_note(spa,
            "read %llu log space maps (%llu total blocks - blksz = %llu bytes) "
@@ -1157,6 +1221,18 @@ spa_ld_log_sm_data(spa_t *spa)
            (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000));
 
 out:
+       if (error != 0) {
+               for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
+                   sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+                       if (sls->sls_sm) {
+                               space_map_close(sls->sls_sm);
+                               sls->sls_sm = NULL;
+                       }
+               }
+       } else {
+               ASSERT0(pn);
+               ASSERT0(ps);
+       }
        /*
         * Now that the metaslabs contain their unflushed changes:
         * [1] recalculate their actual allocated space
@@ -1237,6 +1313,9 @@ spa_ld_unflushed_txgs(vdev_t *vd)
                }
 
                ms->ms_unflushed_txg = entry.msp_unflushed_txg;
+               ms->ms_unflushed_dirty = B_FALSE;
+               ASSERT(range_tree_is_empty(ms->ms_unflushed_allocs));
+               ASSERT(range_tree_is_empty(ms->ms_unflushed_frees));
                if (ms->ms_unflushed_txg != 0) {
                        mutex_enter(&spa->spa_flushed_ms_lock);
                        avl_add(&spa->spa_metaslabs_by_flushed, ms);
@@ -1300,6 +1379,10 @@ ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, ULONG, ZMOD_RW,
        "Lower-bound limit for the maximum amount of blocks allowed in "
        "log spacemap (see zfs_unflushed_log_block_max)");
 
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, ULONG, ZMOD_RW,
+    "Hard limit (upper-bound) in the size of the space map log "
+    "in terms of dirty TXGs.");
+
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW,
        "Tunable used to determine the number of blocks that can be used for "
        "the spacemap log, expressed as a percentage of the total number of "