ddt: block scan until log is flushed, and flush aggressively

author Rob Norris <rob.norris@klarasystems.com>

Mon, 16 Oct 2023 00:52:17 +0000 (11:52 +1100)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Fri, 16 Aug 2024 19:03:43 +0000 (12:03 -0700)
author Rob Norris <rob.norris@klarasystems.com>
Mon, 16 Oct 2023 00:52:17 +0000 (11:52 +1100)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Fri, 16 Aug 2024 19:03:43 +0000 (12:03 -0700)
diff --git a/include/sys/ddt.h b/include/sys/ddt.h

index 2fc798725eda7a2b660b669010774b26bdab515b..a7920e658062bdbefacba3a1d88093ba9ef8449f 100644 (file)
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -294,6 +294,8 @@ typedef struct {
         int32_t         ddt_log_flush_rate;     /* rolling log flush rate */
         int32_t         ddt_log_flush_time_rate; /* avg time spent flushing */
  
+       uint64_t        ddt_flush_force_txg;    /* flush hard before this txg */
+
         enum zio_checksum ddt_checksum; /* checksum algorithm in use */
         spa_t           *ddt_spa;       /* pool this ddt is on */
         objset_t        *ddt_os;        /* ddt objset (always MOS) */
@@ -393,6 +395,9 @@ extern void ddt_create(spa_t *spa);
  extern int ddt_load(spa_t *spa);
  extern void ddt_unload(spa_t *spa);
  extern void ddt_sync(spa_t *spa, uint64_t txg);
+
+extern void ddt_walk_init(spa_t *spa, uint64_t txg);
+extern boolean_t ddt_walk_ready(spa_t *spa);
  extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,
      ddt_lightweight_entry_t *ddlwe);
  
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c

index ce5c4efb51edd0323cac8299ed59116f467e3bed..051005f137bde6b5f66e481a5c9c193c66adbf5a 100644 (file)
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -183,6 +183,12 @@
   * position on the object even if the object changes, the pool is exported, or
   * OpenZFS is upgraded.
   *
+ * If the "fast_dedup" feature is enabled and the table has a log, the scan
+ * cannot begin until entries on the log are flushed, as the on-disk log has no
+ * concept of a "stable position". Instead, the log flushing process will enter
+ * a more aggressive mode, to flush out as much as is necessary as soon as
+ * possible, in order to begin the scan as soon as possible.
+ *
   * ## Interaction with block cloning
   *
   * If block cloning and dedup are both enabled on a pool, BRT will look for the
@@ -1746,6 +1752,16 @@ ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx)
                         ddt->ddt_flush_min = MAX(
                             ddt->ddt_log_ingest_rate,
                             zfs_dedup_log_flush_entries_min);
+
+                       /*
+                        * If we've been asked to flush everything in a hurry,
+                        * try to dump as much as possible on this txg. In
+                        * this case we're only limited by time, not amount.
+                        */
+                       if (ddt->ddt_flush_force_txg > 0)
+                               ddt->ddt_flush_min =
+                                   MAX(ddt->ddt_flush_min, avl_numnodes(
+                                   &ddt->ddt_log_flushing->ddl_tree));
                 } else {
                         /* We already decided we're done for this txg */
                         return (B_FALSE);
@@ -1856,6 +1872,40 @@ ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx)
         return (ddt->ddt_flush_pass == 0);
  }
  
+/*
+ * Set, keep or clear the table's forced-flush target, ddt_flush_force_txg.
+ *
+ * A non-zero txg requests a forced flush for that txg. A txg of 0 means
+ * "re-evaluate the existing request": if no request is outstanding nothing
+ * happens, otherwise the stored target txg is checked against the logs
+ * again and cleared once neither log still qualifies.
+ */
+static inline void
+ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg)
+{
+       /*
+        * If we're not forcing flush, and not being asked to start, then
+        * there's nothing more to do.
+        */
+       if (txg == 0) {
+               /* Update requested, are we currently forcing flush? */
+               if (ddt->ddt_flush_force_txg == 0)
+                       return;
+               txg = ddt->ddt_flush_force_txg;
+       }
+
+       /*
+        * If either of the logs is non-empty and its first txg is at or
+        * before the wanted txg, set the force txg, otherwise clear it.
+        */
+
+       if ((!avl_is_empty(&ddt->ddt_log_active->ddl_tree) &&
+           ddt->ddt_log_active->ddl_first_txg <= txg) ||
+           (!avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
+           ddt->ddt_log_flushing->ddl_first_txg <= txg)) {
+               ddt->ddt_flush_force_txg = txg;
+               return;
+       }
+
+       /*
+        * Nothing to flush behind the given txg, so we can clear force flush
+        * state.
+        */
+       ddt->ddt_flush_force_txg = 0;
+}
+
  static void
  ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
  {
@@ -1881,6 +1931,9 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
                 (void) ddt_log_swap(ddt, tx);
         }
  
+       /* If force flush is no longer necessary, turn it off. */
+       ddt_flush_force_update_txg(ddt, 0);
+
         /*
          * Update flush rate. This is an exponential weighted moving average of
          * the number of entries flushed over recent txgs.
@@ -2049,6 +2102,38 @@ ddt_sync(spa_t *spa, uint64_t txg)
         dmu_tx_commit(tx);
  }
  
+/*
+ * Prepare the pool's dedup tables for a walk covering entries up to txg.
+ *
+ * For every checksum's table that exists and has DDT_FLAG_LOG set, this
+ * calls ddt_flush_force_update_txg() with the table entered, which sets or
+ * clears that table's forced-flush target. If txg is 0, the pool's
+ * currently syncing txg is used instead.
+ */
+void
+ddt_walk_init(spa_t *spa, uint64_t txg)
+{
+       /* No explicit txg given; fall back to the syncing txg. */
+       if (txg == 0)
+               txg = spa_syncing_txg(spa);
+
+       for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+               ddt_t *ddt = spa->spa_ddt[c];
+               /* Skip missing tables and tables that have no log. */
+               if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
+                       continue;
+
+               ddt_enter(ddt);
+               ddt_flush_force_update_txg(ddt, txg);
+               ddt_exit(ddt);
+       }
+}
+
+/*
+ * Report whether the pool's dedup tables can be walked right now.
+ *
+ * Returns B_FALSE if any existing table with DDT_FLAG_LOG set still has a
+ * non-zero ddt_flush_force_txg (a forced flush is outstanding), and B_TRUE
+ * otherwise.
+ *
+ * NOTE(review): ddt_flush_force_txg is read here without ddt_enter();
+ * confirm that callers only need an advisory answer.
+ */
+boolean_t
+ddt_walk_ready(spa_t *spa)
+{
+       for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+               ddt_t *ddt = spa->spa_ddt[c];
+               /* Skip missing tables and tables that have no log. */
+               if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
+                       continue;
+
+               /* A forced flush is still outstanding on this table. */
+               if (ddt->ddt_flush_force_txg > 0)
+                       return (B_FALSE);
+       }
+
+       return (B_TRUE);
+}
+
  int
  ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
  {
@@ -2058,6 +2143,10 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
                                 ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
                                 if (ddt == NULL)
                                         continue;
+
+                               if (ddt->ddt_flush_force_txg > 0)
+                                       return (EAGAIN);
+
                                 int error = ENOENT;
                                 if (ddt_object_exists(ddt, ddb->ddb_type,
                                     ddb->ddb_class)) {
diff --git a/module/zfs/ddt_log.c b/module/zfs/ddt_log.c

index 7e7ff9e5b89f90971e80fb8200c9a1b9930d8bd8..a367d0cd02f8890e1b731c511a1a875f3b85a2f7 100644 (file)
--- a/module/zfs/ddt_log.c
+++ b/module/zfs/ddt_log.c
@@ -435,7 +435,8 @@ ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
         /*
          * Swap policy. We swap the logs (and so begin flushing) when the
          * active tree grows too large, or when we haven't swapped it in
-        * some amount of time.
+        * some amount of time, or if something has requested the logs be
+        * flushed ASAP (see ddt_walk_init()).
          */
  
         /*
@@ -452,7 +453,10 @@ ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
             (ddt->ddt_log_active->ddl_first_txg +
             MAX(1, zfs_dedup_log_txg_max));
  
-       if (!(too_large || too_old))
+       const boolean_t force =
+           ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;
+
+       if (!(too_large || too_old || force))
                 return (B_FALSE);
  
         ddt_log_t *swap = ddt->ddt_log_active;
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c

index daf1bd5d637b963187195c982bebbd01b482dc0b..9d040e146308a8ab9816f5b8d804e4352247dd21 100644 (file)
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -630,6 +630,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
                 zap_cursor_fini(&zc);
         }
  
+       ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
+
         spa_scan_stat_init(spa);
         vdev_scan_stat_init(spa->spa_root_vdev);
  
@@ -951,6 +953,8 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
  
         memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));
  
+       ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
+
         dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
  
         spa_history_log_internal(spa, "scan setup", tx,
@@ -1636,7 +1640,8 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
             txg_sync_waiting(scn->scn_dp) ||
             NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
             spa_shutting_down(scn->scn_dp->dp_spa) ||
-           (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
+           (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn)) ||
+           !ddt_walk_ready(scn->scn_dp->dp_spa)) {
                 if (zb && zb->zb_level == ZB_ROOT_LEVEL) {
                         dprintf("suspending at first available bookmark "
                             "%llx/%llx/%llx/%llx\n",
@@ -3029,9 +3034,21 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
                         break;
         }
  
-       zfs_dbgmsg("scanned %llu ddt entries on %s with class_max = %u; "
-           "suspending=%u", (longlong_t)n, scn->scn_dp->dp_spa->spa_name,
-           (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
+       if (error == EAGAIN) {
+               dsl_scan_check_suspend(scn, NULL);
+               error = 0;
+
+               zfs_dbgmsg("waiting for ddt to become ready for scan "
+                   "on %s with class_max = %u; suspending=%u",
+                   scn->scn_dp->dp_spa->spa_name,
+                   (int)scn->scn_phys.scn_ddt_class_max,
+                   (int)scn->scn_suspending);
+       } else
+               zfs_dbgmsg("scanned %llu ddt entries on %s with "
+                   "class_max = %u; suspending=%u", (longlong_t)n,
+                   scn->scn_dp->dp_spa->spa_name,
+                   (int)scn->scn_phys.scn_ddt_class_max,
+                   (int)scn->scn_suspending);
  
         ASSERT(error == 0 || error == ENOENT);
         ASSERT(error != ENOENT ||
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh

index a96a38ff178a85e0bc1e6ddfc54b52d1a32a6a75..474f41eae8f3b9d793f076b0ebb9382d39b95005 100755 (executable)
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh
@@ -95,6 +95,10 @@ while (( i < 16384 )); do
         done
         ((i += 1))
  done
+
+# Force the DDT logs to disk with a scrub so they can be prefetched
+log_must zpool scrub -w $TESTPOOL
+
  log_note "Dataset generation completed."
  
  typeset -A generated
author	Rob Norris <rob.norris@klarasystems.com>
	Mon, 16 Oct 2023 00:52:17 +0000 (11:52 +1100)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Fri, 16 Aug 2024 19:03:43 +0000 (12:03 -0700)
include/sys/ddt.h		patch \| blob \| blame \| history
module/zfs/ddt.c		patch \| blob \| blame \| history
module/zfs/ddt_log.c		patch \| blob \| blame \| history
module/zfs/dsl_scan.c		patch \| blob \| blame \| history
tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh		patch \| blob \| blame \| history