int32_t ddt_log_flush_rate; /* rolling log flush rate */
int32_t ddt_log_flush_time_rate; /* avg time spent flushing */
+ uint64_t ddt_flush_force_txg; /* flush hard before this txg */
+
enum zio_checksum ddt_checksum; /* checksum algorithm in use */
spa_t *ddt_spa; /* pool this ddt is on */
objset_t *ddt_os; /* ddt objset (always MOS) */
extern int ddt_load(spa_t *spa);
extern void ddt_unload(spa_t *spa);
extern void ddt_sync(spa_t *spa, uint64_t txg);
+
+extern void ddt_walk_init(spa_t *spa, uint64_t txg);
+extern boolean_t ddt_walk_ready(spa_t *spa);
extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,
ddt_lightweight_entry_t *ddlwe);
* position on the object even if the object changes, the pool is exported, or
* OpenZFS is upgraded.
*
+ * If the "fast_dedup" feature is enabled and the table has a log, the scan
+ * cannot begin until entries on the log are flushed, as the on-disk log has no
+ * concept of a "stable position". Instead, the log flushing process will enter
+ * a more aggressive mode, flushing out as much as necessary each txg so
+ * that the scan can begin as soon as possible.
+ *
* ## Interaction with block cloning
*
* If block cloning and dedup are both enabled on a pool, BRT will look for the
ddt->ddt_flush_min = MAX(
ddt->ddt_log_ingest_rate,
zfs_dedup_log_flush_entries_min);
+
+ /*
+ * If we've been asked to flush everything in a hurry,
+ * try to dump as much as possible on this txg. In
+ * this case we're only limited by time, not amount.
+ */
+ if (ddt->ddt_flush_force_txg > 0)
+ ddt->ddt_flush_min = MAX(ddt->ddt_flush_min,
+ avl_numnodes(&ddt->ddt_log_flushing->ddl_tree));
} else {
/* We already decided we're done for this txg */
return (B_FALSE);
return (ddt->ddt_flush_pass == 0);
}
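+/*
+ * Set, update or clear the force-flush state for this table. A non-zero
+ * txg asks that all log entries from before that txg be flushed as soon
+ * as possible; txg == 0 re-checks (and clears, once satisfied) whatever
+ * force state is already in effect.
+ */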
+static inline void
+ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg)
+{
+ /*
+ * If we're not forcing flush, and not being asked to start, then
+ * there's nothing more to do.
+ */
+ if (txg == 0) {
+ /* Update requested, are we currently forcing flush? */
+ if (ddt->ddt_flush_force_txg == 0)
+ return;
+ txg = ddt->ddt_flush_force_txg;
+ }
+
+ /*
+ * If either log has unflushed entries from before the given txg, set
+ * the force txg; otherwise clear it.
+ */
+
+ if ((!avl_is_empty(&ddt->ddt_log_active->ddl_tree) &&
+ ddt->ddt_log_active->ddl_first_txg <= txg) ||
+ (!avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
+ ddt->ddt_log_flushing->ddl_first_txg <= txg)) {
+ ddt->ddt_flush_force_txg = txg;
+ return;
+ }
+
+ /*
+ * Nothing to flush behind the given txg, so we can clear force flush
+ * state.
+ */
+ ddt->ddt_flush_force_txg = 0;
+}
+
static void
ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
{
(void) ddt_log_swap(ddt, tx);
}
+ /* If force flush is no longer necessary, turn it off. */
+ ddt_flush_force_update_txg(ddt, 0);
+
/*
* Update flush rate. This is an exponential weighted moving average of
* the number of entries flushed over recent txgs.
dmu_tx_commit(tx);
}
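+/*
+ * Prepare all tables for a walk beginning at the given txg (0 means the
+ * currently syncing txg). A log-based table has no stable position to
+ * walk from, so ask it to force-flush everything logged before that txg.
+ */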
+void
+ddt_walk_init(spa_t *spa, uint64_t txg)
+{
+ if (txg == 0)
+ txg = spa_syncing_txg(spa);
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
+ continue;
+
+ ddt_enter(ddt);
+ ddt_flush_force_update_txg(ddt, txg);
+ ddt_exit(ddt);
+ }
+}
+
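+/*
+ * Return B_TRUE once no table is still force-flushing, that is, once the
+ * walk (and so the scan) can proceed.
+ */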
+boolean_t
+ddt_walk_ready(spa_t *spa)
+{
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG))
+ continue;
+
+ if (ddt->ddt_flush_force_txg > 0)
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
int
ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
{
ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
if (ddt == NULL)
continue;
+
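+ /* Log still force-flushing; no stable position to walk from yet */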
+ if (ddt->ddt_flush_force_txg > 0)
+ return (EAGAIN);
+
int error = ENOENT;
if (ddt_object_exists(ddt, ddb->ddb_type,
ddb->ddb_class)) {
zap_cursor_fini(&zc);
}
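+ /* Ask DDTs to force-flush their logs so the table walk can begin */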
+ ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
+
spa_scan_stat_init(spa);
vdev_scan_stat_init(spa->spa_root_vdev);
memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));
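+ /* Re-arm DDT log force-flush up to the scan's max txg */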
+ ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
+
dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
spa_history_log_internal(spa, "scan setup", tx,
txg_sync_waiting(scn->scn_dp) ||
NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
spa_shutting_down(scn->scn_dp->dp_spa) ||
- (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
+ (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn)) ||
+ !ddt_walk_ready(scn->scn_dp->dp_spa)) {
if (zb && zb->zb_level == ZB_ROOT_LEVEL) {
dprintf("suspending at first available bookmark "
"%llx/%llx/%llx/%llx\n",
break;
}
- zfs_dbgmsg("scanned %llu ddt entries on %s with class_max = %u; "
- "suspending=%u", (longlong_t)n, scn->scn_dp->dp_spa->spa_name,
- (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
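+ /* EAGAIN means a DDT log is still being flushed; suspend and retry */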
+ if (error == EAGAIN) {
+ dsl_scan_check_suspend(scn, NULL);
+ error = 0;
+
+ zfs_dbgmsg("waiting for ddt to become ready for scan "
+ "on %s with class_max = %u; suspending=%u",
+ scn->scn_dp->dp_spa->spa_name,
+ (int)scn->scn_phys.scn_ddt_class_max,
+ (int)scn->scn_suspending);
+ } else {
+ zfs_dbgmsg("scanned %llu ddt entries on %s with "
+ "class_max = %u; suspending=%u", (longlong_t)n,
+ scn->scn_dp->dp_spa->spa_name,
+ (int)scn->scn_phys.scn_ddt_class_max,
+ (int)scn->scn_suspending);
+ }
ASSERT(error == 0 || error == ENOENT);
ASSERT(error != ENOENT ||