* terms of performance. Thus we have a hard limit in the size of the log in
* terms of blocks.
*/
-static unsigned long zfs_unflushed_log_block_max = (1ULL << 18);
+static unsigned long zfs_unflushed_log_block_max = (1ULL << 17);
+
+/*
+ * Also we have a hard limit in the size of the log in terms of dirty TXGs.
+ */
+static unsigned long zfs_unflushed_log_txg_max = 1000;
/*
* Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
return;
}
- uint64_t calculated_limit =
- (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100;
- spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit,
+ uint64_t msdcount = 0;
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e; e = list_next(&spa->spa_log_summary, e))
+ msdcount += e->lse_msdcount;
+
+ uint64_t limit = msdcount * zfs_unflushed_log_block_pct / 100;
+ spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(limit,
zfs_unflushed_log_block_min), zfs_unflushed_log_block_max);
}
}
static boolean_t
-summary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
+summary_entry_is_full(spa_t *spa, log_summary_entry_t *e, uint64_t txg)
{
+ if (e->lse_end == txg)
+ return (0);
+ if (e->lse_txgcount >= DIV_ROUND_UP(zfs_unflushed_log_txg_max,
+ zfs_max_logsm_summary_length))
+ return (1);
uint64_t blocks_per_row = MAX(1,
DIV_ROUND_UP(spa_log_sm_blocklimit(spa),
zfs_max_logsm_summary_length));
* the metaslab.
*/
void
-spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
+spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg, boolean_t dirty)
{
/*
* We don't track summary data for read-only pools and this function
}
target->lse_mscount--;
+ if (dirty)
+ target->lse_msdcount--;
}
/*
void
spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
{
- for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
- e != NULL; e = list_head(&spa->spa_log_summary)) {
+ log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ if (e->lse_txgcount > 0)
+ e->lse_txgcount--;
+ for (; e != NULL; e = list_head(&spa->spa_log_summary)) {
if (e->lse_blkcount > blocks_gone) {
/*
* Assert that we stopped at an entry that is not
static void
summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
- uint64_t nblocks)
+ uint64_t metaslabs_dirty, uint64_t nblocks)
{
log_summary_entry_t *e = list_tail(&spa->spa_log_summary);
- if (e == NULL || summary_entry_is_full(spa, e)) {
+ if (e == NULL || summary_entry_is_full(spa, e, txg)) {
e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP);
- e->lse_start = txg;
+ e->lse_start = e->lse_end = txg;
+ e->lse_txgcount = 1;
list_insert_tail(&spa->spa_log_summary, e);
}
ASSERT3U(e->lse_start, <=, txg);
+ if (e->lse_end < txg) {
+ e->lse_end = txg;
+ e->lse_txgcount++;
+ }
e->lse_mscount += metaslabs_flushed;
+ e->lse_msdcount += metaslabs_dirty;
e->lse_blkcount += nblocks;
}
static void
spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
{
- summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks);
+ summary_add_data(spa, spa_syncing_txg(spa), 0, 0, nblocks);
}
void
-spa_log_summary_add_flushed_metaslab(spa_t *spa)
+spa_log_summary_add_flushed_metaslab(spa_t *spa, boolean_t dirty)
{
- summary_add_data(spa, spa_syncing_txg(spa), 1, 0);
+ summary_add_data(spa, spa_syncing_txg(spa), 1, dirty ? 1 : 0, 0);
+}
+
+void
+spa_log_summary_dirty_flushed_metaslab(spa_t *spa, uint64_t txg)
+{
+ log_summary_entry_t *target = NULL;
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e != NULL; e = list_next(&spa->spa_log_summary, e)) {
+ if (e->lse_start > txg)
+ break;
+ target = e;
+ }
+ ASSERT3P(target, !=, NULL);
+ ASSERT3U(target->lse_mscount, !=, 0);
+ target->lse_msdcount++;
}
/*
int64_t available_blocks =
spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;
+ int64_t available_txgs = zfs_unflushed_log_txg_max;
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e; e = list_next(&spa->spa_log_summary, e))
+ available_txgs -= e->lse_txgcount;
+
/*
* This variable tells us the total number of flushes needed to
* keep the log size within the limit when we reach txgs_in_future.
uint64_t total_flushes = 0;
/* Holds the current maximum of our estimates so far. */
- uint64_t max_flushes_pertxg =
- MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed),
- zfs_min_metaslabs_to_flush);
+ uint64_t max_flushes_pertxg = zfs_min_metaslabs_to_flush;
/*
* For our estimations we only look as far in the future
* then keep skipping TXGs accumulating more blocks
* based on the incoming rate until we exceed it.
*/
- if (available_blocks >= 0) {
- uint64_t skip_txgs = (available_blocks / incoming) + 1;
+ if (available_blocks >= 0 && available_txgs >= 0) {
+ uint64_t skip_txgs = MIN(available_txgs + 1,
+ (available_blocks / incoming) + 1);
available_blocks -= (skip_txgs * incoming);
+ available_txgs -= skip_txgs;
txgs_in_future += skip_txgs;
ASSERT3S(available_blocks, >=, -incoming);
+ ASSERT3S(available_txgs, >=, -1);
}
/*
* based on the current entry in the summary, updating
* our available_blocks.
*/
- ASSERT3S(available_blocks, <, 0);
+ ASSERT(available_blocks < 0 || available_txgs < 0);
available_blocks += e->lse_blkcount;
- total_flushes += e->lse_mscount;
+ available_txgs += e->lse_txgcount;
+ total_flushes += e->lse_msdcount;
/*
* Keep the running maximum of the total_flushes that
*/
max_flushes_pertxg = MAX(max_flushes_pertxg,
DIV_ROUND_UP(total_flushes, txgs_in_future));
- ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
- max_flushes_pertxg);
}
return (max_flushes_pertxg);
}
uint64_t want_to_flush;
if (spa_flush_all_logs_requested(spa)) {
ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
- want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed);
+ want_to_flush = UINT64_MAX;
} else {
want_to_flush = spa_estimate_metaslabs_to_flush(spa);
}
- ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
- want_to_flush);
-
/* Used purely for verification purposes */
uint64_t visited = 0;
if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
break;
- mutex_enter(&curr->ms_sync_lock);
- mutex_enter(&curr->ms_lock);
- boolean_t flushed = metaslab_flush(curr, tx);
- mutex_exit(&curr->ms_lock);
- mutex_exit(&curr->ms_sync_lock);
-
- /*
- * If we failed to flush a metaslab (because it was loading),
- * then we are done with the block heuristic as it's not
- * possible to destroy any log space maps once you've skipped
- * a metaslab. In that case we just set our counter to 0 but
- * we continue looping in case there is still memory pressure
- * due to unflushed changes. Note that, flushing a metaslab
- * that is not the oldest flushed in the pool, will never
- * destroy any log space maps [see spa_cleanup_old_sm_logs()].
- */
- if (!flushed) {
- want_to_flush = 0;
- } else if (want_to_flush > 0) {
- want_to_flush--;
- }
+ if (metaslab_unflushed_dirty(curr)) {
+ mutex_enter(&curr->ms_sync_lock);
+ mutex_enter(&curr->ms_lock);
+ metaslab_flush(curr, tx);
+ mutex_exit(&curr->ms_lock);
+ mutex_exit(&curr->ms_sync_lock);
+ if (want_to_flush > 0)
+ want_to_flush--;
+ } else
+ metaslab_unflushed_bump(curr, tx, B_FALSE);
visited++;
}
ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
+
+ spa_log_sm_set_blocklimit(spa);
}
/*
avl_remove(&spa->spa_sm_logs_by_txg, sls);
space_map_free_obj(mos, sls->sls_sm_obj, tx);
VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx));
+ spa_log_summary_decrement_blkcount(spa, sls->sls_nblocks);
spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks;
kmem_free(sls, sizeof (spa_log_sm_t));
}
VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj,
0, UINT64_MAX, SPA_MINBLOCKSHIFT));
- /*
- * If the log space map feature was just enabled, the blocklimit
- * has not yet been set.
- */
- if (spa_log_sm_blocklimit(spa) == 0)
- spa_log_sm_set_blocklimit(spa);
+ spa_log_sm_set_blocklimit(spa);
}
/*
panic("invalid maptype_t");
break;
}
+ if (!metaslab_unflushed_dirty(ms)) {
+ metaslab_set_unflushed_dirty(ms, B_TRUE);
+ spa_log_summary_dirty_flushed_metaslab(spa,
+ metaslab_unflushed_txg(ms));
+ }
return (0);
}
static int
spa_ld_log_sm_data(spa_t *spa)
{
+ spa_log_sm_t *sls, *psls;
int error = 0;
/*
ASSERT0(spa->spa_unflushed_stats.sus_memused);
hrtime_t read_logs_starttime = gethrtime();
- /* this is a no-op when we don't have space map logs */
- for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
- sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
- space_map_t *sm = NULL;
- error = space_map_open(&sm, spa_meta_objset(spa),
- sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT);
- if (error != 0) {
- spa_load_failed(spa, "spa_ld_log_sm_data(): failed at "
- "space_map_open(obj=%llu) [error %d]",
- (u_longlong_t)sls->sls_sm_obj, error);
- goto out;
+
+ /* Prefetch log spacemaps dnodes. */
+ for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls;
+ sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+ dmu_prefetch(spa_meta_objset(spa), sls->sls_sm_obj,
+ 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+ }
+
+ uint_t pn = 0;
+ uint64_t ps = 0;
+ psls = sls = avl_first(&spa->spa_sm_logs_by_txg);
+ while (sls != NULL) {
+ /* Prefetch log spacemaps up to 16 TXGs or MBs ahead. */
+ if (psls != NULL && pn < 16 &&
+ (pn < 2 || ps < 2 * dmu_prefetch_max)) {
+ error = space_map_open(&psls->sls_sm,
+ spa_meta_objset(spa), psls->sls_sm_obj, 0,
+ UINT64_MAX, SPA_MINBLOCKSHIFT);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_ld_log_sm_data(): "
+ "failed at space_map_open(obj=%llu) "
+ "[error %d]",
+ (u_longlong_t)sls->sls_sm_obj, error);
+ goto out;
+ }
+ dmu_prefetch(spa_meta_objset(spa), psls->sls_sm_obj,
+ 0, 0, space_map_length(psls->sls_sm),
+ ZIO_PRIORITY_ASYNC_READ);
+ pn++;
+ ps += space_map_length(psls->sls_sm);
+ psls = AVL_NEXT(&spa->spa_sm_logs_by_txg, psls);
+ continue;
}
+ /* Load TXG log spacemap into ms_unflushed_allocs/frees. */
+ cond_resched();
+ ASSERT0(sls->sls_nblocks);
+ sls->sls_nblocks = space_map_nblocks(sls->sls_sm);
+ spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
+ summary_add_data(spa, sls->sls_txg,
+ sls->sls_mscount, 0, sls->sls_nblocks);
+
struct spa_ld_log_sm_arg vla = {
.slls_spa = spa,
.slls_txg = sls->sls_txg
};
- error = space_map_iterate(sm, space_map_length(sm),
- spa_ld_log_sm_cb, &vla);
+ error = space_map_iterate(sls->sls_sm,
+ space_map_length(sls->sls_sm), spa_ld_log_sm_cb, &vla);
if (error != 0) {
- space_map_close(sm);
spa_load_failed(spa, "spa_ld_log_sm_data(): failed "
"at space_map_iterate(obj=%llu) [error %d]",
(u_longlong_t)sls->sls_sm_obj, error);
goto out;
}
- ASSERT0(sls->sls_nblocks);
- sls->sls_nblocks = space_map_nblocks(sm);
- spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
- summary_add_data(spa, sls->sls_txg,
- sls->sls_mscount, sls->sls_nblocks);
+ pn--;
+ ps -= space_map_length(sls->sls_sm);
+ space_map_close(sls->sls_sm);
+ sls->sls_sm = NULL;
+ sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls);
- space_map_close(sm);
+ /* Update log block limits considering just loaded. */
+ spa_log_sm_set_blocklimit(spa);
}
+
hrtime_t read_logs_endtime = gethrtime();
spa_load_note(spa,
"read %llu log space maps (%llu total blocks - blksz = %llu bytes) "
(longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000));
out:
+ if (error != 0) {
+ for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
+ sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+ if (sls->sls_sm) {
+ space_map_close(sls->sls_sm);
+ sls->sls_sm = NULL;
+ }
+ }
+ } else {
+ ASSERT0(pn);
+ ASSERT0(ps);
+ }
/*
* Now that the metaslabs contain their unflushed changes:
* [1] recalculate their actual allocated space
}
ms->ms_unflushed_txg = entry.msp_unflushed_txg;
+ ms->ms_unflushed_dirty = B_FALSE;
+ ASSERT(range_tree_is_empty(ms->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(ms->ms_unflushed_frees));
if (ms->ms_unflushed_txg != 0) {
mutex_enter(&spa->spa_flushed_ms_lock);
avl_add(&spa->spa_metaslabs_by_flushed, ms);
"Lower-bound limit for the maximum amount of blocks allowed in "
"log spacemap (see zfs_unflushed_log_block_max)");
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, ULONG, ZMOD_RW,
+ "Hard limit (upper-bound) in the size of the space map log "
+ "in terms of dirty TXGs.");
+
ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW,
"Tunable used to determine the number of blocks that can be used for "
"the spacemap log, expressed as a percentage of the total number of "