#include <sys/spa_impl.h>
#include <sys/zfeature.h>
#include <sys/vdev_indirect_mapping.h>
+#include <sys/zap.h>
#define WITH_DF_BLOCK_ALLOCATOR
*/
unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
+/*
+ * Since we can touch multiple metaslabs (and their respective space maps)
+ * with each transaction group, we benefit from having a smaller space map
+ * block size since it allows us to issue more I/O operations scattered
+ * around the disk.
+ */
+int zfs_metaslab_sm_blksz = (1 << 12);
+
/*
* The in-core space map representation is more compact than its on-disk form.
* The zfs_condense_pct determines how much more compact the in-core
static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
-static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
+static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
#ifdef _METASLAB_TRACING
*/
for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
allocated +=
- range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
+ range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
}
- msp_free_space = range_tree_space(msp->ms_tree) + allocated +
- msp->ms_deferspace + range_tree_space(msp->ms_freedtree);
+ msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
+ msp->ms_deferspace + range_tree_space(msp->ms_freed);
VERIFY3U(sm_free_space, ==, msp_free_space);
}
uint64_t
metaslab_block_maxsize(metaslab_t *msp)
{
- avl_tree_t *t = &msp->ms_size_tree;
+ avl_tree_t *t = &msp->ms_allocatable_by_size;
range_seg_t *rs;
if (t == NULL || (rs = avl_last(t)) == NULL)
*/
uint64_t align = size & -size;
uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
- avl_tree_t *t = &msp->ms_tree->rt_root;
+ avl_tree_t *t = &msp->ms_allocatable->rt_root;
return (metaslab_block_picker(t, cursor, size, align));
}
*/
uint64_t align = size & -size;
uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
- range_tree_t *rt = msp->ms_tree;
+ range_tree_t *rt = msp->ms_allocatable;
avl_tree_t *t = &rt->rt_root;
uint64_t max_size = metaslab_block_maxsize(msp);
int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
+ ASSERT3U(avl_numnodes(t), ==,
+ avl_numnodes(&msp->ms_allocatable_by_size));
if (max_size < size)
return (-1ULL);
*/
if (max_size < metaslab_df_alloc_threshold ||
free_pct < metaslab_df_free_pct) {
- t = &msp->ms_size_tree;
+ t = &msp->ms_allocatable_by_size;
*cursor = 0;
}
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
- range_tree_t *rt = msp->ms_tree;
- avl_tree_t *t = &msp->ms_size_tree;
+ range_tree_t *rt = msp->ms_allocatable;
+ avl_tree_t *t = &msp->ms_allocatable_by_size;
uint64_t *cursor = &msp->ms_lbas[0];
uint64_t *cursor_end = &msp->ms_lbas[1];
uint64_t offset = 0;
if ((*cursor + size) > *cursor_end) {
range_seg_t *rs;
- rs = avl_last(&msp->ms_size_tree);
+ rs = avl_last(&msp->ms_allocatable_by_size);
if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
return (-1ULL);
static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
- avl_tree_t *t = &msp->ms_tree->rt_root;
+ avl_tree_t *t = &msp->ms_allocatable->rt_root;
avl_index_t where;
range_seg_t *rs, rsearch;
uint64_t hbit = highbit64(size);
uint64_t max_size = metaslab_block_maxsize(msp);
ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
+ ASSERT3U(avl_numnodes(t), ==,
+ avl_numnodes(&msp->ms_allocatable_by_size));
if (max_size < size)
return (-1ULL);
rs = avl_find(t, &rsearch, &where);
if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
- t = &msp->ms_size_tree;
+ t = &msp->ms_allocatable_by_size;
rsearch.rs_start = 0;
rsearch.rs_end = MIN(max_size,
/*
* If the space map has not been allocated yet, then treat
- * all the space in the metaslab as free and add it to the
- * ms_tree.
+ * all the space in the metaslab as free and add it to ms_allocatable.
*/
- if (msp->ms_sm != NULL)
- error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
- else
- range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
+ if (msp->ms_sm != NULL) {
+ error = space_map_load(msp->ms_sm, msp->ms_allocatable,
+ SM_FREE);
+ } else {
+ range_tree_add(msp->ms_allocatable,
+ msp->ms_start, msp->ms_size);
+ }
success = (error == 0);
ASSERT3P(msp->ms_group, !=, NULL);
msp->ms_loaded = B_TRUE;
- for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- range_tree_walk(msp->ms_defertree[t],
- range_tree_remove, msp->ms_tree);
+ /*
+ * If the metaslab already has a spacemap, then we need to
+ * remove all segments from the defer tree; otherwise, the
+ * metaslab is completely empty and we can skip this.
+ */
+ if (msp->ms_sm != NULL) {
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defer[t],
+ range_tree_remove, msp->ms_allocatable);
+ }
}
msp->ms_max_size = metaslab_block_maxsize(msp);
}
metaslab_unload(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
- range_tree_vacate(msp->ms_tree, NULL, NULL);
+ range_tree_vacate(msp->ms_allocatable, NULL, NULL);
msp->ms_loaded = B_FALSE;
msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
msp->ms_max_size = 0;
* addition of new space; and for debugging, it ensures that we'd
* data fault on any attempt to use this metaslab before it's ready.
*/
- ms->ms_tree = range_tree_create_impl(&rt_avl_ops, &ms->ms_size_tree,
- metaslab_rangesize_compare, 0);
+ ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops,
+ &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0);
metaslab_group_add(mg, ms);
metaslab_set_fragmentation(ms);
space_map_close(msp->ms_sm);
metaslab_unload(msp);
- range_tree_destroy(msp->ms_tree);
- range_tree_destroy(msp->ms_freeingtree);
- range_tree_destroy(msp->ms_freedtree);
+ range_tree_destroy(msp->ms_allocatable);
+ range_tree_destroy(msp->ms_freeing);
+ range_tree_destroy(msp->ms_freed);
for (int t = 0; t < TXG_SIZE; t++) {
- range_tree_destroy(msp->ms_alloctree[t]);
+ range_tree_destroy(msp->ms_allocating[t]);
}
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- range_tree_destroy(msp->ms_defertree[t]);
+ range_tree_destroy(msp->ms_defer[t]);
}
-
ASSERT0(msp->ms_deferspace);
+ range_tree_destroy(msp->ms_checkpointing);
+
mutex_exit(&msp->ms_lock);
cv_destroy(&msp->ms_load_cv);
mutex_destroy(&msp->ms_lock);
int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
segments <<= 1;
- segments += msp->ms_tree->rt_histogram[i];
+ segments += msp->ms_allocatable->rt_histogram[i];
/*
* The range tree provides more precision than the space map
*/
ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
size >= SPA_MINBLOCKSIZE ||
- range_tree_space(msp->ms_tree) == 0);
+ range_tree_space(msp->ms_allocatable) == 0);
ASSERT0(weight & METASLAB_ACTIVE_MASK);
msp->ms_activation_weight = 0;
range_seg_t *rs;
uint64_t size, entries, segsz, object_size, optimal_size, record_size;
dmu_object_info_t doi;
- uint64_t vdev_blocksize = 1ULL << msp->ms_group->mg_vd->vdev_ashift;
+ vdev_t *vd = msp->ms_group->mg_vd;
+ uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
+ uint64_t current_txg = spa_syncing_txg(vd->vdev_spa);
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loaded);
/*
- * Use the ms_size_tree range tree, which is ordered by size, to
- * obtain the largest segment in the free tree. We always condense
- * metaslabs that are empty and metaslabs for which a condense
- * request has been made.
+ * Allocations and frees in early passes are generally more space
+ * efficient (in terms of blocks described in space map entries)
+ * than the ones in later passes (e.g. we don't compress after
+ * sync pass 5) and condensing a metaslab multiple times in a txg
+ * could degrade performance.
+ *
+ * Thus we prefer condensing each metaslab at most once every txg at
+ * the earliest sync pass possible. If a metaslab is eligible for
+ * condensing again after being considered for condensing within the
+ * same txg, it will hopefully be dirty in the next txg where it will
+ * be condensed at an earlier pass.
+ */
+ if (msp->ms_condense_checked_txg == current_txg)
+ return (B_FALSE);
+ msp->ms_condense_checked_txg = current_txg;
+
+ /*
+ * Use the ms_allocatable_by_size range tree, which is ordered by
+ * size, to obtain the largest segment in the free tree. We always
+ * condense metaslabs that are empty and metaslabs for which a
+ * condense request has been made.
*/
- rs = avl_last(&msp->ms_size_tree);
+ rs = avl_last(&msp->ms_allocatable_by_size);
if (rs == NULL || msp->ms_condense_wanted)
return (B_TRUE);
entries = size / (MIN(size, SM_RUN_MAX));
segsz = entries * sizeof (uint64_t);
- optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
+ optimal_size =
+ sizeof (uint64_t) * avl_numnodes(&msp->ms_allocatable->rt_root);
object_size = space_map_length(msp->ms_sm);
dmu_object_info_from_db(sm->sm_dbuf, &doi);
space_map_t *sm = msp->ms_sm;
ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT3U(spa_sync_pass(msp->ms_group->mg_vd->vdev_spa), ==, 1);
ASSERT(msp->ms_loaded);
"spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
msp->ms_group->mg_vd->vdev_spa->spa_name,
- space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
+ space_map_length(msp->ms_sm),
+ avl_numnodes(&msp->ms_allocatable->rt_root),
msp->ms_condense_wanted ? "TRUE" : "FALSE");
msp->ms_condense_wanted = B_FALSE;
condense_tree = range_tree_create(NULL, NULL);
range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
- /*
- * Remove what's been freed in this txg from the condense_tree.
- * Since we're in sync_pass 1, we know that all the frees from
- * this txg are in the freeingtree.
- */
- range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree);
+ range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree);
+ range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree);
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- range_tree_walk(msp->ms_defertree[t],
+ range_tree_walk(msp->ms_defer[t],
range_tree_remove, condense_tree);
}
for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
- range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
+ range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
range_tree_remove, condense_tree);
}
* metaslab's ms_condensing flag to ensure that
* allocations on this metaslab do not occur while we're
* in the middle of committing it to disk. This is only critical
- * for the ms_tree as all other range trees use per txg
+ * for ms_allocatable as all other range trees use per txg
* views of their content.
*/
msp->ms_condensing = B_TRUE;
mutex_exit(&msp->ms_lock);
- space_map_truncate(sm, tx);
+ space_map_truncate(sm, zfs_metaslab_sm_blksz, tx);
/*
* While we would ideally like to create a space map representation
range_tree_vacate(condense_tree, NULL, NULL);
range_tree_destroy(condense_tree);
- space_map_write(sm, msp->ms_tree, SM_FREE, tx);
+ space_map_write(sm, msp->ms_allocatable, SM_FREE, tx);
mutex_enter(&msp->ms_lock);
msp->ms_condensing = B_FALSE;
}
vdev_t *vd = mg->mg_vd;
spa_t *spa = vd->vdev_spa;
objset_t *mos = spa_meta_objset(spa);
- range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
+ range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
dmu_tx_t *tx;
uint64_t object = space_map_object(msp->ms_sm);
/*
* This metaslab has just been added so there's no work to do now.
*/
- if (msp->ms_freeingtree == NULL) {
+ if (msp->ms_freeing == NULL) {
ASSERT3P(alloctree, ==, NULL);
return;
}
ASSERT3P(alloctree, !=, NULL);
- ASSERT3P(msp->ms_freeingtree, !=, NULL);
- ASSERT3P(msp->ms_freedtree, !=, NULL);
+ ASSERT3P(msp->ms_freeing, !=, NULL);
+ ASSERT3P(msp->ms_freed, !=, NULL);
+ ASSERT3P(msp->ms_checkpointing, !=, NULL);
/*
- * Normally, we don't want to process a metaslab if there
- * are no allocations or frees to perform. However, if the metaslab
- * is being forced to condense and it's loaded, we need to let it
- * through.
+ * Normally, we don't want to process a metaslab if there are no
+ * allocations or frees to perform. However, if the metaslab is being
+ * forced to condense and it's loaded, we need to let it through.
*/
- if (range_tree_space(alloctree) == 0 &&
- range_tree_space(msp->ms_freeingtree) == 0 &&
+ if (range_tree_is_empty(alloctree) &&
+ range_tree_is_empty(msp->ms_freeing) &&
+ range_tree_is_empty(msp->ms_checkpointing) &&
!(msp->ms_loaded && msp->ms_condense_wanted))
return;
/*
* The only state that can actually be changing concurrently with
- * metaslab_sync() is the metaslab's ms_tree. No other thread can
- * be modifying this txg's alloctree, freeingtree, freedtree, or
- * space_map_phys_t. We drop ms_lock whenever we could call
- * into the DMU, because the DMU can call down to us
+ * metaslab_sync() is the metaslab's ms_allocatable. No other
+ * thread can be modifying this txg's alloc, freeing,
+ * freed, or space_map_phys_t. We drop ms_lock whenever we
+ * could call into the DMU, because the DMU can call down to us
* (e.g. via zio_free()) at any time.
*
* The spa_vdev_remove_thread() can be reading metaslab state
* that the ms_lock is insufficient for this, because it is dropped
* by space_map_write().
*/
-
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
if (msp->ms_sm == NULL) {
uint64_t new_object;
- new_object = space_map_alloc(mos, tx);
+ new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx);
VERIFY3U(new_object, !=, 0);
VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
ASSERT(msp->ms_sm != NULL);
}
+ if (!range_tree_is_empty(msp->ms_checkpointing) &&
+ vd->vdev_checkpoint_sm == NULL) {
+ ASSERT(spa_has_checkpoint(spa));
+
+ uint64_t new_object = space_map_alloc(mos,
+ vdev_standard_sm_blksz, tx);
+ VERIFY3U(new_object, !=, 0);
+
+ VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
+ mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
+ ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+
+ /*
+ * We save the space map object as an entry in vdev_top_zap
+ * so it can be retrieved when the pool is reopened after an
+ * export or through zdb.
+ */
+ VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+ sizeof (new_object), 1, &new_object, tx));
+ }
+
mutex_enter(&msp->ms_sync_lock);
mutex_enter(&msp->ms_lock);
metaslab_class_histogram_verify(mg->mg_class);
metaslab_group_histogram_remove(mg, msp);
- if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
- metaslab_should_condense(msp)) {
+ if (msp->ms_loaded && metaslab_should_condense(msp)) {
metaslab_condense(msp, txg, tx);
} else {
mutex_exit(&msp->ms_lock);
space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
- space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx);
+ space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, tx);
mutex_enter(&msp->ms_lock);
}
+ if (!range_tree_is_empty(msp->ms_checkpointing)) {
+ ASSERT(spa_has_checkpoint(spa));
+ ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+
+ /*
+ * Since we are doing writes to disk and the ms_checkpointing
+ * tree won't be changing during that time, we drop the
+ * ms_lock while writing to the checkpoint space map.
+ */
+ mutex_exit(&msp->ms_lock);
+ space_map_write(vd->vdev_checkpoint_sm,
+ msp->ms_checkpointing, SM_FREE, tx);
+ mutex_enter(&msp->ms_lock);
+ space_map_update(vd->vdev_checkpoint_sm);
+
+ spa->spa_checkpoint_info.sci_dspace +=
+ range_tree_space(msp->ms_checkpointing);
+ vd->vdev_stat.vs_checkpoint_space +=
+ range_tree_space(msp->ms_checkpointing);
+ ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
+ -vd->vdev_checkpoint_sm->sm_alloc);
+
+ range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
+ }
+
if (msp->ms_loaded) {
/*
* When the space map is loaded, we have an accurate
* it first before updating it.
*/
space_map_histogram_clear(msp->ms_sm);
- space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
+ space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
/*
* Since we've cleared the histogram we need to add back
* to accurately reflect all free space even if some space
* is not yet available for allocation (i.e. deferred).
*/
- space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx);
+ space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
/*
* Add back any deferred free space that has not been
*/
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
space_map_histogram_add(msp->ms_sm,
- msp->ms_defertree[t], tx);
+ msp->ms_defer[t], tx);
}
}
* then we will lose some accuracy but will correct it the next
* time we load the space map.
*/
- space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx);
+ space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
metaslab_group_histogram_add(mg, msp);
metaslab_group_histogram_verify(mg);
/*
* For sync pass 1, we avoid traversing this txg's free range tree
- * and instead will just swap the pointers for freeingtree and
- * freedtree. We can safely do this since the freed_tree is
+ * and instead will just swap the pointers for freeing and
+ * freed. We can safely do this since the freed_tree is
* guaranteed to be empty on the initial pass.
*/
if (spa_sync_pass(spa) == 1) {
- range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree);
+ range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
} else {
- range_tree_vacate(msp->ms_freeingtree,
- range_tree_add, msp->ms_freedtree);
+ range_tree_vacate(msp->ms_freeing,
+ range_tree_add, msp->ms_freed);
}
range_tree_vacate(alloctree, NULL, NULL);
- ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
- ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
- ASSERT0(range_tree_space(msp->ms_freeingtree));
+ ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
+ & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_freeing));
+ ASSERT0(range_tree_space(msp->ms_checkpointing));
mutex_exit(&msp->ms_lock);
* If this metaslab is just becoming available, initialize its
* range trees and add its capacity to the vdev.
*/
- if (msp->ms_freedtree == NULL) {
+ if (msp->ms_freed == NULL) {
for (int t = 0; t < TXG_SIZE; t++) {
- ASSERT(msp->ms_alloctree[t] == NULL);
+ ASSERT(msp->ms_allocating[t] == NULL);
- msp->ms_alloctree[t] = range_tree_create(NULL, NULL);
+ msp->ms_allocating[t] = range_tree_create(NULL, NULL);
}
- ASSERT3P(msp->ms_freeingtree, ==, NULL);
- msp->ms_freeingtree = range_tree_create(NULL, NULL);
+ ASSERT3P(msp->ms_freeing, ==, NULL);
+ msp->ms_freeing = range_tree_create(NULL, NULL);
- ASSERT3P(msp->ms_freedtree, ==, NULL);
- msp->ms_freedtree = range_tree_create(NULL, NULL);
+ ASSERT3P(msp->ms_freed, ==, NULL);
+ msp->ms_freed = range_tree_create(NULL, NULL);
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- ASSERT(msp->ms_defertree[t] == NULL);
+ ASSERT(msp->ms_defer[t] == NULL);
- msp->ms_defertree[t] = range_tree_create(NULL, NULL);
+ msp->ms_defer[t] = range_tree_create(NULL, NULL);
}
+ ASSERT3P(msp->ms_checkpointing, ==, NULL);
+ msp->ms_checkpointing = range_tree_create(NULL, NULL);
+
vdev_space_update(vd, 0, 0, msp->ms_size);
}
+ ASSERT0(range_tree_space(msp->ms_freeing));
+ ASSERT0(range_tree_space(msp->ms_checkpointing));
- defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
+ defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
metaslab_class_get_alloc(spa_normal_class(spa));
defer_delta = 0;
alloc_delta = space_map_alloc_delta(msp->ms_sm);
if (defer_allowed) {
- defer_delta = range_tree_space(msp->ms_freedtree) -
+ defer_delta = range_tree_space(msp->ms_freed) -
range_tree_space(*defer_tree);
} else {
defer_delta -= range_tree_space(*defer_tree);
/*
* Move the frees from the defer_tree back to the free
- * range tree (if it's loaded). Swap the freed_tree and the
- * defer_tree -- this is safe to do because we've just emptied out
- * the defer_tree.
+ * range tree (if it's loaded). Swap the freed_tree and
+ * the defer_tree -- this is safe to do because we've
+ * just emptied out the defer_tree.
*/
range_tree_vacate(*defer_tree,
- msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
+ msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
if (defer_allowed) {
- range_tree_swap(&msp->ms_freedtree, defer_tree);
+ range_tree_swap(&msp->ms_freed, defer_tree);
} else {
- range_tree_vacate(msp->ms_freedtree,
- msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
+ range_tree_vacate(msp->ms_freed,
+ msp->ms_loaded ? range_tree_add : NULL,
+ msp->ms_allocatable);
}
-
space_map_update(msp->ms_sm);
msp->ms_deferspace += defer_delta;
for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
VERIFY0(range_tree_space(
- msp->ms_alloctree[(txg + t) & TXG_MASK]));
+ msp->ms_allocating[(txg + t) & TXG_MASK]));
}
if (!metaslab_debug_unload)
metaslab_unload(msp);
}
- ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
- ASSERT0(range_tree_space(msp->ms_freeingtree));
- ASSERT0(range_tree_space(msp->ms_freedtree));
+ ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_freeing));
+ ASSERT0(range_tree_space(msp->ms_freed));
+ ASSERT0(range_tree_space(msp->ms_checkpointing));
mutex_exit(&msp->ms_lock);
}
metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
{
uint64_t start;
- range_tree_t *rt = msp->ms_tree;
+ range_tree_t *rt = msp->ms_allocatable;
metaslab_class_t *mc = msp->ms_group->mg_class;
VERIFY(!msp->ms_condensing);
VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
range_tree_remove(rt, start, size);
- if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
+ if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
- range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
+ range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
/* Track the last successful allocation */
msp->ms_alloc_txg = txg;
void
metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
- uint64_t txg)
+ boolean_t checkpoint)
{
metaslab_t *msp;
- ASSERTV(spa_t *spa = vd->vdev_spa);
+ spa_t *spa = vd->vdev_spa;
- ASSERT3U(txg, ==, spa->spa_syncing_txg);
ASSERT(vdev_is_concrete(vd));
ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
metaslab_check_free_impl(vd, offset, asize);
+
mutex_enter(&msp->ms_lock);
- if (range_tree_space(msp->ms_freeingtree) == 0) {
- vdev_dirty(vd, VDD_METASLAB, msp, txg);
+ if (range_tree_is_empty(msp->ms_freeing) &&
+ range_tree_is_empty(msp->ms_checkpointing)) {
+ vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
+ }
+
+ if (checkpoint) {
+ ASSERT(spa_has_checkpoint(spa));
+ range_tree_add(msp->ms_checkpointing, offset, asize);
+ } else {
+ range_tree_add(msp->ms_freeing, offset, asize);
}
- range_tree_add(msp->ms_freeingtree, offset, asize);
mutex_exit(&msp->ms_lock);
}
metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
uint64_t size, void *arg)
{
- uint64_t *txgp = arg;
+ boolean_t *checkpoint = arg;
+
+ ASSERT3P(checkpoint, !=, NULL);
if (vd->vdev_ops->vdev_op_remap != NULL)
- vdev_indirect_mark_obsolete(vd, offset, size, *txgp);
+ vdev_indirect_mark_obsolete(vd, offset, size);
else
- metaslab_free_impl(vd, offset, size, *txgp);
+ metaslab_free_impl(vd, offset, size, *checkpoint);
}
static void
metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
- uint64_t txg)
+ boolean_t checkpoint)
{
spa_t *spa = vd->vdev_spa;
ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
- if (txg > spa_freeze_txg(spa))
+ if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
return;
if (spa->spa_vdev_removal != NULL &&
* an indirect vdev (in open context), and then (in syncing
* context) clear spa_vdev_removal.
*/
- free_from_removing_vdev(vd, offset, size, txg);
+ free_from_removing_vdev(vd, offset, size);
} else if (vd->vdev_ops->vdev_op_remap != NULL) {
- vdev_indirect_mark_obsolete(vd, offset, size, txg);
+ vdev_indirect_mark_obsolete(vd, offset, size);
vd->vdev_ops->vdev_op_remap(vd, offset, size,
- metaslab_free_impl_cb, &txg);
+ metaslab_free_impl_cb, &checkpoint);
} else {
- metaslab_free_concrete(vd, offset, size, txg);
+ metaslab_free_concrete(vd, offset, size, checkpoint);
}
}
msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
mutex_enter(&msp->ms_lock);
- range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
+ range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
offset, size);
VERIFY(!msp->ms_condensing);
VERIFY3U(offset, >=, msp->ms_start);
VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
- VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
+ VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
msp->ms_size);
VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
- range_tree_add(msp->ms_tree, offset, size);
+ range_tree_add(msp->ms_allocatable, offset, size);
mutex_exit(&msp->ms_lock);
}
/*
- * Free the block represented by DVA in the context of the specified
- * transaction group.
+ * Free the block represented by the given DVA.
*/
void
-metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
{
uint64_t vdev = DVA_GET_VDEV(dva);
uint64_t offset = DVA_GET_OFFSET(dva);
size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
}
- metaslab_free_impl(vd, offset, size, txg);
+ metaslab_free_impl(vd, offset, size, checkpoint);
}
/*
if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
- if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
+ if (error == 0 &&
+ !range_tree_contains(msp->ms_allocatable, offset, size))
error = SET_ERROR(ENOENT);
if (error || txg == 0) { /* txg == 0 indicates dry run */
VERIFY(!msp->ms_condensing);
VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
- VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
- range_tree_remove(msp->ms_tree, offset, size);
+ VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
+ msp->ms_size);
+ range_tree_remove(msp->ms_allocatable, offset, size);
if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
- if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
+ if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
vdev_dirty(vd, VDD_METASLAB, msp, txg);
- range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
+ range_tree_add(msp->ms_allocating[txg & TXG_MASK],
+ offset, size);
}
mutex_exit(&msp->ms_lock);
ASSERT(!BP_IS_HOLE(bp));
ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
+ /*
+ * If we have a checkpoint for the pool we need to make sure that
+ * the blocks that we free that are part of the checkpoint won't be
+ * reused until the checkpoint is discarded or we revert to it.
+ *
+ * The checkpoint flag is passed down the metaslab_free code path
+ * and is set whenever we want to add a block to the checkpoint's
+ * accounting. That is, we "checkpoint" blocks that existed at the
+ * time the checkpoint was created and are therefore referenced by
+ * the checkpointed uberblock.
+ *
+ * Note that, we don't checkpoint any blocks if the current
+ * syncing txg <= spa_checkpoint_txg. We want these frees to sync
+ * normally as they will be referenced by the checkpointed uberblock.
+ */
+ boolean_t checkpoint = B_FALSE;
+ if (bp->blk_birth <= spa->spa_checkpoint_txg &&
+ spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
+ /*
+ * At this point, if the block is part of the checkpoint
+ * there is no way it was created in the current txg.
+ */
+ ASSERT(!now);
+ ASSERT3U(spa_syncing_txg(spa), ==, txg);
+ checkpoint = B_TRUE;
+ }
+
spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
for (int d = 0; d < ndvas; d++) {
if (now) {
metaslab_unalloc_dva(spa, &dva[d], txg);
} else {
- metaslab_free_dva(spa, &dva[d], txg);
+ ASSERT3U(txg, ==, spa_syncing_txg(spa));
+ metaslab_free_dva(spa, &dva[d], checkpoint);
}
}
mutex_enter(&msp->ms_lock);
if (msp->ms_loaded)
- range_tree_verify(msp->ms_tree, offset, size);
+ range_tree_verify(msp->ms_allocatable, offset, size);
- range_tree_verify(msp->ms_freeingtree, offset, size);
- range_tree_verify(msp->ms_freedtree, offset, size);
+ range_tree_verify(msp->ms_freeing, offset, size);
+ range_tree_verify(msp->ms_checkpointing, offset, size);
+ range_tree_verify(msp->ms_freed, offset, size);
for (int j = 0; j < TXG_DEFER_SIZE; j++)
- range_tree_verify(msp->ms_defertree[j], offset, size);
+ range_tree_verify(msp->ms_defer[j], offset, size);
mutex_exit(&msp->ms_lock);
}