OpenZFS 9166 - zfs storage pool checkpoint
[mirror_zfs.git] / module / zfs / metaslab.c
index c11e459e0f0bfd9290359ec9ac61270ece95cef0..76fa99e8b1f664f592dccb6687ee9ab50d9e4c32 100644
@@ -34,6 +34,7 @@
 #include <sys/spa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/vdev_indirect_mapping.h>
+#include <sys/zap.h>
 
 #define        WITH_DF_BLOCK_ALLOCATOR
 
@@ -53,6 +54,14 @@ unsigned long metaslab_aliquot = 512 << 10;
  */
 unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
 
+/*
+ * Since we can touch multiple metaslabs (and their respective space maps)
+ * with each transaction group, we benefit from having a smaller space map
+ * block size since it allows us to issue more I/O operations scattered
+ * around the disk.
+ */
+int zfs_metaslab_sm_blksz = (1 << 12);
+
 /*
  * The in-core space map representation is more compact than its on-disk form.
  * The zfs_condense_pct determines how much more compact the in-core
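As a rough standalone illustration of the block-size rationale above (not part of the commit; the 8-byte entry size comes from the sizeof (uint64_t) sizing used elsewhere in this file), a 4K space map block holds 512 one-word entries, so a metaslab that dirties only a handful of entries in a txg rewrites one small block instead of a large one:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint64_t sm_blksz = 1ULL << 12;		/* zfs_metaslab_sm_blksz above */
	const uint64_t entry_size = sizeof (uint64_t);	/* one space map entry */

	/* 4096 / 8 = 512 entries per space map block. */
	printf("entries per block: %llu\n",
	    (unsigned long long)(sm_blksz / entry_size));
	return (0);
}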
@@ -211,7 +220,7 @@ uint64_t metaslab_trace_max_entries = 5000;
 
 static uint64_t metaslab_weight(metaslab_t *);
 static void metaslab_set_fragmentation(metaslab_t *);
-static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
+static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
 
 #ifdef _METASLAB_TRACING
@@ -484,11 +493,11 @@ metaslab_verify_space(metaslab_t *msp, uint64_t txg)
         */
        for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
                allocated +=
-                   range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
+                   range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
        }
 
-       msp_free_space = range_tree_space(msp->ms_tree) + allocated +
-           msp->ms_deferspace + range_tree_space(msp->ms_freedtree);
+       msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
+           msp->ms_deferspace + range_tree_space(msp->ms_freed);
 
        VERIFY3U(sm_free_space, ==, msp_free_space);
 }
@@ -1021,7 +1030,7 @@ metaslab_rangesize_compare(const void *x1, const void *x2)
 uint64_t
 metaslab_block_maxsize(metaslab_t *msp)
 {
-       avl_tree_t *t = &msp->ms_size_tree;
+       avl_tree_t *t = &msp->ms_allocatable_by_size;
        range_seg_t *rs;
 
        if (t == NULL || (rs = avl_last(t)) == NULL)
@@ -1101,7 +1110,7 @@ metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
         */
        uint64_t align = size & -size;
        uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
-       avl_tree_t *t = &msp->ms_tree->rt_root;
+       avl_tree_t *t = &msp->ms_allocatable->rt_root;
 
        return (metaslab_block_picker(t, cursor, size, align));
 }
@@ -1134,13 +1143,14 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
         */
        uint64_t align = size & -size;
        uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
-       range_tree_t *rt = msp->ms_tree;
+       range_tree_t *rt = msp->ms_allocatable;
        avl_tree_t *t = &rt->rt_root;
        uint64_t max_size = metaslab_block_maxsize(msp);
        int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 
        ASSERT(MUTEX_HELD(&msp->ms_lock));
-       ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
+       ASSERT3U(avl_numnodes(t), ==,
+           avl_numnodes(&msp->ms_allocatable_by_size));
 
        if (max_size < size)
                return (-1ULL);
@@ -1151,7 +1161,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
         */
        if (max_size < metaslab_df_alloc_threshold ||
            free_pct < metaslab_df_free_pct) {
-               t = &msp->ms_size_tree;
+               t = &msp->ms_allocatable_by_size;
                *cursor = 0;
        }
 
@@ -1178,8 +1188,8 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 static uint64_t
 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
 {
-       range_tree_t *rt = msp->ms_tree;
-       avl_tree_t *t = &msp->ms_size_tree;
+       range_tree_t *rt = msp->ms_allocatable;
+       avl_tree_t *t = &msp->ms_allocatable_by_size;
        uint64_t *cursor = &msp->ms_lbas[0];
        uint64_t *cursor_end = &msp->ms_lbas[1];
        uint64_t offset = 0;
@@ -1192,7 +1202,7 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
        if ((*cursor + size) > *cursor_end) {
                range_seg_t *rs;
 
-               rs = avl_last(&msp->ms_size_tree);
+               rs = avl_last(&msp->ms_allocatable_by_size);
                if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
                        return (-1ULL);
 
@@ -1232,7 +1242,7 @@ uint64_t metaslab_ndf_clump_shift = 4;
 static uint64_t
 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 {
-       avl_tree_t *t = &msp->ms_tree->rt_root;
+       avl_tree_t *t = &msp->ms_allocatable->rt_root;
        avl_index_t where;
        range_seg_t *rs, rsearch;
        uint64_t hbit = highbit64(size);
@@ -1240,7 +1250,8 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
        uint64_t max_size = metaslab_block_maxsize(msp);
 
        ASSERT(MUTEX_HELD(&msp->ms_lock));
-       ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
+       ASSERT3U(avl_numnodes(t), ==,
+           avl_numnodes(&msp->ms_allocatable_by_size));
 
        if (max_size < size)
                return (-1ULL);
@@ -1250,7 +1261,7 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 
        rs = avl_find(t, &rsearch, &where);
        if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
-               t = &msp->ms_size_tree;
+               t = &msp->ms_allocatable_by_size;
 
                rsearch.rs_start = 0;
                rsearch.rs_end = MIN(max_size,
@@ -1316,13 +1327,15 @@ metaslab_load(metaslab_t *msp)
 
        /*
         * If the space map has not been allocated yet, then treat
-        * all the space in the metaslab as free and add it to the
-        * ms_tree.
+        * all the space in the metaslab as free and add it to ms_allocatable.
         */
-       if (msp->ms_sm != NULL)
-               error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
-       else
-               range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
+       if (msp->ms_sm != NULL) {
+               error = space_map_load(msp->ms_sm, msp->ms_allocatable,
+                   SM_FREE);
+       } else {
+               range_tree_add(msp->ms_allocatable,
+                   msp->ms_start, msp->ms_size);
+       }
 
        success = (error == 0);
 
@@ -1333,9 +1346,16 @@ metaslab_load(metaslab_t *msp)
                ASSERT3P(msp->ms_group, !=, NULL);
                msp->ms_loaded = B_TRUE;
 
-               for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-                       range_tree_walk(msp->ms_defertree[t],
-                           range_tree_remove, msp->ms_tree);
+               /*
+                * If the metaslab already has a spacemap, then we need to
+                * remove all segments from the defer tree; otherwise, the
+                * metaslab is completely empty and we can skip this.
+                */
+               if (msp->ms_sm != NULL) {
+                       for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+                               range_tree_walk(msp->ms_defer[t],
+                                   range_tree_remove, msp->ms_allocatable);
+                       }
                }
                msp->ms_max_size = metaslab_block_maxsize(msp);
        }
@@ -1347,7 +1367,7 @@ void
 metaslab_unload(metaslab_t *msp)
 {
        ASSERT(MUTEX_HELD(&msp->ms_lock));
-       range_tree_vacate(msp->ms_tree, NULL, NULL);
+       range_tree_vacate(msp->ms_allocatable, NULL, NULL);
        msp->ms_loaded = B_FALSE;
        msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
        msp->ms_max_size = 0;
@@ -1393,8 +1413,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
         * addition of new space; and for debugging, it ensures that we'd
         * data fault on any attempt to use this metaslab before it's ready.
         */
-       ms->ms_tree = range_tree_create_impl(&rt_avl_ops, &ms->ms_size_tree,
-           metaslab_rangesize_compare, 0);
+       ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops,
+           &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0);
        metaslab_group_add(mg, ms);
 
        metaslab_set_fragmentation(ms);
@@ -1446,20 +1466,21 @@ metaslab_fini(metaslab_t *msp)
        space_map_close(msp->ms_sm);
 
        metaslab_unload(msp);
-       range_tree_destroy(msp->ms_tree);
-       range_tree_destroy(msp->ms_freeingtree);
-       range_tree_destroy(msp->ms_freedtree);
+       range_tree_destroy(msp->ms_allocatable);
+       range_tree_destroy(msp->ms_freeing);
+       range_tree_destroy(msp->ms_freed);
 
        for (int t = 0; t < TXG_SIZE; t++) {
-               range_tree_destroy(msp->ms_alloctree[t]);
+               range_tree_destroy(msp->ms_allocating[t]);
        }
 
        for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-               range_tree_destroy(msp->ms_defertree[t]);
+               range_tree_destroy(msp->ms_defer[t]);
        }
-
        ASSERT0(msp->ms_deferspace);
 
+       range_tree_destroy(msp->ms_checkpointing);
+
        mutex_exit(&msp->ms_lock);
        cv_destroy(&msp->ms_load_cv);
        mutex_destroy(&msp->ms_lock);
@@ -1679,7 +1700,7 @@ metaslab_weight_from_range_tree(metaslab_t *msp)
                int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
 
                segments <<= 1;
-               segments += msp->ms_tree->rt_histogram[i];
+               segments += msp->ms_allocatable->rt_histogram[i];
 
                /*
                 * The range tree provides more precision than the space map
@@ -1895,7 +1916,7 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight)
         */
        ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
            size >= SPA_MINBLOCKSIZE ||
-           range_tree_space(msp->ms_tree) == 0);
+           range_tree_space(msp->ms_allocatable) == 0);
        ASSERT0(weight & METASLAB_ACTIVE_MASK);
 
        msp->ms_activation_weight = 0;
@@ -2028,18 +2049,37 @@ metaslab_should_condense(metaslab_t *msp)
        range_seg_t *rs;
        uint64_t size, entries, segsz, object_size, optimal_size, record_size;
        dmu_object_info_t doi;
-       uint64_t vdev_blocksize = 1ULL << msp->ms_group->mg_vd->vdev_ashift;
+       vdev_t *vd = msp->ms_group->mg_vd;
+       uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
+       uint64_t current_txg = spa_syncing_txg(vd->vdev_spa);
 
        ASSERT(MUTEX_HELD(&msp->ms_lock));
        ASSERT(msp->ms_loaded);
 
        /*
-        * Use the ms_size_tree range tree, which is ordered by size, to
-        * obtain the largest segment in the free tree. We always condense
-        * metaslabs that are empty and metaslabs for which a condense
-        * request has been made.
+        * Allocations and frees in early passes are generally more space
+        * efficient (in terms of blocks described in space map entries)
+        * than the ones in later passes (e.g. we don't compress after
+        * sync pass 5) and condensing a metaslab multiple times in a txg
+        * could degrade performance.
+        *
+        * Thus we prefer condensing each metaslab at most once every txg at
+        * the earliest sync pass possible. If a metaslab is eligible for
+        * condensing again after being considered for condensing within the
+        * same txg, it will hopefully be dirty in the next txg where it will
+        * be condensed at an earlier pass.
+        */
+       if (msp->ms_condense_checked_txg == current_txg)
+               return (B_FALSE);
+       msp->ms_condense_checked_txg = current_txg;
+
+       /*
+        * Use the ms_allocatable_by_size range tree, which is ordered by
+        * size, to obtain the largest segment in the free tree. We always
+        * condense metaslabs that are empty and metaslabs for which a
+        * condense request has been made.
         */
-       rs = avl_last(&msp->ms_size_tree);
+       rs = avl_last(&msp->ms_allocatable_by_size);
        if (rs == NULL || msp->ms_condense_wanted)
                return (B_TRUE);
 
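A minimal standalone sketch of the once-per-txg guard introduced above; the toy_metaslab_t type and helper name are illustrative stand-ins, not the real metaslab_t or any ZFS API:

#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for metaslab_t; only the field used by the guard. */
typedef struct toy_metaslab {
	uint64_t ms_condense_checked_txg;
} toy_metaslab_t;

/*
 * Returns nonzero only for the first call in a given txg; later calls
 * in the same txg bail out, mirroring the early return above.
 */
static int
condense_check_once_per_txg(toy_metaslab_t *msp, uint64_t current_txg)
{
	if (msp->ms_condense_checked_txg == current_txg)
		return (0);
	msp->ms_condense_checked_txg = current_txg;
	return (1);
}

int
main(void)
{
	toy_metaslab_t ms = { 0 };

	printf("%d\n", condense_check_once_per_txg(&ms, 100));	/* 1: first check */
	printf("%d\n", condense_check_once_per_txg(&ms, 100));	/* 0: same txg */
	printf("%d\n", condense_check_once_per_txg(&ms, 101));	/* 1: next txg */
	return (0);
}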
@@ -2053,7 +2093,8 @@ metaslab_should_condense(metaslab_t *msp)
        entries = size / (MIN(size, SM_RUN_MAX));
        segsz = entries * sizeof (uint64_t);
 
-       optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
+       optimal_size =
+           sizeof (uint64_t) * avl_numnodes(&msp->ms_allocatable->rt_root);
        object_size = space_map_length(msp->ms_sm);
 
        dmu_object_info_from_db(sm->sm_dbuf, &doi);
@@ -2076,7 +2117,6 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
        space_map_t *sm = msp->ms_sm;
 
        ASSERT(MUTEX_HELD(&msp->ms_lock));
-       ASSERT3U(spa_sync_pass(msp->ms_group->mg_vd->vdev_spa), ==, 1);
        ASSERT(msp->ms_loaded);
 
 
@@ -2084,7 +2124,8 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
            "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
            msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
            msp->ms_group->mg_vd->vdev_spa->spa_name,
-           space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
+           space_map_length(msp->ms_sm),
+           avl_numnodes(&msp->ms_allocatable->rt_root),
            msp->ms_condense_wanted ? "TRUE" : "FALSE");
 
        msp->ms_condense_wanted = B_FALSE;
@@ -2099,20 +2140,16 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
        condense_tree = range_tree_create(NULL, NULL);
        range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
 
-       /*
-        * Remove what's been freed in this txg from the condense_tree.
-        * Since we're in sync_pass 1, we know that all the frees from
-        * this txg are in the freeingtree.
-        */
-       range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree);
+       range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree);
+       range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree);
 
        for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-               range_tree_walk(msp->ms_defertree[t],
+               range_tree_walk(msp->ms_defer[t],
                    range_tree_remove, condense_tree);
        }
 
        for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
-               range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
+               range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
                    range_tree_remove, condense_tree);
        }
 
@@ -2122,13 +2159,13 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
         * metaslab's ms_condensing flag to ensure that
         * allocations on this metaslab do not occur while we're
         * in the middle of committing it to disk. This is only critical
-        * for the ms_tree as all other range trees use per txg
+        * for ms_allocatable as all other range trees use per txg
         * views of their content.
         */
        msp->ms_condensing = B_TRUE;
 
        mutex_exit(&msp->ms_lock);
-       space_map_truncate(sm, tx);
+       space_map_truncate(sm, zfs_metaslab_sm_blksz, tx);
 
        /*
         * While we would ideally like to create a space map representation
@@ -2144,7 +2181,7 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
        range_tree_vacate(condense_tree, NULL, NULL);
        range_tree_destroy(condense_tree);
 
-       space_map_write(sm, msp->ms_tree, SM_FREE, tx);
+       space_map_write(sm, msp->ms_allocatable, SM_FREE, tx);
        mutex_enter(&msp->ms_lock);
        msp->ms_condensing = B_FALSE;
 }
@@ -2159,7 +2196,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
        vdev_t *vd = mg->mg_vd;
        spa_t *spa = vd->vdev_spa;
        objset_t *mos = spa_meta_objset(spa);
-       range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
+       range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
        dmu_tx_t *tx;
        uint64_t object = space_map_object(msp->ms_sm);
 
@@ -2168,23 +2205,24 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
        /*
         * This metaslab has just been added so there's no work to do now.
         */
-       if (msp->ms_freeingtree == NULL) {
+       if (msp->ms_freeing == NULL) {
                ASSERT3P(alloctree, ==, NULL);
                return;
        }
 
        ASSERT3P(alloctree, !=, NULL);
-       ASSERT3P(msp->ms_freeingtree, !=, NULL);
-       ASSERT3P(msp->ms_freedtree, !=, NULL);
+       ASSERT3P(msp->ms_freeing, !=, NULL);
+       ASSERT3P(msp->ms_freed, !=, NULL);
+       ASSERT3P(msp->ms_checkpointing, !=, NULL);
 
        /*
-        * Normally, we don't want to process a metaslab if there
-        * are no allocations or frees to perform. However, if the metaslab
-        * is being forced to condense and it's loaded, we need to let it
-        * through.
+        * Normally, we don't want to process a metaslab if there are no
+        * allocations or frees to perform. However, if the metaslab is being
+        * forced to condense and it's loaded, we need to let it through.
         */
-       if (range_tree_space(alloctree) == 0 &&
-           range_tree_space(msp->ms_freeingtree) == 0 &&
+       if (range_tree_is_empty(alloctree) &&
+           range_tree_is_empty(msp->ms_freeing) &&
+           range_tree_is_empty(msp->ms_checkpointing) &&
            !(msp->ms_loaded && msp->ms_condense_wanted))
                return;
 
@@ -2193,10 +2231,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 
        /*
         * The only state that can actually be changing concurrently with
-        * metaslab_sync() is the metaslab's ms_tree.  No other thread can
-        * be modifying this txg's alloctree, freeingtree, freedtree, or
-        * space_map_phys_t.  We drop ms_lock whenever we could call
-        * into the DMU, because the DMU can call down to us
+        * metaslab_sync() is the metaslab's ms_allocatable.  No other
+        * thread can be modifying this txg's alloc, freeing,
+        * freed, or space_map_phys_t.  We drop ms_lock whenever we
+        * could call into the DMU, because the DMU can call down to us
         * (e.g. via zio_free()) at any time.
         *
         * The spa_vdev_remove_thread() can be reading metaslab state
@@ -2204,13 +2242,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
         * that the ms_lock is insufficient for this, because it is dropped
         * by space_map_write().
         */
-
        tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 
        if (msp->ms_sm == NULL) {
                uint64_t new_object;
 
-               new_object = space_map_alloc(mos, tx);
+               new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx);
                VERIFY3U(new_object, !=, 0);
 
                VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
@@ -2218,6 +2255,28 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
                ASSERT(msp->ms_sm != NULL);
        }
 
+       if (!range_tree_is_empty(msp->ms_checkpointing) &&
+           vd->vdev_checkpoint_sm == NULL) {
+               ASSERT(spa_has_checkpoint(spa));
+
+               uint64_t new_object = space_map_alloc(mos,
+                   vdev_standard_sm_blksz, tx);
+               VERIFY3U(new_object, !=, 0);
+
+               VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
+                   mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
+               ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+
+               /*
+                * We save the space map object as an entry in vdev_top_zap
+                * so it can be retrieved when the pool is reopened after an
+                * export or through zdb.
+                */
+               VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
+                   vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+                   sizeof (new_object), 1, &new_object, tx));
+       }
+
        mutex_enter(&msp->ms_sync_lock);
        mutex_enter(&msp->ms_lock);
 
@@ -2230,16 +2289,40 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
        metaslab_class_histogram_verify(mg->mg_class);
        metaslab_group_histogram_remove(mg, msp);
 
-       if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
-           metaslab_should_condense(msp)) {
+       if (msp->ms_loaded && metaslab_should_condense(msp)) {
                metaslab_condense(msp, txg, tx);
        } else {
                mutex_exit(&msp->ms_lock);
                space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
-               space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx);
+               space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, tx);
                mutex_enter(&msp->ms_lock);
        }
 
+       if (!range_tree_is_empty(msp->ms_checkpointing)) {
+               ASSERT(spa_has_checkpoint(spa));
+               ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+
+               /*
+                * Since we are doing writes to disk and the ms_checkpointing
+                * tree won't be changing during that time, we drop the
+                * ms_lock while writing to the checkpoint space map.
+                */
+               mutex_exit(&msp->ms_lock);
+               space_map_write(vd->vdev_checkpoint_sm,
+                   msp->ms_checkpointing, SM_FREE, tx);
+               mutex_enter(&msp->ms_lock);
+               space_map_update(vd->vdev_checkpoint_sm);
+
+               spa->spa_checkpoint_info.sci_dspace +=
+                   range_tree_space(msp->ms_checkpointing);
+               vd->vdev_stat.vs_checkpoint_space +=
+                   range_tree_space(msp->ms_checkpointing);
+               ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
+                   -vd->vdev_checkpoint_sm->sm_alloc);
+
+               range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
+       }
+
        if (msp->ms_loaded) {
                /*
                 * When the space map is loaded, we have an accurate
@@ -2248,7 +2331,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
                 * it first before updating it.
                 */
                space_map_histogram_clear(msp->ms_sm);
-               space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
+               space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
 
                /*
                 * Since we've cleared the histogram we need to add back
@@ -2257,7 +2340,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
                 * to accurately reflect all free space even if some space
                 * is not yet available for allocation (i.e. deferred).
                 */
-               space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx);
+               space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
 
                /*
                 * Add back any deferred free space that has not been
@@ -2268,7 +2351,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
                 */
                for (int t = 0; t < TXG_DEFER_SIZE; t++) {
                        space_map_histogram_add(msp->ms_sm,
-                           msp->ms_defertree[t], tx);
+                           msp->ms_defer[t], tx);
                }
        }
 
@@ -2279,7 +2362,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
         * then we will lose some accuracy but will correct it the next
         * time we load the space map.
         */
-       space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx);
+       space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
 
        metaslab_group_histogram_add(mg, msp);
        metaslab_group_histogram_verify(mg);
@@ -2287,21 +2370,23 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 
        /*
         * For sync pass 1, we avoid traversing this txg's free range tree
-        * and instead will just swap the pointers for freeingtree and
-        * freedtree. We can safely do this since the freed_tree is
+        * and instead will just swap the pointers for freeing and
+        * freed. We can safely do this since the freed_tree is
         * guaranteed to be empty on the initial pass.
         */
        if (spa_sync_pass(spa) == 1) {
-               range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree);
+               range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
        } else {
-               range_tree_vacate(msp->ms_freeingtree,
-                   range_tree_add, msp->ms_freedtree);
+               range_tree_vacate(msp->ms_freeing,
+                   range_tree_add, msp->ms_freed);
        }
        range_tree_vacate(alloctree, NULL, NULL);
 
-       ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
-       ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
-       ASSERT0(range_tree_space(msp->ms_freeingtree));
+       ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
+       ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
+           & TXG_MASK]));
+       ASSERT0(range_tree_space(msp->ms_freeing));
+       ASSERT0(range_tree_space(msp->ms_checkpointing));
 
        mutex_exit(&msp->ms_lock);
 
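A small standalone sketch of the pass-1 optimization described in the comment above: when the destination tree is known to be empty, swapping the two pointers moves every segment in O(1) instead of walking the source and copying each one. The toy_tree_t type and helper below are illustrative, not the real range_tree_t API:

#include <assert.h>
#include <stdio.h>

/* Toy stand-in for range_tree_t; only a segment count is modeled. */
typedef struct toy_tree {
	int	tt_segments;
} toy_tree_t;

/* Analogue of range_tree_swap(): only legal when the destination is empty. */
static void
toy_tree_swap(toy_tree_t **src, toy_tree_t **dst)
{
	toy_tree_t *tmp;

	assert((*dst)->tt_segments == 0);
	tmp = *src;
	*src = *dst;
	*dst = tmp;
}

int
main(void)
{
	toy_tree_t freeing = { .tt_segments = 3 };	/* frees of this txg */
	toy_tree_t freed = { .tt_segments = 0 };	/* guaranteed empty on pass 1 */
	toy_tree_t *ms_freeing = &freeing, *ms_freed = &freed;

	toy_tree_swap(&ms_freeing, &ms_freed);
	/* Prints "freeing=0 freed=3": the segments moved without being copied. */
	printf("freeing=%d freed=%d\n",
	    ms_freeing->tt_segments, ms_freed->tt_segments);
	return (0);
}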
@@ -2336,29 +2421,34 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
         * If this metaslab is just becoming available, initialize its
         * range trees and add its capacity to the vdev.
         */
-       if (msp->ms_freedtree == NULL) {
+       if (msp->ms_freed == NULL) {
                for (int t = 0; t < TXG_SIZE; t++) {
-                       ASSERT(msp->ms_alloctree[t] == NULL);
+                       ASSERT(msp->ms_allocating[t] == NULL);
 
-                       msp->ms_alloctree[t] = range_tree_create(NULL, NULL);
+                       msp->ms_allocating[t] = range_tree_create(NULL, NULL);
                }
 
-               ASSERT3P(msp->ms_freeingtree, ==, NULL);
-               msp->ms_freeingtree = range_tree_create(NULL, NULL);
+               ASSERT3P(msp->ms_freeing, ==, NULL);
+               msp->ms_freeing = range_tree_create(NULL, NULL);
 
-               ASSERT3P(msp->ms_freedtree, ==, NULL);
-               msp->ms_freedtree = range_tree_create(NULL, NULL);
+               ASSERT3P(msp->ms_freed, ==, NULL);
+               msp->ms_freed = range_tree_create(NULL, NULL);
 
                for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-                       ASSERT(msp->ms_defertree[t] == NULL);
+                       ASSERT(msp->ms_defer[t] == NULL);
 
-                       msp->ms_defertree[t] = range_tree_create(NULL, NULL);
+                       msp->ms_defer[t] = range_tree_create(NULL, NULL);
                }
 
+               ASSERT3P(msp->ms_checkpointing, ==, NULL);
+               msp->ms_checkpointing = range_tree_create(NULL, NULL);
+
                vdev_space_update(vd, 0, 0, msp->ms_size);
        }
+       ASSERT0(range_tree_space(msp->ms_freeing));
+       ASSERT0(range_tree_space(msp->ms_checkpointing));
 
-       defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
+       defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
 
        uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
            metaslab_class_get_alloc(spa_normal_class(spa));
@@ -2369,7 +2459,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
        defer_delta = 0;
        alloc_delta = space_map_alloc_delta(msp->ms_sm);
        if (defer_allowed) {
-               defer_delta = range_tree_space(msp->ms_freedtree) -
+               defer_delta = range_tree_space(msp->ms_freed) -
                    range_tree_space(*defer_tree);
        } else {
                defer_delta -= range_tree_space(*defer_tree);
@@ -2385,19 +2475,19 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 
        /*
         * Move the frees from the defer_tree back to the free
-        * range tree (if it's loaded). Swap the freed_tree and the
-        * defer_tree -- this is safe to do because we've just emptied out
-        * the defer_tree.
+        * range tree (if it's loaded). Swap the freed_tree and
+        * the defer_tree -- this is safe to do because we've
+        * just emptied out the defer_tree.
         */
        range_tree_vacate(*defer_tree,
-           msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
+           msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
        if (defer_allowed) {
-               range_tree_swap(&msp->ms_freedtree, defer_tree);
+               range_tree_swap(&msp->ms_freed, defer_tree);
        } else {
-               range_tree_vacate(msp->ms_freedtree,
-                   msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
+               range_tree_vacate(msp->ms_freed,
+                   msp->ms_loaded ? range_tree_add : NULL,
+                   msp->ms_allocatable);
        }
-
        space_map_update(msp->ms_sm);
 
        msp->ms_deferspace += defer_delta;
@@ -2426,16 +2516,17 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 
                for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
                        VERIFY0(range_tree_space(
-                           msp->ms_alloctree[(txg + t) & TXG_MASK]));
+                           msp->ms_allocating[(txg + t) & TXG_MASK]));
                }
 
                if (!metaslab_debug_unload)
                        metaslab_unload(msp);
        }
 
-       ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
-       ASSERT0(range_tree_space(msp->ms_freeingtree));
-       ASSERT0(range_tree_space(msp->ms_freedtree));
+       ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
+       ASSERT0(range_tree_space(msp->ms_freeing));
+       ASSERT0(range_tree_space(msp->ms_freed));
+       ASSERT0(range_tree_space(msp->ms_checkpointing));
 
        mutex_exit(&msp->ms_lock);
 }
@@ -2666,7 +2757,7 @@ static uint64_t
 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
 {
        uint64_t start;
-       range_tree_t *rt = msp->ms_tree;
+       range_tree_t *rt = msp->ms_allocatable;
        metaslab_class_t *mc = msp->ms_group->mg_class;
 
        VERIFY(!msp->ms_condensing);
@@ -2681,10 +2772,10 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
                VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
                range_tree_remove(rt, start, size);
 
-               if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
+               if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
                        vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
 
-               range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
+               range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
 
                /* Track the last successful allocation */
                msp->ms_alloc_txg = txg;
@@ -3183,12 +3274,11 @@ next:
 
 void
 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
-    uint64_t txg)
+    boolean_t checkpoint)
 {
        metaslab_t *msp;
-       ASSERTV(spa_t *spa = vd->vdev_spa);
+       spa_t *spa = vd->vdev_spa;
 
-       ASSERT3U(txg, ==, spa->spa_syncing_txg);
        ASSERT(vdev_is_concrete(vd));
        ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
        ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
@@ -3202,11 +3292,19 @@ metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
        VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
 
        metaslab_check_free_impl(vd, offset, asize);
+
        mutex_enter(&msp->ms_lock);
-       if (range_tree_space(msp->ms_freeingtree) == 0) {
-               vdev_dirty(vd, VDD_METASLAB, msp, txg);
+       if (range_tree_is_empty(msp->ms_freeing) &&
+           range_tree_is_empty(msp->ms_checkpointing)) {
+               vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
+       }
+
+       if (checkpoint) {
+               ASSERT(spa_has_checkpoint(spa));
+               range_tree_add(msp->ms_checkpointing, offset, asize);
+       } else {
+               range_tree_add(msp->ms_freeing, offset, asize);
        }
-       range_tree_add(msp->ms_freeingtree, offset, asize);
        mutex_exit(&msp->ms_lock);
 }
 
@@ -3215,23 +3313,25 @@ void
 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
-       uint64_t *txgp = arg;
+       boolean_t *checkpoint = arg;
+
+       ASSERT3P(checkpoint, !=, NULL);
 
        if (vd->vdev_ops->vdev_op_remap != NULL)
-               vdev_indirect_mark_obsolete(vd, offset, size, *txgp);
+               vdev_indirect_mark_obsolete(vd, offset, size);
        else
-               metaslab_free_impl(vd, offset, size, *txgp);
+               metaslab_free_impl(vd, offset, size, *checkpoint);
 }
 
 static void
 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
-    uint64_t txg)
+    boolean_t checkpoint)
 {
        spa_t *spa = vd->vdev_spa;
 
        ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
-       if (txg > spa_freeze_txg(spa))
+       if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
                return;
 
        if (spa->spa_vdev_removal != NULL &&
@@ -3243,13 +3343,13 @@ metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
                 * an indirect vdev (in open context), and then (in syncing
                 * context) clear spa_vdev_removal.
                 */
-               free_from_removing_vdev(vd, offset, size, txg);
+               free_from_removing_vdev(vd, offset, size);
        } else if (vd->vdev_ops->vdev_op_remap != NULL) {
-               vdev_indirect_mark_obsolete(vd, offset, size, txg);
+               vdev_indirect_mark_obsolete(vd, offset, size);
                vd->vdev_ops->vdev_op_remap(vd, offset, size,
-                   metaslab_free_impl_cb, &txg);
+                   metaslab_free_impl_cb, &checkpoint);
        } else {
-               metaslab_free_concrete(vd, offset, size, txg);
+               metaslab_free_concrete(vd, offset, size, checkpoint);
        }
 }
 
@@ -3426,26 +3526,25 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
        msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
        mutex_enter(&msp->ms_lock);
-       range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
+       range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
            offset, size);
 
        VERIFY(!msp->ms_condensing);
        VERIFY3U(offset, >=, msp->ms_start);
        VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
-       VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
+       VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
            msp->ms_size);
        VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
        VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
-       range_tree_add(msp->ms_tree, offset, size);
+       range_tree_add(msp->ms_allocatable, offset, size);
        mutex_exit(&msp->ms_lock);
 }
 
 /*
- * Free the block represented by DVA in the context of the specified
- * transaction group.
+ * Free the block represented by the given DVA.
  */
 void
-metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
 {
        uint64_t vdev = DVA_GET_VDEV(dva);
        uint64_t offset = DVA_GET_OFFSET(dva);
@@ -3459,7 +3558,7 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
                size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
        }
 
-       metaslab_free_impl(vd, offset, size, txg);
+       metaslab_free_impl(vd, offset, size, checkpoint);
 }
 
 /*
@@ -3529,7 +3628,8 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
        if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
                error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
 
-       if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
+       if (error == 0 &&
+           !range_tree_contains(msp->ms_allocatable, offset, size))
                error = SET_ERROR(ENOENT);
 
        if (error || txg == 0) {        /* txg == 0 indicates dry run */
@@ -3540,13 +3640,15 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
        VERIFY(!msp->ms_condensing);
        VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
        VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
-       VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
-       range_tree_remove(msp->ms_tree, offset, size);
+       VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
+           msp->ms_size);
+       range_tree_remove(msp->ms_allocatable, offset, size);
 
        if (spa_writeable(spa)) {       /* don't dirty if we're zdb(1M) */
-               if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
+               if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
                        vdev_dirty(vd, VDD_METASLAB, msp, txg);
-               range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
+               range_tree_add(msp->ms_allocating[txg & TXG_MASK],
+                   offset, size);
        }
 
        mutex_exit(&msp->ms_lock);
@@ -3691,13 +3793,41 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
        ASSERT(!BP_IS_HOLE(bp));
        ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
 
+       /*
+        * If we have a checkpoint for the pool we need to make sure that
+        * the blocks that we free that are part of the checkpoint won't be
+        * reused until the checkpoint is discarded or we revert to it.
+        *
+        * The checkpoint flag is passed down the metaslab_free code path
+        * and is set whenever we want to add a block to the checkpoint's
+        * accounting. That is, we "checkpoint" blocks that existed at the
+        * time the checkpoint was created and are therefore referenced by
+        * the checkpointed uberblock.
+        *
+        * Note that we don't checkpoint any blocks if the current
+        * syncing txg <= spa_checkpoint_txg. We want these frees to sync
+        * normally as they will be referenced by the checkpointed uberblock.
+        */
+       boolean_t checkpoint = B_FALSE;
+       if (bp->blk_birth <= spa->spa_checkpoint_txg &&
+           spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
+               /*
+                * At this point, if the block is part of the checkpoint
+                * there is no way it was created in the current txg.
+                */
+               ASSERT(!now);
+               ASSERT3U(spa_syncing_txg(spa), ==, txg);
+               checkpoint = B_TRUE;
+       }
+
        spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
 
        for (int d = 0; d < ndvas; d++) {
                if (now) {
                        metaslab_unalloc_dva(spa, &dva[d], txg);
                } else {
-                       metaslab_free_dva(spa, &dva[d], txg);
+                       ASSERT3U(txg, ==, spa_syncing_txg(spa));
+                       metaslab_free_dva(spa, &dva[d], checkpoint);
                }
        }
 
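A standalone sketch of the checkpoint decision above, using simplified toy_spa_t/toy_bp_t structs instead of the real spa_t and blkptr_t; it models only the two txg comparisons that route a free onto the ms_checkpointing path:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-ins; not the real ZFS types. */
typedef struct toy_spa {
	uint64_t spa_checkpoint_txg;	/* 0 when no checkpoint exists */
	uint64_t spa_syncing_txg;
} toy_spa_t;

typedef struct toy_bp {
	uint64_t blk_birth;
} toy_bp_t;

/*
 * A freed block is charged to the checkpoint only if it was born at or
 * before the checkpointed txg and is being freed in a later txg; frees
 * that sync at or before spa_checkpoint_txg are already covered by the
 * checkpointed uberblock and take the normal ms_freeing path.
 */
static bool
free_goes_to_checkpoint(const toy_spa_t *spa, const toy_bp_t *bp)
{
	return (bp->blk_birth <= spa->spa_checkpoint_txg &&
	    spa->spa_syncing_txg > spa->spa_checkpoint_txg);
}

int
main(void)
{
	toy_spa_t spa = { .spa_checkpoint_txg = 50, .spa_syncing_txg = 60 };
	toy_bp_t old_bp = { .blk_birth = 40 };	/* existed when checkpoint was taken */
	toy_bp_t new_bp = { .blk_birth = 55 };	/* born after the checkpoint */

	/* Prints "1 0": only the pre-checkpoint block is checkpointed. */
	printf("%d %d\n", free_goes_to_checkpoint(&spa, &old_bp),
	    free_goes_to_checkpoint(&spa, &new_bp));
	return (0);
}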
@@ -3818,12 +3948,13 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
 
        mutex_enter(&msp->ms_lock);
        if (msp->ms_loaded)
-               range_tree_verify(msp->ms_tree, offset, size);
+               range_tree_verify(msp->ms_allocatable, offset, size);
 
-       range_tree_verify(msp->ms_freeingtree, offset, size);
-       range_tree_verify(msp->ms_freedtree, offset, size);
+       range_tree_verify(msp->ms_freeing, offset, size);
+       range_tree_verify(msp->ms_checkpointing, offset, size);
+       range_tree_verify(msp->ms_freed, offset, size);
        for (int j = 0; j < TXG_DEFER_SIZE; j++)
-               range_tree_verify(msp->ms_defertree[j], offset, size);
+               range_tree_verify(msp->ms_defer[j], offset, size);
        mutex_exit(&msp->ms_lock);
 }