Improved error handling for extreme rewinds
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index 826e5c421a6c6a1cf34b77211be555ab7d6292f6..4e4a6c4f5a250d0554e411860cf3fc7efd853dbd 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -99,6 +99,30 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
  */
 int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
 
+/*
+ * Allow a remap segment to span free chunks of at most this size. The main
+ * impact of a larger span is that we will read and write larger, more
+ * contiguous chunks, with more "unnecessary" data -- trading off bandwidth
+ * for iops.  The value here was chosen to align with
+ * zfs_vdev_read_gap_limit, which is a similar concept when doing regular
+ * reads (but there's no reason it has to be the same).
+ *
+ * Additionally, a higher span will have the following relatively minor
+ * effects:
+ *  - the mapping will be smaller, since one entry can cover more allocated
+ *    segments
+ *  - more of the fragmentation in the removing device will be preserved
+ *  - we'll do larger allocations, which may fail and fall back on smaller
+ *    allocations
+ */
+int vdev_removal_max_span = 32 * 1024;
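To make the trade-off concrete: with the default span of 32 KiB, two 8 KiB allocated segments separated by a 24 KiB hole are copied as one 40 KiB chunk (one read/write pair, one mapping entry, 24 KiB of "unnecessary" data), whereas a smaller span copies them as two 8 KiB chunks. A standalone sketch of that arithmetic, using a hypothetical layout rather than anything from the pool:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint64_t seg = 8 * 1024;        /* two allocated segments of 8 KiB */
        uint64_t gap = 24 * 1024;       /* free hole between them */
        uint64_t max_span = 32 * 1024;  /* vdev_removal_max_span default */

        if (gap <= max_span) {
            /* Bridged: one contiguous copy, extra bytes traded for iops. */
            printf("1 chunk, %llu bytes copied (%llu unnecessary)\n",
                (unsigned long long)(2 * seg + gap),
                (unsigned long long)gap);
        } else {
            /* Split: two copies, two mapping entries, no wasted bytes. */
            printf("2 chunks, %llu bytes copied\n",
                (unsigned long long)(2 * seg));
        }
        return (0);
    }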
+
+/*
+ * This is used by the test suite so that it can ensure that certain
+ * actions happen while in the middle of a removal.
+ */
+unsigned long zfs_remove_max_bytes_pause = -1UL;
+
 #define        VDEV_REMOVAL_ZAP_OBJS   "lzap"
 
 static void spa_vdev_remove_thread(void *arg);
@@ -227,7 +251,9 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
                VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
                    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
                    &one, tx));
-               ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0);
+               ASSERTV(boolean_t are_precise);
+               ASSERT0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+               ASSERT3B(are_precise, ==, B_TRUE);
        }
 
        vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
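The vdev_obsolete_counts_are_precise() accessor used above now returns an error code and reports its result through an out parameter, so syncing-context callers wrap it in ASSERT0()/VERIFY0(). A minimal sketch of that calling convention with illustrative names (not the ZFS API):

    #include <errno.h>
    #include <stdbool.h>

    struct toy_vdev {
        unsigned long top_zap;    /* 0 means the top-level ZAP is absent */
        bool counts_precise;
    };

    /* Return 0 and fill *are_precise, or an errno if the answer is unknown. */
    static int
    toy_counts_are_precise(const struct toy_vdev *vd, bool *are_precise)
    {
        if (vd->top_zap == 0)
            return (EINVAL);
        *are_precise = vd->counts_precise;
        return (0);
    }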
@@ -268,11 +294,11 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
                 * be copied.
                 */
                spa->spa_removing_phys.sr_to_copy -=
-                   range_tree_space(ms->ms_freeingtree);
+                   range_tree_space(ms->ms_freeing);
 
-               ASSERT0(range_tree_space(ms->ms_freedtree));
+               ASSERT0(range_tree_space(ms->ms_freed));
                for (int t = 0; t < TXG_SIZE; t++)
-                       ASSERT0(range_tree_space(ms->ms_alloctree[t]));
+                       ASSERT0(range_tree_space(ms->ms_allocating[t]));
        }
 
        /*
@@ -449,19 +475,18 @@ spa_restart_removal(spa_t *spa)
  * and we correctly free already-copied data.
  */
 void
-free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
-    uint64_t txg)
+free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
 {
        spa_t *spa = vd->vdev_spa;
        spa_vdev_removal_t *svr = spa->spa_vdev_removal;
        vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+       uint64_t txg = spa_syncing_txg(spa);
        uint64_t max_offset_yet = 0;
 
        ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
        ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
            vdev_indirect_mapping_object(vim));
        ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
-       ASSERT3U(spa_syncing_txg(spa), ==, txg);
 
        mutex_enter(&svr->svr_lock);
 
@@ -476,8 +501,13 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
         * held, so that the remove_thread can not load this metaslab and then
         * visit this offset between the time that we metaslab_free_concrete()
         * and when we check to see if it has been visited.
+        *
+        * Note: The checkpoint flag is set to false as having/taking
+        * a checkpoint and removing a device can't happen at the same
+        * time.
         */
-       metaslab_free_concrete(vd, offset, size, txg);
+       ASSERT(!spa_has_checkpoint(spa));
+       metaslab_free_concrete(vd, offset, size, B_FALSE);
 
        uint64_t synced_size = 0;
        uint64_t synced_offset = 0;
@@ -609,16 +639,17 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
         * of this free.
         */
        if (synced_size > 0) {
-               vdev_indirect_mark_obsolete(vd, synced_offset, synced_size,
-                   txg);
+               vdev_indirect_mark_obsolete(vd, synced_offset, synced_size);
+
                /*
                 * Note: this can only be called from syncing context,
                 * and the vdev_indirect_mapping is only changed from the
                 * sync thread, so we don't need svr_lock while doing
                 * metaslab_free_impl_cb.
                 */
+               boolean_t checkpoint = B_FALSE;
                vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
-                   metaslab_free_impl_cb, &txg);
+                   metaslab_free_impl_cb, &checkpoint);
        }
 }
 
@@ -666,10 +697,10 @@ static void
 free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
 {
        vdev_t *vd = arg;
-       vdev_indirect_mark_obsolete(vd, offset, size,
-           vd->vdev_spa->spa_syncing_txg);
+       vdev_indirect_mark_obsolete(vd, offset, size);
+       boolean_t checkpoint = B_FALSE;
        vdev_indirect_ops.vdev_op_remap(vd, offset, size,
-           metaslab_free_impl_cb, &vd->vdev_spa->spa_syncing_txg);
+           metaslab_free_impl_cb, &checkpoint);
 }
 
 /*
@@ -710,13 +741,52 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx)
        spa_sync_removing_state(spa, tx);
 }
 
+typedef struct vdev_copy_segment_arg {
+       spa_t *vcsa_spa;
+       dva_t *vcsa_dest_dva;
+       uint64_t vcsa_txg;
+       range_tree_t *vcsa_obsolete_segs;
+} vdev_copy_segment_arg_t;
+
+static void
+unalloc_seg(void *arg, uint64_t start, uint64_t size)
+{
+       vdev_copy_segment_arg_t *vcsa = arg;
+       spa_t *spa = vcsa->vcsa_spa;
+       blkptr_t bp = { { { {0} } } };
+
+       BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL);
+       BP_SET_LSIZE(&bp, size);
+       BP_SET_PSIZE(&bp, size);
+       BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+       BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF);
+       BP_SET_TYPE(&bp, DMU_OT_NONE);
+       BP_SET_LEVEL(&bp, 0);
+       BP_SET_DEDUP(&bp, 0);
+       BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
+
+       DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva));
+       DVA_SET_OFFSET(&bp.blk_dva[0],
+           DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start);
+       DVA_SET_ASIZE(&bp.blk_dva[0], size);
+
+       zio_free(spa, vcsa->vcsa_txg, &bp);
+}
+
 /*
  * All reads and writes associated with a call to spa_vdev_copy_segment()
  * are done.
  */
 static void
-spa_vdev_copy_nullzio_done(zio_t *zio)
+spa_vdev_copy_segment_done(zio_t *zio)
 {
+       vdev_copy_segment_arg_t *vcsa = zio->io_private;
+
+       range_tree_vacate(vcsa->vcsa_obsolete_segs,
+           unalloc_seg, vcsa);
+       range_tree_destroy(vcsa->vcsa_obsolete_segs);
+       kmem_free(vcsa, sizeof (*vcsa));
+
        spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
 }
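The done callback runs once every child read/write under the null zio has completed: it frees each destination range that turned out to be unneeded (via unalloc_seg), destroys the tree, releases the argument, and drops SCL_STATE. A simplified model of that vacate-with-callback pattern, using a plain linked list in place of the range tree:

    #include <stdlib.h>
    #include <stdint.h>

    typedef struct seg {
        uint64_t start, size;
        struct seg *next;
    } seg_t;

    typedef void seg_cb_t(void *arg, uint64_t start, uint64_t size);

    /* Apply cb to every segment, then empty the list -- analogous to
     * range_tree_vacate(obsolete_segs, unalloc_seg, vcsa) above. */
    static void
    seg_list_vacate(seg_t **head, seg_cb_t *cb, void *arg)
    {
        seg_t *s = *head;

        while (s != NULL) {
            seg_t *next = s->next;
            cb(arg, s->start, s->size);
            free(s);
            s = next;
        }
        *head = NULL;
    }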
 
@@ -833,7 +903,8 @@ spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
  * read from the old location and write to the new location.
  */
 static int
-spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
+spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
+    uint64_t maxalloc, uint64_t txg,
     vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
 {
        metaslab_group_t *mg = vd->vdev_mg;
@@ -841,14 +912,80 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
        spa_vdev_removal_t *svr = spa->spa_vdev_removal;
        vdev_indirect_mapping_entry_t *entry;
        dva_t dst = {{ 0 }};
+       uint64_t start = range_tree_min(segs);
 
-       ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+       ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
 
-       int error = metaslab_alloc_dva(spa, mg->mg_class, size,
-           &dst, 0, NULL, txg, 0, zal);
+       uint64_t size = range_tree_span(segs);
+       if (range_tree_span(segs) > maxalloc) {
+               /*
+                * We can't allocate all the segments.  Prefer to end
+                * the allocation at the end of a segment, thus avoiding
+                * additional split blocks.
+                */
+               range_seg_t search;
+               avl_index_t where;
+               search.rs_start = start + maxalloc;
+               search.rs_end = search.rs_start;
+               range_seg_t *rs = avl_find(&segs->rt_root, &search, &where);
+               if (rs == NULL) {
+                       rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE);
+               } else {
+                       rs = AVL_PREV(&segs->rt_root, rs);
+               }
+               if (rs != NULL) {
+                       size = rs->rs_end - start;
+               } else {
+                       /*
+                        * There are no segments that end before maxalloc.
+                        * I.e. the first segment is larger than maxalloc,
+                        * so we must split it.
+                        */
+                       size = maxalloc;
+               }
+       }
+       ASSERT3U(size, <=, maxalloc);
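The search above can be modeled without the AVL tree: walk the sorted segments, remember the largest segment end that still fits within maxalloc, and fall back to splitting the first segment only when even it is too large. An illustrative helper (a sorted array stands in for the range tree):

    #include <stdint.h>
    #include <stddef.h>

    typedef struct { uint64_t start, end; } seg_t;

    /* segs[] is sorted by start and segs[0].start equals 'start'. */
    static uint64_t
    choose_copy_size(const seg_t *segs, size_t nsegs, uint64_t start,
        uint64_t maxalloc)
    {
        uint64_t size = maxalloc;    /* worst case: split the first segment */

        for (size_t i = 0; i < nsegs; i++) {
            if (segs[i].end - start > maxalloc)
                break;
            size = segs[i].end - start;    /* end on a segment boundary */
        }
        return (size);
    }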
+
+       /*
+        * An allocation class might not have any remaining vdevs or space
+        */
+       metaslab_class_t *mc = mg->mg_class;
+       if (mc != spa_normal_class(spa) && mc->mc_groups <= 1)
+               mc = spa_normal_class(spa);
+       int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0,
+           zal, 0);
+       if (error == ENOSPC && mc != spa_normal_class(spa)) {
+               error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
+                   &dst, 0, NULL, txg, 0, zal, 0);
+       }
        if (error != 0)
                return (error);
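The allocation prefers the removing vdev's own allocation class but falls back to the normal class when that class has no other groups or runs out of space. The same two-step pattern, reduced to a toy allocator interface (illustrative only):

    #include <errno.h>
    #include <stdint.h>

    typedef int alloc_fn_t(void *cls, uint64_t size, void *result);

    /* Try the preferred class first; on ENOSPC retry against the fallback. */
    static int
    alloc_with_fallback(alloc_fn_t *alloc, void *preferred, void *fallback,
        uint64_t size, void *result)
    {
        int error = alloc(preferred, size, result);

        if (error == ENOSPC && preferred != fallback)
            error = alloc(fallback, size, result);
        return (error);
    }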
 
+       /*
+        * Determine the ranges that are not actually needed.  Offsets are
+        * relative to the start of the range to be copied (i.e. relative to the
+        * local variable "start").
+        */
+       range_tree_t *obsolete_segs = range_tree_create(NULL, NULL);
+
+       range_seg_t *rs = avl_first(&segs->rt_root);
+       ASSERT3U(rs->rs_start, ==, start);
+       uint64_t prev_seg_end = rs->rs_end;
+       while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) {
+               if (rs->rs_start >= start + size) {
+                       break;
+               } else {
+                       range_tree_add(obsolete_segs,
+                           prev_seg_end - start,
+                           rs->rs_start - prev_seg_end);
+               }
+               prev_seg_end = rs->rs_end;
+       }
+       /* We don't end in the middle of an obsolete range */
+       ASSERT3U(start + size, <=, prev_seg_end);
+
+       range_tree_clear(segs, start, size);
+
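The walk above records, relative to "start", each free gap between consecutive allocated segments inside the copied span; those bytes are written to the destination but never needed, so they can be freed and counted as obsolete right away. The same computation over a sorted array, for illustration:

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    typedef struct { uint64_t start, end; } seg_t;

    static void
    print_obsolete_gaps(const seg_t *segs, size_t nsegs, uint64_t start,
        uint64_t size)
    {
        if (nsegs == 0)
            return;

        uint64_t prev_end = segs[0].end;

        for (size_t i = 1; i < nsegs; i++) {
            if (segs[i].start >= start + size)
                break;
            /* Gap between the previous segment and this one, expressed
             * relative to the start of the copied range. */
            printf("obsolete: offset %llu, length %llu\n",
                (unsigned long long)(prev_end - start),
                (unsigned long long)(segs[i].start - prev_end));
            prev_end = segs[i].end;
        }
    }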
        /*
         * We can't have any padding of the allocated size, otherwise we will
         * misunderstand what's allocated, and the size of the mapping.
@@ -860,13 +997,22 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
        entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
        DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
        entry->vime_mapping.vimep_dst = dst;
+       if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+               entry->vime_obsolete_count = range_tree_space(obsolete_segs);
+       }
+
+       vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP);
+       vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
+       vcsa->vcsa_obsolete_segs = obsolete_segs;
+       vcsa->vcsa_spa = spa;
+       vcsa->vcsa_txg = txg;
 
        /*
         * See comment before spa_vdev_copy_one_child().
         */
        spa_config_enter(spa, SCL_STATE, spa, RW_READER);
        zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
-           spa_vdev_copy_nullzio_done, NULL, 0);
+           spa_vdev_copy_segment_done, vcsa, 0);
        vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
        if (dest_vd->vdev_ops == &vdev_mirror_ops) {
                for (int i = 0; i < dest_vd->vdev_children; i++) {
@@ -1069,39 +1215,79 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
 
        mutex_enter(&svr->svr_lock);
 
-       range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
-       if (rs == NULL) {
+       /*
+        * Determine how big of a chunk to copy.  We can allocate up
+        * to max_alloc bytes, and we can span up to vdev_removal_max_span
+        * bytes of unallocated space at a time.  "segs" will track the
+        * allocated segments that we are copying.  We may also be copying
+        * free segments (of up to vdev_removal_max_span bytes).
+        */
+       range_tree_t *segs = range_tree_create(NULL, NULL);
+       for (;;) {
+               range_seg_t *rs = range_tree_first(svr->svr_allocd_segs);
+
+               if (rs == NULL)
+                       break;
+
+               uint64_t seg_length;
+
+               if (range_tree_is_empty(segs)) {
+                       /* need to truncate the first seg based on max_alloc */
+                       seg_length =
+                           MIN(rs->rs_end - rs->rs_start, *max_alloc);
+               } else {
+                       if (rs->rs_start - range_tree_max(segs) >
+                           vdev_removal_max_span) {
+                               /*
+                                * Including this segment would cause us to
+                                * copy a larger unneeded chunk than is allowed.
+                                */
+                               break;
+                       } else if (rs->rs_end - range_tree_min(segs) >
+                           *max_alloc) {
+                               /*
+                                * This additional segment would extend past
+                                * max_alloc. Rather than splitting this
+                                * segment, leave it for the next mapping.
+                                */
+                               break;
+                       } else {
+                               seg_length = rs->rs_end - rs->rs_start;
+                       }
+               }
+
+               range_tree_add(segs, rs->rs_start, seg_length);
+               range_tree_remove(svr->svr_allocd_segs,
+                   rs->rs_start, seg_length);
+       }
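The loop above can be summarized as: keep pulling the first allocated segment into the chunk until the free gap to the next segment exceeds vdev_removal_max_span, or including it would push the chunk past max_alloc. A compact standalone model with arrays in place of the range trees:

    #include <stdint.h>
    #include <stddef.h>

    typedef struct { uint64_t start, end; } seg_t;

    /* Returns how many leading segments of src[] belong to this chunk. */
    static size_t
    gather_chunk(const seg_t *src, size_t nsegs, uint64_t max_alloc,
        uint64_t max_span)
    {
        size_t n = 0;
        uint64_t chunk_start = 0, chunk_end = 0;

        for (size_t i = 0; i < nsegs; i++) {
            if (n == 0) {
                /* First segment; the real code may also truncate it
                 * to max_alloc. */
                chunk_start = src[i].start;
            } else if (src[i].start - chunk_end > max_span) {
                break;    /* would copy too large an unneeded gap */
            } else if (src[i].end - chunk_start > max_alloc) {
                break;    /* would exceed the allocation budget */
            }
            chunk_end = src[i].end;
            n++;
        }
        return (n);
    }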
+
+       if (range_tree_is_empty(segs)) {
                mutex_exit(&svr->svr_lock);
+               range_tree_destroy(segs);
                return;
        }
-       uint64_t offset = rs->rs_start;
-       uint64_t length = MIN(rs->rs_end - rs->rs_start, *max_alloc);
-
-       range_tree_remove(svr->svr_allocd_segs, offset, length);
 
        if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
                dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
                    svr, 0, ZFS_SPACE_CHECK_NONE, tx);
        }
 
-       svr->svr_max_offset_to_sync[txg & TXG_MASK] = offset + length;
+       svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs);
 
        /*
         * Note: this is the amount of *allocated* space
         * that we are taking care of each txg.
         */
-       svr->svr_bytes_done[txg & TXG_MASK] += length;
+       svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs);
 
        mutex_exit(&svr->svr_lock);
 
        zio_alloc_list_t zal;
        metaslab_trace_init(&zal);
-       uint64_t thismax = *max_alloc;
-       while (length > 0) {
-               uint64_t mylen = MIN(length, thismax);
-
+       uint64_t thismax = SPA_MAXBLOCKSIZE;
+       while (!range_tree_is_empty(segs)) {
                int error = spa_vdev_copy_segment(vd,
-                   offset, mylen, txg, vca, &zal);
+                   segs, thismax, txg, vca, &zal);
 
                if (error == ENOSPC) {
                        /*
@@ -1115,18 +1301,17 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
                         */
                        ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
                        ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
-                       thismax = P2ROUNDUP(mylen / 2,
+                       uint64_t attempted =
+                           MIN(range_tree_span(segs), thismax);
+                       thismax = P2ROUNDUP(attempted / 2,
                            1 << spa->spa_max_ashift);
-                       ASSERT3U(thismax, <, mylen);
                        /*
                         * The minimum-size allocation can not fail.
                         */
-                       ASSERT3U(mylen, >, 1 << spa->spa_max_ashift);
-                       *max_alloc = mylen - (1 << spa->spa_max_ashift);
+                       ASSERT3U(attempted, >, 1 << spa->spa_max_ashift);
+                       *max_alloc = attempted - (1 << spa->spa_max_ashift);
                } else {
                        ASSERT0(error);
-                       length -= mylen;
-                       offset += mylen;
 
                        /*
                         * We've performed an allocation, so reset the
@@ -1137,6 +1322,7 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
                }
        }
        metaslab_trace_fini(&zal);
+       range_tree_destroy(segs);
 }
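On ENOSPC the copy retries with roughly half of what was just attempted, rounded up to the vdev's allocation granularity, and *max_alloc is lowered so the next mapping starts smaller; since the attempt never drops below one allocatable block, the minimum-size allocation cannot fail. The backoff arithmetic in isolation (P2ROUNDUP re-derived for a standalone sketch):

    #include <stdint.h>

    /* Round x up to a multiple of align (align must be a power of two). */
    static uint64_t
    p2roundup(uint64_t x, uint64_t align)
    {
        return ((x + align - 1) & ~(align - 1));
    }

    /* Next attempt size after failing to allocate 'attempted' bytes on a
     * vdev whose smallest allocation is 1 << ashift, as in the path above. */
    static uint64_t
    next_attempt(uint64_t attempted, uint64_t ashift)
    {
        return (p2roundup(attempted / 2, (uint64_t)1 << ashift));
    }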
 
 /*
@@ -1200,7 +1386,7 @@ spa_vdev_remove_thread(void *arg)
                 * Assert nothing in flight -- ms_*tree is empty.
                 */
                for (int i = 0; i < TXG_SIZE; i++) {
-                       ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+                       ASSERT0(range_tree_space(msp->ms_allocating[i]));
                }
 
                /*
@@ -1230,7 +1416,7 @@ spa_vdev_remove_thread(void *arg)
                            SM_ALLOC));
                        space_map_close(sm);
 
-                       range_tree_walk(msp->ms_freeingtree,
+                       range_tree_walk(msp->ms_freeing,
                            range_tree_remove, svr->svr_allocd_segs);
 
                        /*
@@ -1249,7 +1435,7 @@ spa_vdev_remove_thread(void *arg)
                    msp->ms_id);
 
                while (!svr->svr_thread_exit &&
-                   range_tree_space(svr->svr_allocd_segs) != 0) {
+                   !range_tree_is_empty(svr->svr_allocd_segs)) {
 
                        mutex_exit(&svr->svr_lock);
 
@@ -1264,6 +1450,19 @@ spa_vdev_remove_thread(void *arg)
                         */
                        spa_config_exit(spa, SCL_CONFIG, FTAG);
 
+                       /*
+                        * This delay will pause the removal around the point
+                        * specified by zfs_remove_max_bytes_pause. We do this
+                        * solely from the test suite or during debugging.
+                        */
+                       uint64_t bytes_copied =
+                           spa->spa_removing_phys.sr_copied;
+                       for (int i = 0; i < TXG_SIZE; i++)
+                               bytes_copied += svr->svr_bytes_done[i];
+                       while (zfs_remove_max_bytes_pause <= bytes_copied &&
+                           !svr->svr_thread_exit)
+                               delay(hz);
+
                        mutex_enter(&vca.vca_lock);
                        while (vca.vca_outstanding_bytes >
                            zfs_remove_max_copy_bytes) {
@@ -1366,15 +1565,20 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
        ASSERT3P(svr->svr_thread, ==, NULL);
 
        spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
-       if (vdev_obsolete_counts_are_precise(vd)) {
+
+       boolean_t are_precise;
+       VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+       if (are_precise) {
                spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
                VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
                    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx));
        }
 
-       if (vdev_obsolete_sm_object(vd) != 0) {
+       uint64_t obsolete_sm_object;
+       VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+       if (obsolete_sm_object != 0) {
                ASSERT(vd->vdev_obsolete_sm != NULL);
-               ASSERT3U(vdev_obsolete_sm_object(vd), ==,
+               ASSERT3U(obsolete_sm_object, ==,
                    space_map_object(vd->vdev_obsolete_sm));
 
                space_map_free(vd->vdev_obsolete_sm, tx);
@@ -1404,10 +1608,10 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
                 * Assert nothing in flight -- ms_*tree is empty.
                 */
                for (int i = 0; i < TXG_SIZE; i++)
-                       ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+                       ASSERT0(range_tree_space(msp->ms_allocating[i]));
                for (int i = 0; i < TXG_DEFER_SIZE; i++)
-                       ASSERT0(range_tree_space(msp->ms_defertree[i]));
-               ASSERT0(range_tree_space(msp->ms_freedtree));
+                       ASSERT0(range_tree_space(msp->ms_defer[i]));
+               ASSERT0(range_tree_space(msp->ms_freed));
 
                if (msp->ms_sm != NULL) {
                        /*
@@ -1423,7 +1627,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
                        mutex_enter(&svr->svr_lock);
                        VERIFY0(space_map_load(msp->ms_sm,
                            svr->svr_allocd_segs, SM_ALLOC));
-                       range_tree_walk(msp->ms_freeingtree,
+                       range_tree_walk(msp->ms_freeing,
                            range_tree_remove, svr->svr_allocd_segs);
 
                        /*
@@ -1499,7 +1703,8 @@ spa_vdev_remove_cancel(spa_t *spa)
        uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
 
        int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
-           spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE);
+           spa_vdev_remove_cancel_sync, NULL, 0,
+           ZFS_SPACE_CHECK_EXTRA_RESERVED);
 
        if (error == 0) {
                spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
@@ -1665,15 +1870,31 @@ spa_vdev_remove_top_check(vdev_t *vd)
        if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
                return (SET_ERROR(ENOTSUP));
 
+       /* available space in the pool's normal class */
+       uint64_t available = dsl_dir_space_available(
+           spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
+
+       metaslab_class_t *mc = vd->vdev_mg->mg_class;
+
+       /*
+        * When removing a vdev from an allocation class that has
+        * remaining vdevs, include available space from the class.
+        */
+       if (mc != spa_normal_class(spa) && mc->mc_groups > 1) {
+               uint64_t class_avail = metaslab_class_get_space(mc) -
+                   metaslab_class_get_alloc(mc);
+
+               /* add class space, adjusted for overhead */
+               available += (class_avail * 94) / 100;
+       }
+
        /*
         * There has to be enough free space to remove the
         * device and leave double the "slop" space (i.e. we
         * must leave at least 3% of the pool free, in addition to
         * the normal slop space).
         */
-       if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir,
-           NULL, 0, B_TRUE) <
-           vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
+       if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
                return (SET_ERROR(ENOSPC));
        }
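Putting the check together: start from the normal class's free space, add roughly 94% of a surviving allocation class's free space when that class keeps other vdevs, and require room for the removed vdev's allocated data plus another helping of slop. A toy version of the same arithmetic (illustrative, not the DSL accounting):

    #include <stdint.h>
    #include <stdbool.h>
    #include <errno.h>

    /* Returns 0 if the removal may proceed, ENOSPC otherwise. */
    static int
    removal_space_check(uint64_t normal_avail, uint64_t class_avail,
        bool class_has_other_vdevs, uint64_t vdev_used, uint64_t slop)
    {
        uint64_t available = normal_avail;

        if (class_has_other_vdevs)
            available += (class_avail * 94) / 100;    /* overhead haircut */

        if (available < vdev_used + slop)
            return (ENOSPC);
        return (0);
    }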
 
@@ -1836,6 +2057,17 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
        if (!locked)
                txg = spa_vdev_enter(spa);
 
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+               error = (spa_has_checkpoint(spa)) ?
+                   ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+
+               if (!locked)
+                       return (spa_vdev_exit(spa, NULL, txg, error));
+
+               return (error);
+       }
+
        vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
        if (spa->spa_spares.sav_vdevs != NULL &&
@@ -1939,11 +2171,22 @@ spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
        return (0);
 }
 
-#if defined(_KERNEL) && defined(HAVE_SPL)
+#if defined(_KERNEL)
 module_param(zfs_remove_max_segment, int, 0644);
 MODULE_PARM_DESC(zfs_remove_max_segment,
        "Largest contiguous segment to allocate when removing device");
 
+module_param(vdev_removal_max_span, int, 0644);
+MODULE_PARM_DESC(vdev_removal_max_span,
+       "Largest span of free chunks a remap segment can span");
+
+/* BEGIN CSTYLED */
+module_param(zfs_remove_max_bytes_pause, ulong, 0644);
+MODULE_PARM_DESC(zfs_remove_max_bytes_pause,
+       "Pause device removal after this many bytes are copied "
+       "(debug use only - causes removal to hang)");
+/* END CSTYLED */
+
 EXPORT_SYMBOL(free_from_removing_vdev);
 EXPORT_SYMBOL(spa_removal_get_stats);
 EXPORT_SYMBOL(spa_remove_init);