Fix ztest deadlock in spa_vdev_remove()
[mirror_zfs.git] / module / zfs / vdev_removal.c
index 6e81bf014894d5a04f456e6eb1017d95dc17149f..a706bc2a425a186d50dbd3dcc2e6dc9c8f61b3c9 100644 (file)
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 typedef struct vdev_copy_arg {
        metaslab_t      *vca_msp;
        uint64_t        vca_outstanding_bytes;
+       uint64_t        vca_read_error_bytes;
+       uint64_t        vca_write_error_bytes;
        kcondvar_t      vca_cv;
        kmutex_t        vca_lock;
 } vdev_copy_arg_t;
 
-typedef struct vdev_copy_seg_arg {
-       vdev_copy_arg_t *vcsa_copy_arg;
-       uint64_t        vcsa_txg;
-       dva_t           *vcsa_dest_dva;
-       blkptr_t        *vcsa_dest_bp;
-} vdev_copy_seg_arg_t;
-
 /*
- * The maximum amount of allowed data we're allowed to copy from a device
- * at a time when removing it.
+ * The maximum amount of memory we can use for outstanding i/o while
+ * doing a device removal.  This determines how much i/o we can have
+ * in flight concurrently.
  */
-int zfs_remove_max_copy_bytes = 8 * 1024 * 1024;
+int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
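/*
 * Illustrative sketch, not part of this patch: the back-pressure pattern
 * that zfs_remove_max_copy_bytes provides.  Bytes are added to an
 * "outstanding" counter before each copy i/o is issued (see
 * spa_vdev_copy_one_child() below), subtracted again in the i/o done
 * callback, and the removal thread sleeps on a condvar whenever the
 * counter exceeds the limit.  The names below (copy_throttle_t,
 * throttle_issue, throttle_done) are hypothetical stand-ins, using
 * pthreads rather than the kernel's kmutex/kcondvar primitives.
 */
#include <pthread.h>
#include <stdint.h>

typedef struct copy_throttle {
	pthread_mutex_t	ct_lock;
	pthread_cond_t	ct_cv;
	uint64_t	ct_outstanding_bytes;
	uint64_t	ct_limit;	/* e.g. 64 MiB */
} copy_throttle_t;

static void
throttle_issue(copy_throttle_t *ct, uint64_t size)
{
	pthread_mutex_lock(&ct->ct_lock);
	/* wait until enough in-flight copy i/o has completed */
	while (ct->ct_outstanding_bytes > ct->ct_limit)
		pthread_cond_wait(&ct->ct_cv, &ct->ct_lock);
	ct->ct_outstanding_bytes += size;
	pthread_mutex_unlock(&ct->ct_lock);
	/* ... issue the read/write pair for this chunk here ... */
}

static void
throttle_done(copy_throttle_t *ct, uint64_t size)
{
	pthread_mutex_lock(&ct->ct_lock);
	ct->ct_outstanding_bytes -= size;
	/* wake the removal thread if it is waiting in throttle_issue() */
	pthread_cond_signal(&ct->ct_cv);
	pthread_mutex_unlock(&ct->ct_lock);
}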
 
 /*
  * The largest contiguous segment that we will attempt to allocate when
@@ -105,9 +101,42 @@ int zfs_remove_max_copy_bytes = 8 * 1024 * 1024;
  */
 int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
 
+/*
+ * Ignore hard IO errors during device removal.  When set, if a device
+ * encounters a hard IO error during the removal process, the removal will
+ * not be cancelled.  This can result in a normally recoverable block
+ * becoming permanently damaged and is not recommended.
+ */
+int zfs_removal_ignore_errors = 0;
+
+/*
+ * Allow a remap segment to span free chunks of at most this size. The main
+ * impact of a larger span is that we will read and write larger, more
+ * contiguous chunks, with more "unnecessary" data -- trading off bandwidth
+ * for iops.  The value here was chosen to align with
+ * zfs_vdev_read_gap_limit, which is a similar concept when doing regular
+ * reads (but there's no reason it has to be the same).
+ *
+ * Additionally, a higher span will have the following relatively minor
+ * effects:
+ *  - the mapping will be smaller, since one entry can cover more allocated
+ *    segments
+ *  - more of the fragmentation in the removing device will be preserved
+ *  - we'll do larger allocations, which may fail and fall back on smaller
+ *    allocations
+ */
+int vdev_removal_max_span = 32 * 1024;
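/*
 * Illustrative sketch, not part of this patch: the gap test that
 * vdev_removal_max_span implies when a copy chunk is being built (see
 * spa_vdev_copy_impl() below).  A free gap between the chunk built so far
 * and the next allocated segment is copied along with the chunk only if
 * the gap is no larger than vdev_removal_max_span; otherwise the chunk is
 * ended there and a new mapping entry is started.  The helper name and
 * parameters are hypothetical.
 *
 * Example: with the default 32K limit, a chunk ending at offset 0x10000
 * may span a free gap up to a next segment at 0x17000 (28K gap), but a
 * next segment at 0x1b000 (44K gap) starts a new chunk instead.
 */
#include <stdbool.h>
#include <stdint.h>

static bool
gap_can_be_spanned(uint64_t chunk_end, uint64_t next_seg_start,
    uint64_t max_span)
{
	return (next_seg_start - chunk_end <= max_span);
}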
+
+/*
+ * This is used by the test suite so that it can ensure that certain
+ * actions happen while in the middle of a removal.
+ */
+int zfs_removal_suspend_progress = 0;
+
 #define        VDEV_REMOVAL_ZAP_OBJS   "lzap"
 
 static void spa_vdev_remove_thread(void *arg);
+static int spa_vdev_remove_cancel_impl(spa_t *spa);
 
 static void
 spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
@@ -165,7 +194,7 @@ spa_vdev_removal_create(vdev_t *vd)
        mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
        svr->svr_allocd_segs = range_tree_create(NULL, NULL);
-       svr->svr_vdev = vd;
+       svr->svr_vdev_id = vd->vdev_id;
 
        for (int i = 0; i < TXG_SIZE; i++) {
                svr->svr_frees[i] = range_tree_create(NULL, NULL);
@@ -207,9 +236,10 @@ spa_vdev_removal_destroy(spa_vdev_removal_t *svr)
 static void
 vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
 {
-       vdev_t *vd = arg;
+       int vdev_id = (uintptr_t)arg;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+       vdev_t *vd = vdev_lookup_top(spa, vdev_id);
        vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
-       spa_t *spa = vd->vdev_spa;
        objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
        spa_vdev_removal_t *svr = NULL;
        ASSERTV(uint64_t txg = dmu_tx_get_txg(tx));
@@ -232,7 +262,9 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
                VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
                    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
                    &one, tx));
-               ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0);
+               ASSERTV(boolean_t are_precise);
+               ASSERT0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+               ASSERT3B(are_precise, ==, B_TRUE);
        }
 
        vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
@@ -273,11 +305,11 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
                 * be copied.
                 */
                spa->spa_removing_phys.sr_to_copy -=
-                   range_tree_space(ms->ms_freeingtree);
+                   range_tree_space(ms->ms_freeing);
 
-               ASSERT0(range_tree_space(ms->ms_freedtree));
+               ASSERT0(range_tree_space(ms->ms_freed));
                for (int t = 0; t < TXG_SIZE; t++)
-                       ASSERT0(range_tree_space(ms->ms_alloctree[t]));
+                       ASSERT0(range_tree_space(ms->ms_allocating[t]));
        }
 
        /*
@@ -331,7 +363,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
        ASSERT3P(spa->spa_vdev_removal, ==, NULL);
        spa->spa_vdev_removal = svr;
        svr->svr_thread = thread_create(NULL, 0,
-           spa_vdev_remove_thread, vd, 0, &p0, TS_RUN, minclsyspri);
+           spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri);
 }
 
 /*
@@ -357,6 +389,7 @@ spa_remove_init(spa_t *spa)
                spa->spa_removing_phys.sr_state = DSS_NONE;
                spa->spa_removing_phys.sr_removing_vdev = -1;
                spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
+               spa->spa_indirect_vdevs_loaded = B_TRUE;
                return (0);
        } else if (error != 0) {
                return (error);
@@ -372,21 +405,24 @@ spa_remove_init(spa_t *spa)
                spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
                vdev_t *vd = vdev_lookup_top(spa,
                    spa->spa_removing_phys.sr_removing_vdev);
-               spa_config_exit(spa, SCL_STATE, FTAG);
 
-               if (vd == NULL)
+               if (vd == NULL) {
+                       spa_config_exit(spa, SCL_STATE, FTAG);
                        return (EINVAL);
+               }
 
                vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 
                ASSERT(vdev_is_concrete(vd));
                spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
-               ASSERT(svr->svr_vdev->vdev_removing);
+               ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id);
+               ASSERT(vd->vdev_removing);
 
                vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
                    spa->spa_meta_objset, vic->vic_mapping_object);
                vd->vdev_indirect_births = vdev_indirect_births_open(
                    spa->spa_meta_objset, vic->vic_births_object);
+               spa_config_exit(spa, SCL_STATE, FTAG);
 
                spa->spa_vdev_removal = svr;
        }
@@ -439,15 +475,8 @@ spa_restart_removal(spa_t *spa)
        if (!spa_writeable(spa))
                return;
 
-       vdev_t *vd = svr->svr_vdev;
-       vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
-
-       ASSERT3P(vd, !=, NULL);
-       ASSERT(vd->vdev_removing);
-
-       zfs_dbgmsg("restarting removal of %llu at count=%llu",
-           vd->vdev_id, vdev_indirect_mapping_num_entries(vim));
-       svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, vd,
+       zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id);
+       svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa,
            0, &p0, TS_RUN, minclsyspri);
 }
 
@@ -457,19 +486,18 @@ spa_restart_removal(spa_t *spa)
  * and we correctly free already-copied data.
  */
 void
-free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
-    uint64_t txg)
+free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
 {
        spa_t *spa = vd->vdev_spa;
        spa_vdev_removal_t *svr = spa->spa_vdev_removal;
        vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+       uint64_t txg = spa_syncing_txg(spa);
        uint64_t max_offset_yet = 0;
 
        ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
        ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
            vdev_indirect_mapping_object(vim));
-       ASSERT3P(vd, ==, svr->svr_vdev);
-       ASSERT3U(spa_syncing_txg(spa), ==, txg);
+       ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
 
        mutex_enter(&svr->svr_lock);
 
@@ -484,8 +512,13 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
         * held, so that the remove_thread can not load this metaslab and then
         * visit this offset between the time that we metaslab_free_concrete()
         * and when we check to see if it has been visited.
+        *
+        * Note: The checkpoint flag is set to false as having/taking
+        * a checkpoint and removing a device can't happen at the same
+        * time.
         */
-       metaslab_free_concrete(vd, offset, size, txg);
+       ASSERT(!spa_has_checkpoint(spa));
+       metaslab_free_concrete(vd, offset, size, B_FALSE);
 
        uint64_t synced_size = 0;
        uint64_t synced_offset = 0;
@@ -617,16 +650,17 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
         * of this free.
         */
        if (synced_size > 0) {
-               vdev_indirect_mark_obsolete(vd, synced_offset, synced_size,
-                   txg);
+               vdev_indirect_mark_obsolete(vd, synced_offset, synced_size);
+
                /*
                 * Note: this can only be called from syncing context,
                 * and the vdev_indirect_mapping is only changed from the
                 * sync thread, so we don't need svr_lock while doing
                 * metaslab_free_impl_cb.
                 */
+               boolean_t checkpoint = B_FALSE;
                vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
-                   metaslab_free_impl_cb, &txg);
+                   metaslab_free_impl_cb, &checkpoint);
        }
 }
 
@@ -646,10 +680,10 @@ spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
 
        if (state == DSS_FINISHED) {
                spa_removing_phys_t *srp = &spa->spa_removing_phys;
-               vdev_t *vd = svr->svr_vdev;
+               vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
                vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 
-               if (srp->sr_prev_indirect_vdev != UINT64_MAX) {
+               if (srp->sr_prev_indirect_vdev != -1) {
                        vdev_t *pvd;
                        pvd = vdev_lookup_top(spa,
                            srp->sr_prev_indirect_vdev);
@@ -674,10 +708,10 @@ static void
 free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
 {
        vdev_t *vd = arg;
-       vdev_indirect_mark_obsolete(vd, offset, size,
-           vd->vdev_spa->spa_syncing_txg);
+       vdev_indirect_mark_obsolete(vd, offset, size);
+       boolean_t checkpoint = B_FALSE;
        vdev_indirect_ops.vdev_op_remap(vd, offset, size,
-           metaslab_free_impl_cb, &vd->vdev_spa->spa_syncing_txg);
+           metaslab_free_impl_cb, &checkpoint);
 }
 
 /*
@@ -690,7 +724,7 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx)
 {
        spa_vdev_removal_t *svr = arg;
        spa_t *spa = dmu_tx_pool(tx)->dp_spa;
-       vdev_t *vd = svr->svr_vdev;
+       vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
        ASSERTV(vdev_indirect_config_t *vic = &vd->vdev_indirect_config);
        uint64_t txg = dmu_tx_get_txg(tx);
        vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
@@ -718,84 +752,283 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx)
        spa_sync_removing_state(spa, tx);
 }
 
+typedef struct vdev_copy_segment_arg {
+       spa_t *vcsa_spa;
+       dva_t *vcsa_dest_dva;
+       uint64_t vcsa_txg;
+       range_tree_t *vcsa_obsolete_segs;
+} vdev_copy_segment_arg_t;
+
+static void
+unalloc_seg(void *arg, uint64_t start, uint64_t size)
+{
+       vdev_copy_segment_arg_t *vcsa = arg;
+       spa_t *spa = vcsa->vcsa_spa;
+       blkptr_t bp = { { { {0} } } };
+
+       BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL);
+       BP_SET_LSIZE(&bp, size);
+       BP_SET_PSIZE(&bp, size);
+       BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+       BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF);
+       BP_SET_TYPE(&bp, DMU_OT_NONE);
+       BP_SET_LEVEL(&bp, 0);
+       BP_SET_DEDUP(&bp, 0);
+       BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
+
+       DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva));
+       DVA_SET_OFFSET(&bp.blk_dva[0],
+           DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start);
+       DVA_SET_ASIZE(&bp.blk_dva[0], size);
+
+       zio_free(spa, vcsa->vcsa_txg, &bp);
+}
+
+/*
+ * All reads and writes associated with a call to spa_vdev_copy_segment()
+ * are done.
+ */
+static void
+spa_vdev_copy_segment_done(zio_t *zio)
+{
+       vdev_copy_segment_arg_t *vcsa = zio->io_private;
+
+       range_tree_vacate(vcsa->vcsa_obsolete_segs,
+           unalloc_seg, vcsa);
+       range_tree_destroy(vcsa->vcsa_obsolete_segs);
+       kmem_free(vcsa, sizeof (*vcsa));
+
+       spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
+}
+
+/*
+ * The write of the new location is done.
+ */
 static void
 spa_vdev_copy_segment_write_done(zio_t *zio)
 {
-       vdev_copy_seg_arg_t *vcsa = zio->io_private;
-       vdev_copy_arg_t *vca = vcsa->vcsa_copy_arg;
-       spa_config_exit(zio->io_spa, SCL_STATE, FTAG);
+       vdev_copy_arg_t *vca = zio->io_private;
+
        abd_free(zio->io_abd);
 
        mutex_enter(&vca->vca_lock);
        vca->vca_outstanding_bytes -= zio->io_size;
+
+       if (zio->io_error != 0)
+               vca->vca_write_error_bytes += zio->io_size;
+
        cv_signal(&vca->vca_cv);
        mutex_exit(&vca->vca_lock);
-
-       ASSERT0(zio->io_error);
-       kmem_free(vcsa->vcsa_dest_bp, sizeof (blkptr_t));
-       kmem_free(vcsa, sizeof (vdev_copy_seg_arg_t));
 }
 
+/*
+ * The read of the old location is done.  The parent zio is the write to
+ * the new location.  Allow it to start.
+ */
 static void
 spa_vdev_copy_segment_read_done(zio_t *zio)
 {
-       vdev_copy_seg_arg_t *vcsa = zio->io_private;
-       dva_t *dest_dva = vcsa->vcsa_dest_dva;
-       uint64_t txg = vcsa->vcsa_txg;
-       spa_t *spa = zio->io_spa;
-       ASSERTV(vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dest_dva)));
-       blkptr_t *bp = NULL;
-       dva_t *dva = NULL;
-       uint64_t size = zio->io_size;
-
-       ASSERT3P(dest_vd, !=, NULL);
-       ASSERT0(zio->io_error);
-
-       vcsa->vcsa_dest_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
-       bp = vcsa->vcsa_dest_bp;
-       dva = bp->blk_dva;
-
-       BP_ZERO(bp);
-
-       /* initialize with dest_dva */
-       bcopy(dest_dva, dva, sizeof (dva_t));
-       BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
-
-       BP_SET_LSIZE(bp, size);
-       BP_SET_PSIZE(bp, size);
-       BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
-       BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
-       BP_SET_TYPE(bp, DMU_OT_NONE);
-       BP_SET_LEVEL(bp, 0);
-       BP_SET_DEDUP(bp, 0);
-       BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
-       zio_nowait(zio_rewrite(spa->spa_txg_zio[txg & TXG_MASK], spa,
-           txg, bp, zio->io_abd, size,
-           spa_vdev_copy_segment_write_done, vcsa,
-           ZIO_PRIORITY_REMOVAL, 0, NULL));
+       vdev_copy_arg_t *vca = zio->io_private;
+
+       if (zio->io_error != 0) {
+               mutex_enter(&vca->vca_lock);
+               vca->vca_read_error_bytes += zio->io_size;
+               mutex_exit(&vca->vca_lock);
+       }
+
+       zio_nowait(zio_unique_parent(zio));
+}
+
+/*
+ * If the old and new vdevs are mirrors, we will read both sides of the old
+ * mirror, and write each copy to the corresponding side of the new mirror.
+ * If the old and new vdevs have a different number of children, we will do
+ * this as best we can.  Since we aren't verifying checksums, this
+ * ensures that as long as there's a good copy of the data, we'll have a
+ * good copy after the removal, even if there's silent damage to one side
+ * of the mirror. If we're removing a mirror that has some silent damage,
+ * we'll have exactly the same damage in the new location (assuming that
+ * the new location is also a mirror).
+ *
+ * We accomplish this by creating a tree of zio_t's, with as many writes as
+ * there are "children" of the new vdev (a non-redundant vdev counts as one
+ * child, a 2-way mirror has 2 children, etc). Each write has an associated
+ * read from a child of the old vdev. Typically there will be the same
+ * number of children of the old and new vdevs.  However, if there are more
+ * children of the new vdev, some child(ren) of the old vdev will be issued
+ * multiple reads.  If there are more children of the old vdev, some copies
+ * will be dropped.
+ *
+ * For example, the tree of zio_t's for a 2-way mirror is:
+ *
+ *                            null
+ *                           /    \
+ *    write(new vdev, child 0)      write(new vdev, child 1)
+ *      |                             |
+ *    read(old vdev, child 0)       read(old vdev, child 1)
+ *
+ * Child zio's complete before their parents complete.  However, zio's
+ * created with zio_vdev_child_io() may be issued before their children
+ * complete.  In this case we need to make sure that the children (reads)
+ * complete before the parents (writes) are *issued*.  We do this by not
+ * calling zio_nowait() on each write until its corresponding read has
+ * completed.
+ *
+ * The spa_config_lock must be held while zio's created by
+ * zio_vdev_child_io() are in progress, to ensure that the vdev tree does
+ * not change (e.g. due to a concurrent "zpool attach/detach"). The "null"
+ * zio is needed to release the spa_config_lock after all the reads and
+ * writes complete. (Note that we can't grab the config lock for each read,
+ * because it is not reentrant - we could deadlock with a thread waiting
+ * for a write lock.)
+ */
+static void
+spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
+    vdev_t *source_vd, uint64_t source_offset,
+    vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size)
+{
+       ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
+
+       /*
+        * If the destination child is unwritable then there is no point
+        * in issuing the source reads which cannot be written.
+        */
+       if (!vdev_writeable(dest_child_vd))
+               return;
+
+       mutex_enter(&vca->vca_lock);
+       vca->vca_outstanding_bytes += size;
+       mutex_exit(&vca->vca_lock);
+
+       abd_t *abd = abd_alloc_for_io(size, B_FALSE);
+
+       vdev_t *source_child_vd = NULL;
+       if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
+               /*
+                * Source and dest are both mirrors.  Copy from the same
+                * child id as we are copying to (wrapping around if there
+                * are more dest children than source children).  If the
+                * preferred source child is unreadable, select another.
+                */
+               for (int i = 0; i < source_vd->vdev_children; i++) {
+                       source_child_vd = source_vd->vdev_child[
+                           (dest_id + i) % source_vd->vdev_children];
+                       if (vdev_readable(source_child_vd))
+                               break;
+               }
+       } else {
+               source_child_vd = source_vd;
+       }
+
+       /*
+        * There should always be at least one readable source child or
+        * the pool would be in a suspended state.  If an unreadable child
+        * were somehow selected, it would result in IO errors, the removal
+        * process being cancelled, and the pool reverting to its pre-removal
+        * state.
+        */
+       ASSERT3P(source_child_vd, !=, NULL);
+
+       zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
+           dest_child_vd, dest_offset, abd, size,
+           ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
+           ZIO_FLAG_CANFAIL,
+           spa_vdev_copy_segment_write_done, vca);
+
+       zio_nowait(zio_vdev_child_io(write_zio, NULL,
+           source_child_vd, source_offset, abd, size,
+           ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
+           ZIO_FLAG_CANFAIL,
+           spa_vdev_copy_segment_read_done, vca));
 }
 
+/*
+ * Allocate a new location for this segment, and create the zio_t's to
+ * read from the old location and write to the new location.
+ */
 static int
-spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
+spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
+    uint64_t maxalloc, uint64_t txg,
     vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
 {
        metaslab_group_t *mg = vd->vdev_mg;
        spa_t *spa = vd->vdev_spa;
        spa_vdev_removal_t *svr = spa->spa_vdev_removal;
        vdev_indirect_mapping_entry_t *entry;
-       vdev_copy_seg_arg_t *private;
        dva_t dst = {{ 0 }};
-       blkptr_t blk, *bp = &blk;
-       dva_t *dva = bp->blk_dva;
+       uint64_t start = range_tree_min(segs);
 
-       ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+       ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
 
-       int error = metaslab_alloc_dva(spa, mg->mg_class, size,
-           &dst, 0, NULL, txg, 0, zal);
+       uint64_t size = range_tree_span(segs);
+       if (range_tree_span(segs) > maxalloc) {
+               /*
+                * We can't allocate all the segments.  Prefer to end
+                * the allocation at the end of a segment, thus avoiding
+                * additional split blocks.
+                */
+               range_seg_t search;
+               avl_index_t where;
+               search.rs_start = start + maxalloc;
+               search.rs_end = search.rs_start;
+               range_seg_t *rs = avl_find(&segs->rt_root, &search, &where);
+               if (rs == NULL) {
+                       rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE);
+               } else {
+                       rs = AVL_PREV(&segs->rt_root, rs);
+               }
+               if (rs != NULL) {
+                       size = rs->rs_end - start;
+               } else {
+                       /*
+                        * There are no segments that end before maxalloc.
+                        * I.e. the first segment is larger than maxalloc,
+                        * so we must split it.
+                        */
+                       size = maxalloc;
+               }
+       }
+       ASSERT3U(size, <=, maxalloc);
+
+       /*
+        * An allocation class might not have any remaining vdevs or space
+        */
+       metaslab_class_t *mc = mg->mg_class;
+       if (mc != spa_normal_class(spa) && mc->mc_groups <= 1)
+               mc = spa_normal_class(spa);
+       int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0,
+           zal, 0);
+       if (error == ENOSPC && mc != spa_normal_class(spa)) {
+               error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
+                   &dst, 0, NULL, txg, 0, zal, 0);
+       }
        if (error != 0)
                return (error);
 
+       /*
+        * Determine the ranges that are not actually needed.  Offsets are
+        * relative to the start of the range to be copied (i.e. relative to the
+        * local variable "start").
+        */
+       range_tree_t *obsolete_segs = range_tree_create(NULL, NULL);
+
+       range_seg_t *rs = avl_first(&segs->rt_root);
+       ASSERT3U(rs->rs_start, ==, start);
+       uint64_t prev_seg_end = rs->rs_end;
+       while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) {
+               if (rs->rs_start >= start + size) {
+                       break;
+               } else {
+                       range_tree_add(obsolete_segs,
+                           prev_seg_end - start,
+                           rs->rs_start - prev_seg_end);
+               }
+               prev_seg_end = rs->rs_end;
+       }
+       /* We don't end in the middle of an obsolete range */
+       ASSERT3U(start + size, <=, prev_seg_end);
+
+       range_tree_clear(segs, start, size);
+
        /*
         * We can't have any padding of the allocated size, otherwise we will
         * misunderstand what's allocated, and the size of the mapping.
@@ -804,51 +1037,37 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
         */
        ASSERT3U(DVA_GET_ASIZE(&dst), ==, size);
 
-       mutex_enter(&vca->vca_lock);
-       vca->vca_outstanding_bytes += size;
-       mutex_exit(&vca->vca_lock);
-
        entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
        DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
        entry->vime_mapping.vimep_dst = dst;
+       if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+               entry->vime_obsolete_count = range_tree_space(obsolete_segs);
+       }
 
-       private = kmem_alloc(sizeof (vdev_copy_seg_arg_t), KM_SLEEP);
-       private->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
-       private->vcsa_txg = txg;
-       private->vcsa_copy_arg = vca;
-
-       /*
-        * This lock is eventually released by the donefunc for the
-        * zio_write_phys that finishes copying the data.
-        */
-       spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+       vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP);
+       vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
+       vcsa->vcsa_obsolete_segs = obsolete_segs;
+       vcsa->vcsa_spa = spa;
+       vcsa->vcsa_txg = txg;
 
        /*
-        * Do logical I/O, letting the redundancy vdevs (like mirror)
-        * handle their own I/O instead of duplicating that code here.
+        * See comment before spa_vdev_copy_one_child().
         */
-       BP_ZERO(bp);
-
-       DVA_SET_VDEV(&dva[0], vd->vdev_id);
-       DVA_SET_OFFSET(&dva[0], start);
-       DVA_SET_GANG(&dva[0], 0);
-       DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, size));
-
-       BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
-
-       BP_SET_LSIZE(bp, size);
-       BP_SET_PSIZE(bp, size);
-       BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
-       BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
-       BP_SET_TYPE(bp, DMU_OT_NONE);
-       BP_SET_LEVEL(bp, 0);
-       BP_SET_DEDUP(bp, 0);
-       BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
-       zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa,
-           bp, abd_alloc_for_io(size, B_FALSE), size,
-           spa_vdev_copy_segment_read_done, private,
-           ZIO_PRIORITY_REMOVAL, 0, NULL));
+       spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+       zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
+           spa_vdev_copy_segment_done, vcsa, 0);
+       vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
+       if (dest_vd->vdev_ops == &vdev_mirror_ops) {
+               for (int i = 0; i < dest_vd->vdev_children; i++) {
+                       vdev_t *child = dest_vd->vdev_child[i];
+                       spa_vdev_copy_one_child(vca, nzio, vd, start,
+                           child, DVA_GET_OFFSET(&dst), i, size);
+               }
+       } else {
+               spa_vdev_copy_one_child(vca, nzio, vd, start,
+                   dest_vd, DVA_GET_OFFSET(&dst), -1, size);
+       }
+       zio_nowait(nzio);
 
        list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry);
        ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
@@ -866,8 +1085,8 @@ static void
 vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
 {
        spa_vdev_removal_t *svr = arg;
-       vdev_t *vd = svr->svr_vdev;
-       spa_t *spa = vd->vdev_spa;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+       vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
 
        ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 
@@ -895,37 +1114,6 @@ vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
            "%s vdev %llu", spa_name(spa), vd->vdev_id);
 }
 
-static void
-vdev_indirect_state_transfer(vdev_t *ivd, vdev_t *vd)
-{
-       ivd->vdev_indirect_config = vd->vdev_indirect_config;
-
-       ASSERT3P(ivd->vdev_indirect_mapping, ==, NULL);
-       ASSERT(vd->vdev_indirect_mapping != NULL);
-       ivd->vdev_indirect_mapping = vd->vdev_indirect_mapping;
-       vd->vdev_indirect_mapping = NULL;
-
-       ASSERT3P(ivd->vdev_indirect_births, ==, NULL);
-       ASSERT(vd->vdev_indirect_births != NULL);
-       ivd->vdev_indirect_births = vd->vdev_indirect_births;
-       vd->vdev_indirect_births = NULL;
-
-       ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
-       ASSERT0(range_tree_space(ivd->vdev_obsolete_segments));
-
-       if (vd->vdev_obsolete_sm != NULL) {
-               ASSERT3U(ivd->vdev_asize, ==, vd->vdev_asize);
-
-               /*
-                * We cannot use space_map_{open,close} because we hold all
-                * the config locks as writer.
-                */
-               ASSERT3P(ivd->vdev_obsolete_sm, ==, NULL);
-               ivd->vdev_obsolete_sm = vd->vdev_obsolete_sm;
-               vd->vdev_obsolete_sm = NULL;
-       }
-}
-
 static void
 vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
 {
@@ -961,32 +1149,25 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
        vdev_remove_enlist_zaps(vd, svr->svr_zaplist);
 
        ivd = vdev_add_parent(vd, &vdev_indirect_ops);
+       ivd->vdev_removing = 0;
 
        vd->vdev_leaf_zap = 0;
 
        vdev_remove_child(ivd, vd);
        vdev_compact_children(ivd);
 
-       vdev_indirect_state_transfer(ivd, vd);
-
-       svr->svr_vdev = ivd;
-
-       ASSERT(!ivd->vdev_removing);
        ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
 
-       tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
-       dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr,
-           0, ZFS_SPACE_CHECK_NONE, tx);
-       dmu_tx_commit(tx);
-
-       /*
-        * Indicate that this thread has exited.
-        * After this, we can not use svr.
-        */
        mutex_enter(&svr->svr_lock);
        svr->svr_thread = NULL;
        cv_broadcast(&svr->svr_cv);
        mutex_exit(&svr->svr_lock);
+
+       /* After this, we can not use svr. */
+       tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+       dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr,
+           0, ZFS_SPACE_CHECK_NONE, tx);
+       dmu_tx_commit(tx);
 }
 
 /*
@@ -994,9 +1175,8 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
  * context by the removal thread after we have copied all vdev's data.
  */
 static void
-vdev_remove_complete(vdev_t *vd)
+vdev_remove_complete(spa_t *spa)
 {
-       spa_t *spa = vd->vdev_spa;
        uint64_t txg;
 
        /*
@@ -1004,8 +1184,12 @@ vdev_remove_complete(vdev_t *vd)
         * vdev_metaslab_fini()
         */
        txg_wait_synced(spa->spa_dsl_pool, 0);
-
        txg = spa_vdev_enter(spa);
+       vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
+
+       sysevent_t *ev = spa_event_create(spa, vd, NULL,
+           ESC_ZFS_VDEV_REMOVE_DEV);
+
        zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
            vd->vdev_id, txg);
 
@@ -1025,6 +1209,10 @@ vdev_remove_complete(vdev_t *vd)
        /*
         * We now release the locks, allowing spa_sync to run and finish the
         * removal via vdev_remove_complete_sync in syncing context.
+        *
+        * Note that we hold on to the vdev_t that has been replaced.  Since
+        * it isn't part of the vdev tree any longer, it can't be concurrently
+        * manipulated, even while we don't have the config lock.
         */
        (void) spa_vdev_exit(spa, NULL, txg, 0);
 
@@ -1046,6 +1234,9 @@ vdev_remove_complete(vdev_t *vd)
         */
        vdev_config_dirty(spa->spa_root_vdev);
        (void) spa_vdev_exit(spa, vd, txg, 0);
+
+       if (ev != NULL)
+               spa_event_post(ev);
 }
 
 /*
@@ -1056,7 +1247,7 @@ vdev_remove_complete(vdev_t *vd)
  * this size again this txg.
  */
 static void
-spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
+spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
     uint64_t *max_alloc, dmu_tx_t *tx)
 {
        uint64_t txg = dmu_tx_get_txg(tx);
@@ -1064,39 +1255,79 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
 
        mutex_enter(&svr->svr_lock);
 
-       range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
-       if (rs == NULL) {
+       /*
+        * Determine how big of a chunk to copy.  We can allocate up
+        * to max_alloc bytes, and we can span up to vdev_removal_max_span
+        * bytes of unallocated space at a time.  "segs" will track the
+        * allocated segments that we are copying.  We may also be copying
+        * free segments (of up to vdev_removal_max_span bytes).
+        */
+       range_tree_t *segs = range_tree_create(NULL, NULL);
+       for (;;) {
+               range_seg_t *rs = range_tree_first(svr->svr_allocd_segs);
+
+               if (rs == NULL)
+                       break;
+
+               uint64_t seg_length;
+
+               if (range_tree_is_empty(segs)) {
+                       /* need to truncate the first seg based on max_alloc */
+                       seg_length =
+                           MIN(rs->rs_end - rs->rs_start, *max_alloc);
+               } else {
+                       if (rs->rs_start - range_tree_max(segs) >
+                           vdev_removal_max_span) {
+                               /*
+                                * Including this segment would cause us to
+                                * copy a larger unneeded chunk than is allowed.
+                                */
+                               break;
+                       } else if (rs->rs_end - range_tree_min(segs) >
+                           *max_alloc) {
+                               /*
+                                * This additional segment would extend past
+                                * max_alloc. Rather than splitting this
+                                * segment, leave it for the next mapping.
+                                */
+                               break;
+                       } else {
+                               seg_length = rs->rs_end - rs->rs_start;
+                       }
+               }
+
+               range_tree_add(segs, rs->rs_start, seg_length);
+               range_tree_remove(svr->svr_allocd_segs,
+                   rs->rs_start, seg_length);
+       }
+
+       if (range_tree_is_empty(segs)) {
                mutex_exit(&svr->svr_lock);
+               range_tree_destroy(segs);
                return;
        }
-       uint64_t offset = rs->rs_start;
-       uint64_t length = MIN(rs->rs_end - rs->rs_start, *max_alloc);
-
-       range_tree_remove(svr->svr_allocd_segs, offset, length);
 
        if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
                dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
                    svr, 0, ZFS_SPACE_CHECK_NONE, tx);
        }
 
-       svr->svr_max_offset_to_sync[txg & TXG_MASK] = offset + length;
+       svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs);
 
        /*
         * Note: this is the amount of *allocated* space
         * that we are taking care of each txg.
         */
-       svr->svr_bytes_done[txg & TXG_MASK] += length;
+       svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs);
 
        mutex_exit(&svr->svr_lock);
 
        zio_alloc_list_t zal;
        metaslab_trace_init(&zal);
-       uint64_t thismax = *max_alloc;
-       while (length > 0) {
-               uint64_t mylen = MIN(length, thismax);
-
-               int error = spa_vdev_copy_segment(svr->svr_vdev,
-                   offset, mylen, txg, vca, &zal);
+       uint64_t thismax = SPA_MAXBLOCKSIZE;
+       while (!range_tree_is_empty(segs)) {
+               int error = spa_vdev_copy_segment(vd,
+                   segs, thismax, txg, vca, &zal);
 
                if (error == ENOSPC) {
                        /*
@@ -1110,18 +1341,17 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
                         */
                        ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
                        ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
-                       thismax = P2ROUNDUP(mylen / 2,
+                       uint64_t attempted =
+                           MIN(range_tree_span(segs), thismax);
+                       thismax = P2ROUNDUP(attempted / 2,
                            1 << spa->spa_max_ashift);
-                       ASSERT3U(thismax, <, mylen);
                        /*
                         * The minimum-size allocation can not fail.
                         */
-                       ASSERT3U(mylen, >, 1 << spa->spa_max_ashift);
-                       *max_alloc = mylen - (1 << spa->spa_max_ashift);
+                       ASSERT3U(attempted, >, 1 << spa->spa_max_ashift);
+                       *max_alloc = attempted - (1 << spa->spa_max_ashift);
                } else {
                        ASSERT0(error);
-                       length -= mylen;
-                       offset += mylen;
 
                        /*
                         * We've performed an allocation, so reset the
@@ -1132,6 +1362,7 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
                }
        }
        metaslab_trace_fini(&zal);
+       range_tree_destroy(segs);
 }
 
 /*
@@ -1153,12 +1384,14 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
 static void
 spa_vdev_remove_thread(void *arg)
 {
-       vdev_t *vd = arg;
-       spa_t *spa = vd->vdev_spa;
+       spa_t *spa = arg;
        spa_vdev_removal_t *svr = spa->spa_vdev_removal;
        vdev_copy_arg_t vca;
        uint64_t max_alloc = zfs_remove_max_segment;
        uint64_t last_txg = 0;
+
+       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+       vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
        vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
        uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);
 
@@ -1166,12 +1399,13 @@ spa_vdev_remove_thread(void *arg)
        ASSERT(vdev_is_concrete(vd));
        ASSERT(vd->vdev_removing);
        ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
-       ASSERT3P(svr->svr_vdev, ==, vd);
        ASSERT(vim != NULL);
 
        mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
        vca.vca_outstanding_bytes = 0;
+       vca.vca_read_error_bytes = 0;
+       vca.vca_write_error_bytes = 0;
 
        mutex_enter(&svr->svr_lock);
 
@@ -1194,7 +1428,7 @@ spa_vdev_remove_thread(void *arg)
                 * Assert nothing in flight -- ms_*tree is empty.
                 */
                for (int i = 0; i < TXG_SIZE; i++) {
-                       ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+                       ASSERT0(range_tree_space(msp->ms_allocating[i]));
                }
 
                /*
@@ -1224,7 +1458,7 @@ spa_vdev_remove_thread(void *arg)
                            SM_ALLOC));
                        space_map_close(sm);
 
-                       range_tree_walk(msp->ms_freeingtree,
+                       range_tree_walk(msp->ms_freeing,
                            range_tree_remove, svr->svr_allocd_segs);
 
                        /*
@@ -1243,10 +1477,34 @@ spa_vdev_remove_thread(void *arg)
                    msp->ms_id);
 
                while (!svr->svr_thread_exit &&
-                   range_tree_space(svr->svr_allocd_segs) != 0) {
+                   !range_tree_is_empty(svr->svr_allocd_segs)) {
 
                        mutex_exit(&svr->svr_lock);
 
+                       /*
+                        * We need to periodically drop the config lock so that
+                        * writers can get in.  Additionally, we can't wait
+                        * for a txg to sync while holding a config lock
+                        * (since a waiting writer could cause a 3-way deadlock
+                        * with the sync thread, which also gets a config
+                        * lock for reader).  So we can't hold the config lock
+                        * while calling dmu_tx_assign().
+                        */
+                       spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+                       /*
+                        * This delay will pause the removal around the point
+                        * specified by zfs_removal_suspend_progress. We do this
+                        * solely from the test suite or during debugging.
+                        */
+                       uint64_t bytes_copied =
+                           spa->spa_removing_phys.sr_copied;
+                       for (int i = 0; i < TXG_SIZE; i++)
+                               bytes_copied += svr->svr_bytes_done[i];
+                       while (zfs_removal_suspend_progress &&
+                           !svr->svr_thread_exit)
+                               delay(hz);
+
                        mutex_enter(&vca.vca_lock);
                        while (vca.vca_outstanding_bytes >
                            zfs_remove_max_copy_bytes) {
@@ -1260,18 +1518,37 @@ spa_vdev_remove_thread(void *arg)
                        VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
                        uint64_t txg = dmu_tx_get_txg(tx);
 
+                       /*
+                        * Reacquire the vdev_config lock.  The vdev_t
+                        * that we're removing may have changed, e.g. due
+                        * to a vdev_attach or vdev_detach.
+                        */
+                       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+                       vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+
                        if (txg != last_txg)
                                max_alloc = zfs_remove_max_segment;
                        last_txg = txg;
 
-                       spa_vdev_copy_impl(svr, &vca, &max_alloc, tx);
+                       spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);
 
                        dmu_tx_commit(tx);
                        mutex_enter(&svr->svr_lock);
                }
+
+               mutex_enter(&vca.vca_lock);
+               if (zfs_removal_ignore_errors == 0 &&
+                   (vca.vca_read_error_bytes > 0 ||
+                   vca.vca_write_error_bytes > 0)) {
+                       svr->svr_thread_exit = B_TRUE;
+               }
+               mutex_exit(&vca.vca_lock);
        }
 
        mutex_exit(&svr->svr_lock);
+
+       spa_config_exit(spa, SCL_CONFIG, FTAG);
+
        /*
         * Wait for all copies to finish before cleaning up the vca.
         */
@@ -1287,9 +1564,24 @@ spa_vdev_remove_thread(void *arg)
                svr->svr_thread = NULL;
                cv_broadcast(&svr->svr_cv);
                mutex_exit(&svr->svr_lock);
+
+               /*
+                * During the removal process an unrecoverable read or write
+                * error was encountered.  The removal process must be
+                * cancelled or this damage may become permanent.
+                */
+               if (zfs_removal_ignore_errors == 0 &&
+                   (vca.vca_read_error_bytes > 0 ||
+                   vca.vca_write_error_bytes > 0)) {
+                       zfs_dbgmsg("canceling removal due to IO errors: "
+                           "[read_error_bytes=%llu] [write_error_bytes=%llu]",
+                           vca.vca_read_error_bytes,
+                           vca.vca_write_error_bytes);
+                       spa_vdev_remove_cancel_impl(spa);
+               }
        } else {
                ASSERT0(range_tree_space(svr->svr_allocd_segs));
-               vdev_remove_complete(vd);
+               vdev_remove_complete(spa);
        }
 }
 
@@ -1330,7 +1622,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
 {
        spa_t *spa = dmu_tx_pool(tx)->dp_spa;
        spa_vdev_removal_t *svr = spa->spa_vdev_removal;
-       vdev_t *vd = svr->svr_vdev;
+       vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
        vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
        vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
        objset_t *mos = spa->spa_meta_objset;
@@ -1338,15 +1630,20 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
        ASSERT3P(svr->svr_thread, ==, NULL);
 
        spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
-       if (vdev_obsolete_counts_are_precise(vd)) {
+
+       boolean_t are_precise;
+       VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+       if (are_precise) {
                spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
                VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
                    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx));
        }
 
-       if (vdev_obsolete_sm_object(vd) != 0) {
+       uint64_t obsolete_sm_object;
+       VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+       if (obsolete_sm_object != 0) {
                ASSERT(vd->vdev_obsolete_sm != NULL);
-               ASSERT3U(vdev_obsolete_sm_object(vd), ==,
+               ASSERT3U(obsolete_sm_object, ==,
                    space_map_object(vd->vdev_obsolete_sm));
 
                space_map_free(vd->vdev_obsolete_sm, tx);
@@ -1376,10 +1673,10 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
                 * Assert nothing in flight -- ms_*tree is empty.
                 */
                for (int i = 0; i < TXG_SIZE; i++)
-                       ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+                       ASSERT0(range_tree_space(msp->ms_allocating[i]));
                for (int i = 0; i < TXG_DEFER_SIZE; i++)
-                       ASSERT0(range_tree_space(msp->ms_defertree[i]));
-               ASSERT0(range_tree_space(msp->ms_freedtree));
+                       ASSERT0(range_tree_space(msp->ms_defer[i]));
+               ASSERT0(range_tree_space(msp->ms_freed));
 
                if (msp->ms_sm != NULL) {
                        /*
@@ -1395,7 +1692,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
                        mutex_enter(&svr->svr_lock);
                        VERIFY0(space_map_load(msp->ms_sm,
                            svr->svr_allocd_segs, SM_ALLOC));
-                       range_tree_walk(msp->ms_freeingtree,
+                       range_tree_walk(msp->ms_freeing,
                            range_tree_remove, svr->svr_allocd_segs);
 
                        /*
@@ -1403,8 +1700,11 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
                         * because we have not allocated mappings for it yet.
                         */
                        uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
-                       range_tree_clear(svr->svr_allocd_segs, syncd,
-                           msp->ms_sm->sm_start + msp->ms_sm->sm_size - syncd);
+                       uint64_t sm_end = msp->ms_sm->sm_start +
+                           msp->ms_sm->sm_size;
+                       if (sm_end > syncd)
+                               range_tree_clear(svr->svr_allocd_segs,
+                                   syncd, sm_end - syncd);
 
                        mutex_exit(&svr->svr_lock);
                }
@@ -1457,18 +1757,14 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
            vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-");
 }
 
-int
-spa_vdev_remove_cancel(spa_t *spa)
+static int
+spa_vdev_remove_cancel_impl(spa_t *spa)
 {
-       spa_vdev_remove_suspend(spa);
-
-       if (spa->spa_vdev_removal == NULL)
-               return (ENOTACTIVE);
-
-       uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id;
+       uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
 
        int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
-           spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE);
+           spa_vdev_remove_cancel_sync, NULL, 0,
+           ZFS_SPACE_CHECK_EXTRA_RESERVED);
 
        if (error == 0) {
                spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
@@ -1480,6 +1776,17 @@ spa_vdev_remove_cancel(spa_t *spa)
        return (error);
 }
 
+int
+spa_vdev_remove_cancel(spa_t *spa)
+{
+       spa_vdev_remove_suspend(spa);
+
+       if (spa->spa_vdev_removal == NULL)
+               return (ENOTACTIVE);
+
+       return (spa_vdev_remove_cancel_impl(spa));
+}
+
 /*
  * Called every sync pass of every txg if there's a svr.
  */
@@ -1587,10 +1894,6 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
        vdev_dirty_leaves(vd, VDD_DTL, *txg);
        vdev_config_dirty(vd);
 
-       spa_history_log_internal(spa, "vdev remove", NULL,
-           "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
-           (vd->vdev_path != NULL) ? vd->vdev_path : "-");
-
        spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
 
        *txg = spa_vdev_config_enter(spa);
@@ -1634,15 +1937,31 @@ spa_vdev_remove_top_check(vdev_t *vd)
        if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
                return (SET_ERROR(ENOTSUP));
 
+       /* available space in the pool's normal class */
+       uint64_t available = dsl_dir_space_available(
+           spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
+
+       metaslab_class_t *mc = vd->vdev_mg->mg_class;
+
+       /*
+        * When removing a vdev from an allocation class that has
+        * remaining vdevs, include available space from the class.
+        */
+       if (mc != spa_normal_class(spa) && mc->mc_groups > 1) {
+               uint64_t class_avail = metaslab_class_get_space(mc) -
+                   metaslab_class_get_alloc(mc);
+
+               /* add class space, adjusted for overhead */
+               available += (class_avail * 94) / 100;
+       }
+
        /*
         * There has to be enough free space to remove the
         * device and leave double the "slop" space (i.e. we
         * must leave at least 3% of the pool free, in addition to
         * the normal slop space).
         */
-       if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir,
-           NULL, 0, B_TRUE) <
-           vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
+       if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
                return (SET_ERROR(ENOSPC));
        }
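/*
 * Worked example (illustrative, not part of this patch) for the check
 * above: when removing a vdev from a dedicated allocation class that
 * still has other member vdevs, suppose the normal class reports
 * 500 GiB available and the allocation class has 200 GiB unallocated.
 * Then
 *     available = 500 GiB + (200 GiB * 94) / 100 = 688 GiB
 * and the removal proceeds only if this exceeds the removing vdev's
 * vs_dspace plus the pool's slop space.
 */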
 
@@ -1774,7 +2093,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
        dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
        dsl_sync_task_nowait(spa->spa_dsl_pool,
            vdev_remove_initiate_sync,
-           vd, 0, ZFS_SPACE_CHECK_NONE, tx);
+           (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx);
        dmu_tx_commit(tx);
 
        return (0);
@@ -1799,12 +2118,24 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
        int error = 0;
        boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
        sysevent_t *ev = NULL;
+       char *vd_type = NULL, *vd_path = NULL;
 
        ASSERT(spa_writeable(spa));
 
        if (!locked)
                txg = spa_vdev_enter(spa);
 
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+               error = (spa_has_checkpoint(spa)) ?
+                   ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+
+               if (!locked)
+                       return (spa_vdev_exit(spa, NULL, txg, error));
+
+               return (error);
+       }
+
        vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
        if (spa->spa_spares.sav_vdevs != NULL &&
@@ -1821,11 +2152,8 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
                        ev = spa_event_create(spa, vd, NULL,
                            ESC_ZFS_VDEV_REMOVE_AUX);
 
-                       char *nvstr = fnvlist_lookup_string(nv,
-                           ZPOOL_CONFIG_PATH);
-                       spa_history_log_internal(spa, "vdev remove", NULL,
-                           "%s vdev (%s) %s", spa_name(spa),
-                           VDEV_TYPE_SPARE, nvstr);
+                       vd_type = VDEV_TYPE_SPARE;
+                       vd_path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH);
                        spa_vdev_remove_aux(spa->spa_spares.sav_config,
                            ZPOOL_CONFIG_SPARES, spares, nspares, nv);
                        spa_load_spares(spa);
@@ -1837,9 +2165,8 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
            nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
            ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
            (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
-               char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH);
-               spa_history_log_internal(spa, "vdev remove", NULL,
-                   "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_L2CACHE, nvstr);
+               vd_type = VDEV_TYPE_L2CACHE;
+               vd_path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH);
                /*
                 * Cache devices can always be removed.
                 */
@@ -1851,6 +2178,8 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
                spa->spa_l2cache.sav_sync = B_TRUE;
        } else if (vd != NULL && vd->vdev_islog) {
                ASSERT(!locked);
+               vd_type = "log";
+               vd_path = (vd->vdev_path != NULL) ? vd->vdev_path : "-";
                error = spa_vdev_remove_log(vd, &txg);
        } else if (vd != NULL) {
                ASSERT(!locked);
@@ -1865,6 +2194,18 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
        if (!locked)
                error = spa_vdev_exit(spa, NULL, txg, error);
 
+       /*
+        * Logging must be done outside the spa config lock. Otherwise,
+        * this code path could end up holding the spa config lock while
+        * waiting for a txg_sync so it can write to the internal log.
+        * Doing that would prevent the txg sync from actually happening,
+        * causing a deadlock.
+        */
+       if (error == 0 && vd_type != NULL && vd_path != NULL) {
+               spa_history_log_internal(spa, "vdev remove", NULL,
+                   "%s vdev (%s) %s", spa_name(spa), vd_type, vd_path);
+       }
+
        if (ev != NULL)
                spa_event_post(ev);
 
@@ -1885,13 +2226,6 @@ spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
        prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy;
        prs->prs_copied = spa->spa_removing_phys.sr_copied;
 
-       if (spa->spa_vdev_removal != NULL) {
-               for (int i = 0; i < TXG_SIZE; i++) {
-                       prs->prs_copied +=
-                           spa->spa_vdev_removal->svr_bytes_done[i];
-               }
-       }
-
        prs->prs_mapping_memory = 0;
        uint64_t indirect_vdev_id =
            spa->spa_removing_phys.sr_prev_indirect_vdev;
@@ -1908,11 +2242,26 @@ spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
        return (0);
 }
 
-#if defined(_KERNEL) && defined(HAVE_SPL)
+#if defined(_KERNEL)
+module_param(zfs_removal_ignore_errors, int, 0644);
+MODULE_PARM_DESC(zfs_removal_ignore_errors,
+       "Ignore hard IO errors when removing device");
+
 module_param(zfs_remove_max_segment, int, 0644);
 MODULE_PARM_DESC(zfs_remove_max_segment,
        "Largest contiguous segment to allocate when removing device");
 
+module_param(vdev_removal_max_span, int, 0644);
+MODULE_PARM_DESC(vdev_removal_max_span,
+       "Largest span of free chunks a remap segment can span");
+
+/* BEGIN CSTYLED */
+module_param(zfs_removal_suspend_progress, int, 0644);
+MODULE_PARM_DESC(zfs_removal_suspend_progress,
+       "Pause device removal after this many bytes are copied "
+       "(debug use only - causes removal to hang)");
+/* END CSTYLED */
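/*
 * Usage note (illustrative, not part of this patch): because the module
 * parameters above are registered with 0644 permissions, they can
 * normally be inspected and changed at runtime on Linux via
 * /sys/module/zfs/parameters/<name> (for example
 * /sys/module/zfs/parameters/vdev_removal_max_span), or set as zfs
 * module options at load time.
 */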
+
 EXPORT_SYMBOL(free_from_removing_vdev);
 EXPORT_SYMBOL(spa_removal_get_stats);
 EXPORT_SYMBOL(spa_remove_init);