Simplify spa_sync by breaking it up into smaller functions
module/zfs/vdev_removal.c
index f9084e8cf653b7cf874be6bb021598b52145889c..8d89007872f8a912b1299621ffd4dac60a476d5c 100644
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -44,6 +44,7 @@
 #include <sys/vdev_indirect_births.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/abd.h>
+#include <sys/vdev_initialize.h>
 #include <sys/trace_vdev.h>
 
 /*
@@ -80,6 +81,8 @@
 typedef struct vdev_copy_arg {
        metaslab_t      *vca_msp;
        uint64_t        vca_outstanding_bytes;
+       uint64_t        vca_read_error_bytes;
+       uint64_t        vca_write_error_bytes;
        kcondvar_t      vca_cv;
        kmutex_t        vca_lock;
 } vdev_copy_arg_t;
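
The two counters added here feed both the copy throttle and the error-driven cancellation added further down. As a rough user-space illustration of the pattern (the kernel code uses kmutex_t/kcondvar_t, accounts read and write errors in separate completion callbacks, and does its waiting in spa_vdev_remove_thread(); every name below is a stand-in):

    #include <pthread.h>
    #include <stdint.h>

    typedef struct copy_throttle {
        pthread_mutex_t lock;              /* stands in for vca_lock */
        pthread_cond_t  cv;                /* stands in for vca_cv */
        uint64_t        outstanding_bytes;
        uint64_t        read_error_bytes;
        uint64_t        write_error_bytes;
        uint64_t        max_copy_bytes;    /* zfs_remove_max_copy_bytes */
    } copy_throttle_t;

    /* Before issuing a copy IO: wait for room, then account for it. */
    static void
    throttle_issue(copy_throttle_t *t, uint64_t size)
    {
        pthread_mutex_lock(&t->lock);
        while (t->outstanding_bytes > t->max_copy_bytes)
            pthread_cond_wait(&t->cv, &t->lock);
        t->outstanding_bytes += size;
        pthread_mutex_unlock(&t->lock);
    }

    /* From the IO completion callback: retire the bytes and signal. */
    static void
    throttle_done(copy_throttle_t *t, uint64_t size, int io_error, int is_write)
    {
        pthread_mutex_lock(&t->lock);
        t->outstanding_bytes -= size;
        if (io_error != 0) {
            if (is_write)
                t->write_error_bytes += size;
            else
                t->read_error_bytes += size;
        }
        pthread_cond_signal(&t->cv);
        pthread_mutex_unlock(&t->lock);
    }
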
@@ -99,6 +102,14 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
  */
 int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
 
+/*
+ * Ignore hard IO errors during device removal.  When set, if a device
+ * encounters a hard IO error during the removal process, the removal
+ * will not be cancelled.  This can result in a normally recoverable
+ * block becoming permanently damaged and is not recommended.
+ */
+int zfs_removal_ignore_errors = 0;
+
 /*
  * Allow a remap segment to span free chunks of at most this size. The main
  * impact of a larger span is that we will read and write larger, more
@@ -117,9 +128,16 @@ int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
  */
 int vdev_removal_max_span = 32 * 1024;
 
+/*
+ * This is used by the test suite so that it can ensure that certain
+ * actions happen while in the middle of a removal.
+ */
+int zfs_removal_suspend_progress = 0;
+
 #define        VDEV_REMOVAL_ZAP_OBJS   "lzap"
 
 static void spa_vdev_remove_thread(void *arg);
+static int spa_vdev_remove_cancel_impl(spa_t *spa);
 
 static void
 spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
@@ -245,7 +263,9 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
                VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
                    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
                    &one, tx));
-               ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0);
+               ASSERTV(boolean_t are_precise);
+               ASSERT0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+               ASSERT3B(are_precise, ==, B_TRUE);
        }
 
        vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
@@ -286,11 +306,11 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
                 * be copied.
                 */
                spa->spa_removing_phys.sr_to_copy -=
-                   range_tree_space(ms->ms_freeingtree);
+                   range_tree_space(ms->ms_freeing);
 
-               ASSERT0(range_tree_space(ms->ms_freedtree));
+               ASSERT0(range_tree_space(ms->ms_freed));
                for (int t = 0; t < TXG_SIZE; t++)
-                       ASSERT0(range_tree_space(ms->ms_alloctree[t]));
+                       ASSERT0(range_tree_space(ms->ms_allocating[t]));
        }
 
        /*
@@ -467,19 +487,18 @@ spa_restart_removal(spa_t *spa)
  * and we correctly free already-copied data.
  */
 void
-free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
-    uint64_t txg)
+free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
 {
        spa_t *spa = vd->vdev_spa;
        spa_vdev_removal_t *svr = spa->spa_vdev_removal;
        vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+       uint64_t txg = spa_syncing_txg(spa);
        uint64_t max_offset_yet = 0;
 
        ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
        ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
            vdev_indirect_mapping_object(vim));
        ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
-       ASSERT3U(spa_syncing_txg(spa), ==, txg);
 
        mutex_enter(&svr->svr_lock);
 
@@ -494,8 +513,13 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
         * held, so that the remove_thread can not load this metaslab and then
         * visit this offset between the time that we metaslab_free_concrete()
         * and when we check to see if it has been visited.
+        *
+        * Note: The checkpoint flag is set to false as having/taking
+        * a checkpoint and removing a device can't happen at the same
+        * time.
         */
-       metaslab_free_concrete(vd, offset, size, txg);
+       ASSERT(!spa_has_checkpoint(spa));
+       metaslab_free_concrete(vd, offset, size, B_FALSE);
 
        uint64_t synced_size = 0;
        uint64_t synced_offset = 0;
@@ -627,16 +651,17 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
         * of this free.
         */
        if (synced_size > 0) {
-               vdev_indirect_mark_obsolete(vd, synced_offset, synced_size,
-                   txg);
+               vdev_indirect_mark_obsolete(vd, synced_offset, synced_size);
+
                /*
                 * Note: this can only be called from syncing context,
                 * and the vdev_indirect_mapping is only changed from the
                 * sync thread, so we don't need svr_lock while doing
                 * metaslab_free_impl_cb.
                 */
+               boolean_t checkpoint = B_FALSE;
                vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
-                   metaslab_free_impl_cb, &txg);
+                   metaslab_free_impl_cb, &checkpoint);
        }
 }
 
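The remap call above visits every mapped segment backing the freed range and forwards an opaque argument (now the checkpoint flag rather than the txg) to metaslab_free_impl_cb(). A minimal sketch of that visitor shape, with hypothetical types:

    #include <stdint.h>

    typedef void (*remap_cb_t)(uint64_t offset, uint64_t size, void *arg);

    typedef struct segment {
        uint64_t offset;
        uint64_t size;
    } segment_t;

    /* Invoke cb on every mapped segment, forwarding the opaque argument. */
    static void
    remap_walk(const segment_t *segs, int nsegs, remap_cb_t cb, void *arg)
    {
        for (int i = 0; i < nsegs; i++)
            cb(segs[i].offset, segs[i].size, arg);
    }
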
@@ -659,7 +684,7 @@ spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
                vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
                vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 
-               if (srp->sr_prev_indirect_vdev != UINT64_MAX) {
+               if (srp->sr_prev_indirect_vdev != -1) {
                        vdev_t *pvd;
                        pvd = vdev_lookup_top(spa,
                            srp->sr_prev_indirect_vdev);
@@ -684,10 +709,10 @@ static void
 free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
 {
        vdev_t *vd = arg;
-       vdev_indirect_mark_obsolete(vd, offset, size,
-           vd->vdev_spa->spa_syncing_txg);
+       vdev_indirect_mark_obsolete(vd, offset, size);
+       boolean_t checkpoint = B_FALSE;
        vdev_indirect_ops.vdev_op_remap(vd, offset, size,
-           metaslab_free_impl_cb, &vd->vdev_spa->spa_syncing_txg);
+           metaslab_free_impl_cb, &checkpoint);
 }
 
 /*
@@ -789,6 +814,10 @@ spa_vdev_copy_segment_write_done(zio_t *zio)
 
        mutex_enter(&vca->vca_lock);
        vca->vca_outstanding_bytes -= zio->io_size;
+
+       if (zio->io_error != 0)
+               vca->vca_write_error_bytes += zio->io_size;
+
        cv_signal(&vca->vca_cv);
        mutex_exit(&vca->vca_lock);
 }
@@ -800,6 +829,14 @@ spa_vdev_copy_segment_write_done(zio_t *zio)
 static void
 spa_vdev_copy_segment_read_done(zio_t *zio)
 {
+       vdev_copy_arg_t *vca = zio->io_private;
+
+       if (zio->io_error != 0) {
+               mutex_enter(&vca->vca_lock);
+               vca->vca_read_error_bytes += zio->io_size;
+               mutex_exit(&vca->vca_lock);
+       }
+
        zio_nowait(zio_unique_parent(zio));
 }
 
@@ -853,25 +890,45 @@ spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
 {
        ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
 
+       /*
+        * If the destination child is unwritable, then there is no point
+        * in issuing the source reads, which cannot be written.
+        */
+       if (!vdev_writeable(dest_child_vd))
+               return;
+
        mutex_enter(&vca->vca_lock);
        vca->vca_outstanding_bytes += size;
        mutex_exit(&vca->vca_lock);
 
        abd_t *abd = abd_alloc_for_io(size, B_FALSE);
 
-       vdev_t *source_child_vd;
+       vdev_t *source_child_vd = NULL;
        if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
                /*
                 * Source and dest are both mirrors.  Copy from the same
                 * child id as we are copying to (wrapping around if there
-                * are more dest children than source children).
+                * are more dest children than source children).  If the
+                * preferred source child is unreadable select another.
                 */
-               source_child_vd =
-                   source_vd->vdev_child[dest_id % source_vd->vdev_children];
+               for (int i = 0; i < source_vd->vdev_children; i++) {
+                       source_child_vd = source_vd->vdev_child[
+                           (dest_id + i) % source_vd->vdev_children];
+                       if (vdev_readable(source_child_vd))
+                               break;
+               }
        } else {
                source_child_vd = source_vd;
        }
 
+       /*
+        * There should always be at least one readable source child or
+        * the pool would be in a suspended state.  If an unreadable child
+        * were somehow selected, it would result in IO errors, the removal
+        * process being cancelled, and the pool reverting to its pre-removal
+        * state.
+        */
+       ASSERT3P(source_child_vd, !=, NULL);
+
        zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
            dest_child_vd, dest_offset, abd, size,
            ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
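
Reduced to a standalone sketch, the child selection above is a round-robin probe that starts at the preferred index and wraps around until a readable child is found (the vdev type below is a simplified stand-in):

    #include <stddef.h>
    #include <stdint.h>

    typedef struct vdev {
        struct vdev **vdev_child;
        uint64_t    vdev_children;
        int         readable;          /* stands in for vdev_readable() */
    } vdev_t;

    static vdev_t *
    pick_source_child(vdev_t *source_vd, uint64_t dest_id)
    {
        vdev_t *child = NULL;

        for (uint64_t i = 0; i < source_vd->vdev_children; i++) {
            child = source_vd->vdev_child[
                (dest_id + i) % source_vd->vdev_children];
            if (child->readable)
                break;
        }
        /* May still be unreadable if every child failed; callers assert. */
        return (child);
    }
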
@@ -933,8 +990,18 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
        }
        ASSERT3U(size, <=, maxalloc);
 
-       int error = metaslab_alloc_dva(spa, mg->mg_class, size,
-           &dst, 0, NULL, txg, 0, zal);
+       /*
+        * An allocation class might not have any remaining vdevs or space.
+        */
+       metaslab_class_t *mc = mg->mg_class;
+       if (mc != spa_normal_class(spa) && mc->mc_groups <= 1)
+               mc = spa_normal_class(spa);
+       int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0,
+           zal, 0);
+       if (error == ENOSPC && mc != spa_normal_class(spa)) {
+               error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
+                   &dst, 0, NULL, txg, 0, zal, 0);
+       }
        if (error != 0)
                return (error);
 
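The fallback above makes two decisions: redirect to the normal class up front when the source class has at most one metaslab group (that lone group may belong to the vdev being removed), and retry from the normal class if the first allocation returns ENOSPC. A compact sketch of the same logic, with alloc_from() as a hypothetical stand-in for metaslab_alloc_dva():

    #include <errno.h>
    #include <stdint.h>

    typedef struct mclass {
        int mc_groups;    /* metaslab groups remaining in this class */
    } mclass_t;

    /* Stub standing in for metaslab_alloc_dva(); pretend we are full. */
    static int
    alloc_from(mclass_t *mc, uint64_t size)
    {
        (void) mc;
        (void) size;
        return (ENOSPC);
    }

    static int
    alloc_with_fallback(mclass_t *mc, mclass_t *normal, uint64_t size)
    {
        if (mc != normal && mc->mc_groups <= 1)
            mc = normal;

        int error = alloc_from(mc, size);
        if (error == ENOSPC && mc != normal)
            error = alloc_from(normal, size);
        return (error);
    }
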
@@ -1092,19 +1159,16 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
 
        ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
 
-       tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
-       dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr,
-           0, ZFS_SPACE_CHECK_NONE, tx);
-       dmu_tx_commit(tx);
-
-       /*
-        * Indicate that this thread has exited.
-        * After this, we can not use svr.
-        */
        mutex_enter(&svr->svr_lock);
        svr->svr_thread = NULL;
        cv_broadcast(&svr->svr_cv);
        mutex_exit(&svr->svr_lock);
+
+       /* After this, we can not use svr. */
+       tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+       dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr,
+           0, ZFS_SPACE_CHECK_NONE, tx);
+       dmu_tx_commit(tx);
 }
 
 /*
@@ -1123,6 +1187,7 @@ vdev_remove_complete(spa_t *spa)
        txg_wait_synced(spa->spa_dsl_pool, 0);
        txg = spa_vdev_enter(spa);
        vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
+       ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 
        sysevent_t *ev = spa_event_create(spa, vd, NULL,
            ESC_ZFS_VDEV_REMOVE_DEV);
@@ -1341,6 +1406,8 @@ spa_vdev_remove_thread(void *arg)
        mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
        vca.vca_outstanding_bytes = 0;
+       vca.vca_read_error_bytes = 0;
+       vca.vca_write_error_bytes = 0;
 
        mutex_enter(&svr->svr_lock);
 
@@ -1363,7 +1430,7 @@ spa_vdev_remove_thread(void *arg)
                 * Assert nothing in flight -- ms_*tree is empty.
                 */
                for (int i = 0; i < TXG_SIZE; i++) {
-                       ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+                       ASSERT0(range_tree_space(msp->ms_allocating[i]));
                }
 
                /*
@@ -1393,7 +1460,7 @@ spa_vdev_remove_thread(void *arg)
                            SM_ALLOC));
                        space_map_close(sm);
 
-                       range_tree_walk(msp->ms_freeingtree,
+                       range_tree_walk(msp->ms_freeing,
                            range_tree_remove, svr->svr_allocd_segs);
 
                        /*
@@ -1412,7 +1479,7 @@ spa_vdev_remove_thread(void *arg)
                    msp->ms_id);
 
                while (!svr->svr_thread_exit &&
-                   range_tree_space(svr->svr_allocd_segs) != 0) {
+                   !range_tree_is_empty(svr->svr_allocd_segs)) {
 
                        mutex_exit(&svr->svr_lock);
 
@@ -1427,6 +1494,19 @@ spa_vdev_remove_thread(void *arg)
                         */
                        spa_config_exit(spa, SCL_CONFIG, FTAG);
 
+                       /*
+                        * This delay will pause the removal around the point
+                        * specified by zfs_removal_suspend_progress. We do this
+                        * solely from the test suite or during debugging.
+                        */
+                       uint64_t bytes_copied =
+                           spa->spa_removing_phys.sr_copied;
+                       for (int i = 0; i < TXG_SIZE; i++)
+                               bytes_copied += svr->svr_bytes_done[i];
+                       while (zfs_removal_suspend_progress &&
+                           !svr->svr_thread_exit)
+                               delay(hz);
+
                        mutex_enter(&vca.vca_lock);
                        while (vca.vca_outstanding_bytes >
                            zfs_remove_max_copy_bytes) {
@@ -1457,6 +1537,14 @@ spa_vdev_remove_thread(void *arg)
                        dmu_tx_commit(tx);
                        mutex_enter(&svr->svr_lock);
                }
+
+               mutex_enter(&vca.vca_lock);
+               if (zfs_removal_ignore_errors == 0 &&
+                   (vca.vca_read_error_bytes > 0 ||
+                   vca.vca_write_error_bytes > 0)) {
+                       svr->svr_thread_exit = B_TRUE;
+               }
+               mutex_exit(&vca.vca_lock);
        }
 
        mutex_exit(&svr->svr_lock);
@@ -1478,6 +1566,21 @@ spa_vdev_remove_thread(void *arg)
                svr->svr_thread = NULL;
                cv_broadcast(&svr->svr_cv);
                mutex_exit(&svr->svr_lock);
+
+               /*
+                * During the removal process an unrecoverable read or write
+                * error was encountered.  The removal process must be
+                * cancelled or this damage may become permanent.
+                */
+               if (zfs_removal_ignore_errors == 0 &&
+                   (vca.vca_read_error_bytes > 0 ||
+                   vca.vca_write_error_bytes > 0)) {
+                       zfs_dbgmsg("canceling removal due to IO errors: "
+                           "[read_error_bytes=%llu] [write_error_bytes=%llu]",
+                           vca.vca_read_error_bytes,
+                           vca.vca_write_error_bytes);
+                       spa_vdev_remove_cancel_impl(spa);
+               }
        } else {
                ASSERT0(range_tree_space(svr->svr_allocd_segs));
                vdev_remove_complete(spa);
@@ -1529,15 +1632,20 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
        ASSERT3P(svr->svr_thread, ==, NULL);
 
        spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
-       if (vdev_obsolete_counts_are_precise(vd)) {
+
+       boolean_t are_precise;
+       VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+       if (are_precise) {
                spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
                VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
                    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx));
        }
 
-       if (vdev_obsolete_sm_object(vd) != 0) {
+       uint64_t obsolete_sm_object;
+       VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+       if (obsolete_sm_object != 0) {
                ASSERT(vd->vdev_obsolete_sm != NULL);
-               ASSERT3U(vdev_obsolete_sm_object(vd), ==,
+               ASSERT3U(obsolete_sm_object, ==,
                    space_map_object(vd->vdev_obsolete_sm));
 
                space_map_free(vd->vdev_obsolete_sm, tx);
@@ -1567,10 +1675,10 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
                 * Assert nothing in flight -- ms_*tree is empty.
                 */
                for (int i = 0; i < TXG_SIZE; i++)
-                       ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+                       ASSERT0(range_tree_space(msp->ms_allocating[i]));
                for (int i = 0; i < TXG_DEFER_SIZE; i++)
-                       ASSERT0(range_tree_space(msp->ms_defertree[i]));
-               ASSERT0(range_tree_space(msp->ms_freedtree));
+                       ASSERT0(range_tree_space(msp->ms_defer[i]));
+               ASSERT0(range_tree_space(msp->ms_freed));
 
                if (msp->ms_sm != NULL) {
                        /*
@@ -1586,7 +1694,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
                        mutex_enter(&svr->svr_lock);
                        VERIFY0(space_map_load(msp->ms_sm,
                            svr->svr_allocd_segs, SM_ALLOC));
-                       range_tree_walk(msp->ms_freeingtree,
+                       range_tree_walk(msp->ms_freeing,
                            range_tree_remove, svr->svr_allocd_segs);
 
                        /*
@@ -1651,18 +1759,14 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
            vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-");
 }
 
-int
-spa_vdev_remove_cancel(spa_t *spa)
+static int
+spa_vdev_remove_cancel_impl(spa_t *spa)
 {
-       spa_vdev_remove_suspend(spa);
-
-       if (spa->spa_vdev_removal == NULL)
-               return (ENOTACTIVE);
-
        uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
 
        int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
-           spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE);
+           spa_vdev_remove_cancel_sync, NULL, 0,
+           ZFS_SPACE_CHECK_EXTRA_RESERVED);
 
        if (error == 0) {
                spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
@@ -1674,6 +1778,17 @@ spa_vdev_remove_cancel(spa_t *spa)
        return (error);
 }
 
+int
+spa_vdev_remove_cancel(spa_t *spa)
+{
+       spa_vdev_remove_suspend(spa);
+
+       if (spa->spa_vdev_removal == NULL)
+               return (ENOTACTIVE);
+
+       return (spa_vdev_remove_cancel_impl(spa));
+}
+
 /*
  * Called every sync pass of every txg if there's a svr.
  */
@@ -1683,6 +1798,9 @@ svr_sync(spa_t *spa, dmu_tx_t *tx)
        spa_vdev_removal_t *svr = spa->spa_vdev_removal;
        int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 
+       if (svr == NULL)
+               return;
+
        /*
         * This check is necessary so that we do not dirty the
         * DIRECTORY_OBJECT via spa_sync_removing_state() when there
@@ -1781,12 +1899,11 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
        vdev_dirty_leaves(vd, VDD_DTL, *txg);
        vdev_config_dirty(vd);
 
-       spa_history_log_internal(spa, "vdev remove", NULL,
-           "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
-           (vd->vdev_path != NULL) ? vd->vdev_path : "-");
-
        spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
 
+       /* Stop initializing */
+       vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
+
        *txg = spa_vdev_config_enter(spa);
 
        sysevent_t *ev = spa_event_create(spa, vd, NULL,
@@ -1828,15 +1945,31 @@ spa_vdev_remove_top_check(vdev_t *vd)
        if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
                return (SET_ERROR(ENOTSUP));
 
+       /* available space in the pool's normal class */
+       uint64_t available = dsl_dir_space_available(
+           spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
+
+       metaslab_class_t *mc = vd->vdev_mg->mg_class;
+
+       /*
+        * When removing a vdev from an allocation class that has
+        * remaining vdevs, include available space from the class.
+        */
+       if (mc != spa_normal_class(spa) && mc->mc_groups > 1) {
+               uint64_t class_avail = metaslab_class_get_space(mc) -
+                   metaslab_class_get_alloc(mc);
+
+               /* add class space, adjusted for overhead */
+               available += (class_avail * 94) / 100;
+       }
+
        /*
         * There has to be enough free space to remove the
         * device and leave double the "slop" space (i.e. we
         * must leave at least 3% of the pool free, in addition to
         * the normal slop space).
         */
-       if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir,
-           NULL, 0, B_TRUE) <
-           vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
+       if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
                return (SET_ERROR(ENOSPC));
        }
 
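For a concrete feel of the check above, a small worked example (all sizes invented): with 800 GiB free in the normal class, 100 GiB free in a special class that still has other vdevs, a 120 GiB device being removed, and 32 GiB of slop, removal is allowed because 800 + 94 = 894 GiB exceeds 120 + 32 = 152 GiB:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t available = 800ULL << 30;    /* normal class free space */
        uint64_t class_avail = 100ULL << 30;  /* special class free space */
        uint64_t vs_dspace = 120ULL << 30;    /* space on removing vdev */
        uint64_t slop = 32ULL << 30;          /* spa_get_slop_space() */

        /* Removing from a class with remaining vdevs: add 94% of it. */
        available += (class_avail * 94) / 100;

        printf("removal %s\n",
            (available < vs_dspace + slop) ? "ENOSPC" : "allowed");
        return (0);
    }
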
@@ -1947,6 +2080,13 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
         */
        error = spa_reset_logs(spa);
 
+       /*
+        * We stop any initializing that is currently in progress but leave
+        * the state as "active". This will allow the initializing to resume
+        * if the removal is canceled sometime later.
+        */
+       vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
+
        *txg = spa_vdev_config_enter(spa);
 
        /*
@@ -1958,6 +2098,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
 
        if (error != 0) {
                metaslab_group_activate(mg);
+               spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
                return (error);
        }
 
@@ -1993,12 +2134,24 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
        int error = 0;
        boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
        sysevent_t *ev = NULL;
+       char *vd_type = NULL, *vd_path = NULL;
 
        ASSERT(spa_writeable(spa));
 
        if (!locked)
                txg = spa_vdev_enter(spa);
 
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+               error = (spa_has_checkpoint(spa)) ?
+                   ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+
+               if (!locked)
+                       return (spa_vdev_exit(spa, NULL, txg, error));
+
+               return (error);
+       }
+
        vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
        if (spa->spa_spares.sav_vdevs != NULL &&
@@ -2015,11 +2168,8 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
                        ev = spa_event_create(spa, vd, NULL,
                            ESC_ZFS_VDEV_REMOVE_AUX);
 
-                       char *nvstr = fnvlist_lookup_string(nv,
-                           ZPOOL_CONFIG_PATH);
-                       spa_history_log_internal(spa, "vdev remove", NULL,
-                           "%s vdev (%s) %s", spa_name(spa),
-                           VDEV_TYPE_SPARE, nvstr);
+                       vd_type = VDEV_TYPE_SPARE;
+                       vd_path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH);
                        spa_vdev_remove_aux(spa->spa_spares.sav_config,
                            ZPOOL_CONFIG_SPARES, spares, nspares, nv);
                        spa_load_spares(spa);
@@ -2031,9 +2181,8 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
            nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
            ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
            (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
-               char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH);
-               spa_history_log_internal(spa, "vdev remove", NULL,
-                   "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_L2CACHE, nvstr);
+               vd_type = VDEV_TYPE_L2CACHE;
+               vd_path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH);
                /*
                 * Cache devices can always be removed.
                 */
@@ -2045,6 +2194,8 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
                spa->spa_l2cache.sav_sync = B_TRUE;
        } else if (vd != NULL && vd->vdev_islog) {
                ASSERT(!locked);
+               vd_type = "log";
+               vd_path = (vd->vdev_path != NULL) ? vd->vdev_path : "-";
                error = spa_vdev_remove_log(vd, &txg);
        } else if (vd != NULL) {
                ASSERT(!locked);
@@ -2059,6 +2210,18 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
        if (!locked)
                error = spa_vdev_exit(spa, NULL, txg, error);
 
+       /*
+        * Logging must be done outside the spa config lock. Otherwise,
+        * this code path could end up holding the spa config lock while
+        * waiting for a txg_sync so it can write to the internal log.
+        * Doing that would prevent the txg sync from actually happening,
+        * causing a deadlock.
+        */
+       if (error == 0 && vd_type != NULL && vd_path != NULL) {
+               spa_history_log_internal(spa, "vdev remove", NULL,
+                   "%s vdev (%s) %s", spa_name(spa), vd_type, vd_path);
+       }
+
        if (ev != NULL)
                spa_event_post(ev);
 
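The deferred-logging pattern above generalizes: record what you intend to log while the lock is held, then emit the log entry only after dropping it, because the logger itself may block on work (here a txg sync) that needs the lock. A user-space sketch with hypothetical names:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t config_lock = PTHREAD_MUTEX_INITIALIZER;

    static void
    remove_and_log(const char *vd_type, const char *vd_path)
    {
        const char *type = NULL, *path = NULL;

        pthread_mutex_lock(&config_lock);
        /* ... perform the removal, only recording what to log ... */
        type = vd_type;
        path = vd_path;
        pthread_mutex_unlock(&config_lock);

        /* Logging may block waiting for a sync; never do it under the lock. */
        if (type != NULL && path != NULL)
            printf("vdev remove (%s) %s\n", type, path);
    }
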
@@ -2079,13 +2242,6 @@ spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
        prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy;
        prs->prs_copied = spa->spa_removing_phys.sr_copied;
 
-       if (spa->spa_vdev_removal != NULL) {
-               for (int i = 0; i < TXG_SIZE; i++) {
-                       prs->prs_copied +=
-                           spa->spa_vdev_removal->svr_bytes_done[i];
-               }
-       }
-
        prs->prs_mapping_memory = 0;
        uint64_t indirect_vdev_id =
            spa->spa_removing_phys.sr_prev_indirect_vdev;
@@ -2103,6 +2259,10 @@ spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
 }
 
 #if defined(_KERNEL)
+module_param(zfs_removal_ignore_errors, int, 0644);
+MODULE_PARM_DESC(zfs_removal_ignore_errors,
+       "Ignore hard IO errors when removing device");
+
 module_param(zfs_remove_max_segment, int, 0644);
 MODULE_PARM_DESC(zfs_remove_max_segment,
        "Largest contiguous segment to allocate when removing device");
@@ -2111,6 +2271,13 @@ module_param(vdev_removal_max_span, int, 0644);
 MODULE_PARM_DESC(vdev_removal_max_span,
        "Largest span of free chunks a remap segment can span");
 
+/* BEGIN CSTYLED */
+module_param(zfs_removal_suspend_progress, int, 0644);
+MODULE_PARM_DESC(zfs_removal_suspend_progress,
+       "Pause device removal after this many bytes are copied "
+       "(debug use only - causes removal to hang)");
+/* END CSTYLED */
+
 EXPORT_SYMBOL(free_from_removing_vdev);
 EXPORT_SYMBOL(spa_removal_get_stats);
 EXPORT_SYMBOL(spa_remove_init);