Detect IO errors during device removal
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index 4e4a6c4f5a250d0554e411860cf3fc7efd853dbd..5952a5d8fc68fa9fe42cb4a81169215f1f94e2df 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -80,6 +80,8 @@
 typedef struct vdev_copy_arg {
        metaslab_t      *vca_msp;
        uint64_t        vca_outstanding_bytes;
+       uint64_t        vca_read_error_bytes;
+       uint64_t        vca_write_error_bytes;
        kcondvar_t      vca_cv;
        kmutex_t        vca_lock;
 } vdev_copy_arg_t;
@@ -99,6 +101,14 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
  */
 int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
 
+/*
+ * Ignore hard IO errors during device removal.  When set, if a device
+ * encounters a hard IO error during the removal process, the removal
+ * will not be cancelled.  This can result in a normally recoverable
+ * block becoming permanently damaged and is not recommended.
+ */
+int zfs_removal_ignore_errors = 0;
+
 /*
  * Allow a remap segment to span free chunks of at most this size. The main
  * impact of a larger span is that we will read and write larger, more
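Since zfs_removal_ignore_errors is exposed as a module parameter (see the module_param() block at the end of this diff), it can be flipped at runtime from userspace; leaving it at 0, the default, keeps the new cancel-on-error behavior. A minimal sketch, assuming a Linux host with the zfs module loaded and the standard /sys/module/<module>/parameters layout:

```c
#include <stdio.h>

int
main(void)
{
	const char *path =
	    "/sys/module/zfs/parameters/zfs_removal_ignore_errors";
	FILE *fp = fopen(path, "w");

	if (fp == NULL) {
		perror("fopen");
		return (1);
	}
	/* "1" ignores hard IO errors during removal; "0" (default) cancels. */
	fputs("1", fp);
	fclose(fp);
	return (0);
}
```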
@@ -121,11 +131,12 @@ int vdev_removal_max_span = 32 * 1024;
  * This is used by the test suite so that it can ensure that certain
  * actions happen while in the middle of a removal.
  */
-unsigned long zfs_remove_max_bytes_pause = -1UL;
+int zfs_removal_suspend_progress = 0;
 
 #define        VDEV_REMOVAL_ZAP_OBJS   "lzap"
 
 static void spa_vdev_remove_thread(void *arg);
+static int spa_vdev_remove_cancel_impl(spa_t *spa);
 
 static void
 spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
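Note the semantic change in this hunk: the old zfs_remove_max_bytes_pause was a byte-count threshold (pause once at least that many bytes had been copied), while zfs_removal_suspend_progress is a plain boolean that holds the removal wherever it currently is. A standalone sketch of the resulting pause loop, using POSIX primitives rather than the kernel's delay(hz):

```c
#include <unistd.h>
#include <stdatomic.h>

static atomic_int suspend_progress;	/* toggled by a test harness */
static atomic_int thread_exit;

static void
removal_pause_point(void)
{
	/* Sleep in one-second slices until resumed or told to exit. */
	while (atomic_load(&suspend_progress) &&
	    !atomic_load(&thread_exit))
		sleep(1);
}
```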
@@ -672,7 +683,7 @@ spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
                vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
                vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 
-               if (srp->sr_prev_indirect_vdev != UINT64_MAX) {
+               if (srp->sr_prev_indirect_vdev != -1) {
                        vdev_t *pvd;
                        pvd = vdev_lookup_top(spa,
                            srp->sr_prev_indirect_vdev);
@@ -802,6 +813,10 @@ spa_vdev_copy_segment_write_done(zio_t *zio)
 
        mutex_enter(&vca->vca_lock);
        vca->vca_outstanding_bytes -= zio->io_size;
+
+       if (zio->io_error != 0)
+               vca->vca_write_error_bytes += zio->io_size;
+
        cv_signal(&vca->vca_cv);
        mutex_exit(&vca->vca_lock);
 }
@@ -813,6 +828,14 @@ spa_vdev_copy_segment_write_done(zio_t *zio)
 static void
 spa_vdev_copy_segment_read_done(zio_t *zio)
 {
+       vdev_copy_arg_t *vca = zio->io_private;
+
+       if (zio->io_error != 0) {
+               mutex_enter(&vca->vca_lock);
+               vca->vca_read_error_bytes += zio->io_size;
+               mutex_exit(&vca->vca_lock);
+       }
+
        zio_nowait(zio_unique_parent(zio));
 }
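Both completion callbacks follow the same pattern: they run in IO completion context and fold the size of any failed zio into a per-removal counter under vca_lock, where the copy thread can observe it later. A minimal POSIX-threads sketch of the pattern (names loosely mirror vdev_copy_arg_t; this is an illustration, not the kernel code):

```c
#include <pthread.h>
#include <stdint.h>

typedef struct copy_arg {
	uint64_t	ca_outstanding_bytes;
	uint64_t	ca_read_error_bytes;
	uint64_t	ca_write_error_bytes;
	pthread_mutex_t	ca_lock;
	pthread_cond_t	ca_cv;
} copy_arg_t;

/* Runs in IO completion context with the result of one write. */
static void
copy_write_done(copy_arg_t *ca, uint64_t size, int error)
{
	pthread_mutex_lock(&ca->ca_lock);
	ca->ca_outstanding_bytes -= size;
	if (error != 0)
		ca->ca_write_error_bytes += size;
	pthread_cond_signal(&ca->ca_cv);	/* wake the copy thread */
	pthread_mutex_unlock(&ca->ca_lock);
}
```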
 
@@ -866,25 +889,45 @@ spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
 {
        ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
 
+       /*
+        * If the destination child is unwritable then there is no point
+        * in issuing the source reads which cannot be written.
+        */
+       if (!vdev_writeable(dest_child_vd))
+               return;
+
        mutex_enter(&vca->vca_lock);
        vca->vca_outstanding_bytes += size;
        mutex_exit(&vca->vca_lock);
 
        abd_t *abd = abd_alloc_for_io(size, B_FALSE);
 
-       vdev_t *source_child_vd;
+       vdev_t *source_child_vd = NULL;
        if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
                /*
                 * Source and dest are both mirrors.  Copy from the same
                 * child id as we are copying to (wrapping around if there
-                * are more dest children than source children).
+                * are more dest children than source children).  If the
+                * preferred source child is unreadable, select another.
                 */
-               source_child_vd =
-                   source_vd->vdev_child[dest_id % source_vd->vdev_children];
+               for (int i = 0; i < source_vd->vdev_children; i++) {
+                       source_child_vd = source_vd->vdev_child[
+                           (dest_id + i) % source_vd->vdev_children];
+                       if (vdev_readable(source_child_vd))
+                               break;
+               }
        } else {
                source_child_vd = source_vd;
        }
 
+       /*
+        * There should always be at least one readable source child, or
+        * the pool would be in a suspended state.  If an unreadable child
+        * were somehow selected it would result in IO errors, the removal
+        * being cancelled, and the pool reverting to its pre-removal state.
+        */
+       ASSERT3P(source_child_vd, !=, NULL);
+
        zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
            dest_child_vd, dest_offset, abd, size,
            ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
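The loop above replaces a single modulo pick with a bounded probe: start at the child index that matches the destination child and wrap around until a readable child is found, falling back to the last candidate if none qualify. The same logic in a self-contained form (is_readable() is a stand-in for vdev_readable(); the types are simplified):

```c
#include <stddef.h>

struct child { int readable; };

/* Stand-in for vdev_readable(). */
static int
is_readable(const struct child *c)
{
	return (c->readable);
}

static const struct child *
pick_source_child(const struct child *children, size_t nchildren,
    size_t preferred)
{
	const struct child *picked = NULL;

	for (size_t i = 0; i < nchildren; i++) {
		picked = &children[(preferred + i) % nchildren];
		if (is_readable(picked))
			break;
	}
	/* If nothing is readable, the last probed child is returned. */
	return (picked);
}
```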
@@ -1115,19 +1158,16 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
 
        ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
 
-       tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
-       dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr,
-           0, ZFS_SPACE_CHECK_NONE, tx);
-       dmu_tx_commit(tx);
-
-       /*
-        * Indicate that this thread has exited.
-        * After this, we can not use svr.
-        */
        mutex_enter(&svr->svr_lock);
        svr->svr_thread = NULL;
        cv_broadcast(&svr->svr_cv);
        mutex_exit(&svr->svr_lock);
+
+       /* After this, we cannot use svr. */
+       tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+       dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr,
+           0, ZFS_SPACE_CHECK_NONE, tx);
+       dmu_tx_commit(tx);
 }
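The reordering here matters: the thread now marks itself exited and wakes any waiters before dispatching the sync task, because vdev_remove_complete_sync() ends up freeing svr, and the old ordering touched svr after the handoff. A reduced POSIX sketch of the "announce exit, then hand off" shape (schedule_cleanup() is a hypothetical stand-in for the sync-task dispatch):

```c
#include <pthread.h>

struct removal_state {
	pthread_mutex_t	lock;
	pthread_cond_t	cv;
	int		thread_done;	/* svr_thread analogue */
};

/*
 * Hypothetical stand-in for the dsl_sync_task_nowait() dispatch;
 * the scheduled work may free rs.
 */
static void
schedule_cleanup(struct removal_state *rs)
{
	(void) rs;
}

static void
worker_finish(struct removal_state *rs)
{
	pthread_mutex_lock(&rs->lock);
	rs->thread_done = 1;			/* 1: mark ourselves exited */
	pthread_cond_broadcast(&rs->cv);	/* 2: wake waiters */
	pthread_mutex_unlock(&rs->lock);

	/* 3: only now hand off; rs must not be touched afterwards. */
	schedule_cleanup(rs);
}
```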
 
 /*
@@ -1364,6 +1404,8 @@ spa_vdev_remove_thread(void *arg)
        mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
        vca.vca_outstanding_bytes = 0;
+       vca.vca_read_error_bytes = 0;
+       vca.vca_write_error_bytes = 0;
 
        mutex_enter(&svr->svr_lock);
 
@@ -1452,14 +1494,14 @@ spa_vdev_remove_thread(void *arg)
 
                        /*
-                        * This delay will pause the removal around the point
-                        * specified by zfs_remove_max_bytes_pause. We do this
+                        * This delay will pause the removal whenever
+                        * zfs_removal_suspend_progress is set. We do this
                         * solely from the test suite or during debugging.
                         */
                        uint64_t bytes_copied =
                            spa->spa_removing_phys.sr_copied;
                        for (int i = 0; i < TXG_SIZE; i++)
                                bytes_copied += svr->svr_bytes_done[i];
-                       while (zfs_remove_max_bytes_pause <= bytes_copied &&
+                       while (zfs_removal_suspend_progress &&
                            !svr->svr_thread_exit)
                                delay(hz);
 
@@ -1493,6 +1535,14 @@ spa_vdev_remove_thread(void *arg)
                        dmu_tx_commit(tx);
                        mutex_enter(&svr->svr_lock);
                }
+
+               mutex_enter(&vca.vca_lock);
+               if (zfs_removal_ignore_errors == 0 &&
+                   (vca.vca_read_error_bytes > 0 ||
+                   vca.vca_write_error_bytes > 0)) {
+                       svr->svr_thread_exit = B_TRUE;
+               }
+               mutex_exit(&vca.vca_lock);
        }
 
        mutex_exit(&svr->svr_lock);
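Each pass of the copy loop now samples the error counters under vca_lock and asks the thread to exit if anything failed while errors are not being ignored. Continuing the copy_arg_t sketch from earlier, the check reduces to:

```c
static int ignore_errors;	/* analogue of zfs_removal_ignore_errors */

static int
should_cancel(copy_arg_t *ca)
{
	int cancel;

	pthread_mutex_lock(&ca->ca_lock);
	cancel = (ignore_errors == 0 &&
	    (ca->ca_read_error_bytes > 0 ||
	    ca->ca_write_error_bytes > 0));
	pthread_mutex_unlock(&ca->ca_lock);
	return (cancel);
}
```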
@@ -1514,6 +1564,21 @@ spa_vdev_remove_thread(void *arg)
                svr->svr_thread = NULL;
                cv_broadcast(&svr->svr_cv);
                mutex_exit(&svr->svr_lock);
+
+               /*
+                * During the removal process an unrecoverable read or write
+                * error was encountered.  The removal process must be
+                * cancelled or this damage may become permanent.
+                */
+               if (zfs_removal_ignore_errors == 0 &&
+                   (vca.vca_read_error_bytes > 0 ||
+                   vca.vca_write_error_bytes > 0)) {
+                       zfs_dbgmsg("canceling removal due to IO errors: "
+                           "[read_error_bytes=%llu] [write_error_bytes=%llu]",
+                           vca.vca_read_error_bytes,
+                           vca.vca_write_error_bytes);
+                       spa_vdev_remove_cancel_impl(spa);
+               }
        } else {
                ASSERT0(range_tree_space(svr->svr_allocd_segs));
                vdev_remove_complete(spa);
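One portability aside on the zfs_dbgmsg() call above (an editorial note, not part of the change): uint64_t is not guaranteed to be unsigned long long, so %llu can provoke format warnings on platforms where it is unsigned long; the common fix is an explicit cast, as in this small example:

```c
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t read_error_bytes = 512;
	uint64_t write_error_bytes = 0;

	/* Explicit casts keep %llu correct even where uint64_t is
	 * unsigned long. */
	printf("canceling removal due to IO errors: "
	    "[read_error_bytes=%llu] [write_error_bytes=%llu]\n",
	    (unsigned long long)read_error_bytes,
	    (unsigned long long)write_error_bytes);
	return (0);
}
```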
@@ -1692,14 +1757,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
            vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-");
 }
 
-int
-spa_vdev_remove_cancel(spa_t *spa)
+static int
+spa_vdev_remove_cancel_impl(spa_t *spa)
 {
-       spa_vdev_remove_suspend(spa);
-
-       if (spa->spa_vdev_removal == NULL)
-               return (ENOTACTIVE);
-
        uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
 
        int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
@@ -1716,6 +1776,17 @@ spa_vdev_remove_cancel(spa_t *spa)
        return (error);
 }
 
+int
+spa_vdev_remove_cancel(spa_t *spa)
+{
+       spa_vdev_remove_suspend(spa);
+
+       if (spa->spa_vdev_removal == NULL)
+               return (ENOTACTIVE);
+
+       return (spa_vdev_remove_cancel_impl(spa));
+}
+
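This refactor splits the public entry point from the sync-task dispatch so the removal thread can cancel without calling spa_vdev_remove_suspend() on itself; only external callers need the suspend-and-validate preamble. A skeleton of the shape (all names here are simplified stand-ins):

```c
struct spa;	/* opaque stand-in for spa_t */

static void remove_suspend(struct spa *spa) { (void) spa; }
static int has_active_removal(struct spa *spa) { (void) spa; return (1); }

/* Internal path: dispatch the cancel sync task (elided). */
static int
remove_cancel_impl(struct spa *spa)
{
	(void) spa;
	return (0);
}

/* Public path: suspend, validate, then defer to the impl. */
int
remove_cancel(struct spa *spa)
{
	remove_suspend(spa);
	if (!has_active_removal(spa))
		return (-1);	/* stands in for ENOTACTIVE */
	return (remove_cancel_impl(spa));
}
```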
 /*
  * Called every sync pass of every txg if there's a svr.
  */
@@ -2148,13 +2219,6 @@ spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
        prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy;
        prs->prs_copied = spa->spa_removing_phys.sr_copied;
 
-       if (spa->spa_vdev_removal != NULL) {
-               for (int i = 0; i < TXG_SIZE; i++) {
-                       prs->prs_copied +=
-                           spa->spa_vdev_removal->svr_bytes_done[i];
-               }
-       }
-
        prs->prs_mapping_memory = 0;
        uint64_t indirect_vdev_id =
            spa->spa_removing_phys.sr_prev_indirect_vdev;
@@ -2172,6 +2236,10 @@ spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
 }
 
 #if defined(_KERNEL)
+module_param(zfs_removal_ignore_errors, int, 0644);
+MODULE_PARM_DESC(zfs_removal_ignore_errors,
+       "Ignore hard IO errors when removing device");
+
 module_param(zfs_remove_max_segment, int, 0644);
 MODULE_PARM_DESC(zfs_remove_max_segment,
        "Largest contiguous segment to allocate when removing device");
@@ -2181,8 +2249,8 @@ MODULE_PARM_DESC(vdev_removal_max_span,
        "Largest span of free chunks a remap segment can span");
 
 /* BEGIN CSTYLED */
-module_param(zfs_remove_max_bytes_pause, ulong, 0644);
-MODULE_PARM_DESC(zfs_remove_max_bytes_pause,
+module_param(zfs_removal_suspend_progress, int, 0644);
+MODULE_PARM_DESC(zfs_removal_suspend_progress,
        "Pause device removal after this many bytes are copied "
        "(debug use only - causes removal to hang)");
 /* END CSTYLED */
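All of these parameters use mode 0644, so on Linux they are readable by any user and writable by root under /sys/module/zfs/parameters/, or settable at load time via /etc/modprobe.d. A small sketch that reads one back, assuming that standard sysfs layout:

```c
#include <stdio.h>

int
main(void)
{
	FILE *fp = fopen(
	    "/sys/module/zfs/parameters/zfs_remove_max_segment", "r");
	int val;

	if (fp == NULL)
		return (1);
	if (fscanf(fp, "%d", &val) == 1)
		printf("zfs_remove_max_segment = %d\n", val);
	fclose(fp);
	return (0);
}
```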