Improved error handling for extreme rewinds

diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index 07c556ed059af44e7b88aebb55610fea2e090a06..4e4a6c4f5a250d0554e411860cf3fc7efd853dbd 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -117,6 +117,12 @@ int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
  */
 int vdev_removal_max_span = 32 * 1024;
 
+/*
+ * This is used by the test suite so that it can ensure that certain
+ * actions happen while in the middle of a removal.
+ */
+unsigned long zfs_remove_max_bytes_pause = -1UL;
+
 #define        VDEV_REMOVAL_ZAP_OBJS   "lzap"
 
 static void spa_vdev_remove_thread(void *arg);
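
The default of -1UL is a sentinel: it is the largest value an unsigned
long can hold, so the pause point is unreachable until a test lowers
it. A minimal standalone sketch (not the patch code) of the comparison
semantics, with illustrative names:

    #include <stdint.h>

    /* -1UL wraps to ULONG_MAX, i.e. "never pause" */
    static unsigned long pause_threshold = -1UL;

    static int
    should_pause(uint64_t bytes_copied)
    {
            /* true only after a test lowers the threshold */
            return (pause_threshold <= bytes_copied);
    }
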
@@ -245,7 +251,9 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
                VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
                    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
                    &one, tx));
-               ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0);
+               ASSERTV(boolean_t are_precise);
+               ASSERT0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+               ASSERT3B(are_precise, ==, B_TRUE);
        }
 
        vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
@@ -286,11 +294,11 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
                 * be copied.
                 */
                spa->spa_removing_phys.sr_to_copy -=
-                   range_tree_space(ms->ms_freeingtree);
+                   range_tree_space(ms->ms_freeing);
 
-               ASSERT0(range_tree_space(ms->ms_freedtree));
+               ASSERT0(range_tree_space(ms->ms_freed));
                for (int t = 0; t < TXG_SIZE; t++)
-                       ASSERT0(range_tree_space(ms->ms_alloctree[t]));
+                       ASSERT0(range_tree_space(ms->ms_allocating[t]));
        }
 
        /*
@@ -467,19 +475,18 @@ spa_restart_removal(spa_t *spa)
  * and we correctly free already-copied data.
  */
 void
-free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
-    uint64_t txg)
+free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
 {
        spa_t *spa = vd->vdev_spa;
        spa_vdev_removal_t *svr = spa->spa_vdev_removal;
        vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+       uint64_t txg = spa_syncing_txg(spa);
        uint64_t max_offset_yet = 0;
 
        ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
        ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
            vdev_indirect_mapping_object(vim));
        ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
-       ASSERT3U(spa_syncing_txg(spa), ==, txg);
 
        mutex_enter(&svr->svr_lock);
 
@@ -494,8 +501,13 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
         * held, so that the remove_thread can not load this metaslab and then
         * visit this offset between the time that we metaslab_free_concrete()
         * and when we check to see if it has been visited.
+        *
+        * Note: The checkpoint flag is set to false as having/taking
+        * a checkpoint and removing a device can't happen at the same
+        * time.
         */
-       metaslab_free_concrete(vd, offset, size, txg);
+       ASSERT(!spa_has_checkpoint(spa));
+       metaslab_free_concrete(vd, offset, size, B_FALSE);
 
        uint64_t synced_size = 0;
        uint64_t synced_offset = 0;
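
The B_FALSE argument leans on an invariant worth spelling out: a
checkpoint and a device removal are mutually exclusive, so the free
path may assert that no checkpoint exists and pass checkpoint=false
unconditionally. An illustrative-only toy model in plain C (none of
these names are the ZFS ones):

    #include <assert.h>
    #include <stdbool.h>

    struct pool {
            bool has_checkpoint;
            bool removal_active;
    };

    /* stand-in for metaslab_free_concrete(); the flag selects whether
     * freed space must be kept readable for a checkpoint rewind */
    static void
    free_concrete(struct pool *p, bool checkpoint)
    {
            (void) p;
            (void) checkpoint;
    }

    static void
    free_from_removing(struct pool *p)
    {
            /* a removal in progress implies no checkpoint exists */
            assert(p->removal_active && !p->has_checkpoint);
            free_concrete(p, false);
    }
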
@@ -627,16 +639,17 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
         * of this free.
         */
        if (synced_size > 0) {
-               vdev_indirect_mark_obsolete(vd, synced_offset, synced_size,
-                   txg);
+               vdev_indirect_mark_obsolete(vd, synced_offset, synced_size);
+
                /*
                 * Note: this can only be called from syncing context,
                 * and the vdev_indirect_mapping is only changed from the
                 * sync thread, so we don't need svr_lock while doing
                 * metaslab_free_impl_cb.
                 */
+               boolean_t checkpoint = B_FALSE;
                vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
-                   metaslab_free_impl_cb, &txg);
+                   metaslab_free_impl_cb, &checkpoint);
        }
 }
 
@@ -684,10 +697,10 @@ static void
 free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
 {
        vdev_t *vd = arg;
-       vdev_indirect_mark_obsolete(vd, offset, size,
-           vd->vdev_spa->spa_syncing_txg);
+       vdev_indirect_mark_obsolete(vd, offset, size);
+       boolean_t checkpoint = B_FALSE;
        vdev_indirect_ops.vdev_op_remap(vd, offset, size,
-           metaslab_free_impl_cb, &vd->vdev_spa->spa_syncing_txg);
+           metaslab_free_impl_cb, &checkpoint);
 }
 
 /*
@@ -933,8 +946,18 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
        }
        ASSERT3U(size, <=, maxalloc);
 
-       int error = metaslab_alloc_dva(spa, mg->mg_class, size,
-           &dst, 0, NULL, txg, 0, zal);
+       /*
+        * An allocation class might not have any remaining vdevs or space
+        */
+       metaslab_class_t *mc = mg->mg_class;
+       if (mc != spa_normal_class(spa) && mc->mc_groups <= 1)
+               mc = spa_normal_class(spa);
+       int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0,
+           zal, 0);
+       if (error == ENOSPC && mc != spa_normal_class(spa)) {
+               error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
+                   &dst, 0, NULL, txg, 0, zal, 0);
+       }
        if (error != 0)
                return (error);
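
The two-step allocation is the interesting part of this hunk: segments
being copied off the removing vdev are allocated from the vdev's own
allocation class while that class still has other members, and fall
back to the normal class on ENOSPC. A hedged sketch of the pattern,
where alloc_from(), class_group_count(), and normal_class are
stand-ins for metaslab_alloc_dva() and friends:

    #include <errno.h>
    #include <stdint.h>

    struct mclass;          /* opaque stand-in for metaslab_class_t */
    struct dva;             /* opaque stand-in for dva_t */

    extern struct mclass *normal_class;
    extern int class_group_count(const struct mclass *);
    extern int alloc_from(struct mclass *, uint64_t, struct dva *);

    static int
    alloc_with_fallback(struct mclass *mc, uint64_t size, struct dva *dst)
    {
            /* a class with at most one group can't host its own evacuation */
            if (mc != normal_class && class_group_count(mc) <= 1)
                    mc = normal_class;

            int error = alloc_from(mc, size, dst);
            if (error == ENOSPC && mc != normal_class)
                    error = alloc_from(normal_class, size, dst);
            return (error);
    }
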
 
@@ -1363,7 +1386,7 @@ spa_vdev_remove_thread(void *arg)
                 * Assert nothing in flight -- ms_*tree is empty.
                 */
                for (int i = 0; i < TXG_SIZE; i++) {
-                       ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+                       ASSERT0(range_tree_space(msp->ms_allocating[i]));
                }
 
                /*
@@ -1393,7 +1416,7 @@ spa_vdev_remove_thread(void *arg)
                            SM_ALLOC));
                        space_map_close(sm);
 
-                       range_tree_walk(msp->ms_freeingtree,
+                       range_tree_walk(msp->ms_freeing,
                            range_tree_remove, svr->svr_allocd_segs);
 
                        /*
@@ -1412,7 +1435,7 @@ spa_vdev_remove_thread(void *arg)
                    msp->ms_id);
 
                while (!svr->svr_thread_exit &&
-                   range_tree_space(svr->svr_allocd_segs) != 0) {
+                   !range_tree_is_empty(svr->svr_allocd_segs)) {
 
                        mutex_exit(&svr->svr_lock);
 
@@ -1427,6 +1450,19 @@ spa_vdev_remove_thread(void *arg)
                         */
                        spa_config_exit(spa, SCL_CONFIG, FTAG);
 
+                       /*
+                        * This delay will pause the removal around the point
+                        * specified by zfs_remove_max_bytes_pause. We do this
+                        * solely from the test suite or during debugging.
+                        */
+                       uint64_t bytes_copied =
+                           spa->spa_removing_phys.sr_copied;
+                       for (int i = 0; i < TXG_SIZE; i++)
+                               bytes_copied += svr->svr_bytes_done[i];
+                       while (zfs_remove_max_bytes_pause <= bytes_copied &&
+                           !svr->svr_thread_exit)
+                               delay(hz);
+
                        mutex_enter(&vca.vca_lock);
                        while (vca.vca_outstanding_bytes >
                            zfs_remove_max_copy_bytes) {
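
Note how the pause check computes its progress figure: bytes already
synced to disk (sr_copied) plus bytes copied in each still-open txg
(svr_bytes_done[]). A standalone sketch of that accounting, assuming
the usual four open-txg slots:

    #include <stdint.h>

    #define TXG_SIZE        4       /* matches the ZFS constant */

    struct removal_progress {
            uint64_t synced_copied;         /* like sr_copied */
            uint64_t open_copied[TXG_SIZE]; /* like svr_bytes_done[] */
    };

    static uint64_t
    total_bytes_copied(const struct removal_progress *rp)
    {
            uint64_t copied = rp->synced_copied;

            for (int i = 0; i < TXG_SIZE; i++)
                    copied += rp->open_copied[i];
            return (copied);
    }
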
@@ -1529,15 +1565,20 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
        ASSERT3P(svr->svr_thread, ==, NULL);
 
        spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
-       if (vdev_obsolete_counts_are_precise(vd)) {
+
+       boolean_t are_precise;
+       VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+       if (are_precise) {
                spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
                VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
                    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx));
        }
 
-       if (vdev_obsolete_sm_object(vd) != 0) {
+       uint64_t obsolete_sm_object;
+       VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+       if (obsolete_sm_object != 0) {
                ASSERT(vd->vdev_obsolete_sm != NULL);
-               ASSERT3U(vdev_obsolete_sm_object(vd), ==,
+               ASSERT3U(obsolete_sm_object, ==,
                    space_map_object(vd->vdev_obsolete_sm));
 
                space_map_free(vd->vdev_obsolete_sm, tx);
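
This hunk carries the commit's error-handling theme: accessors such as
vdev_obsolete_counts_are_precise() and vdev_obsolete_sm_object() now
return an error code and deliver the answer through an out parameter,
so callers VERIFY0() the lookup instead of conflating "failed to read"
with "object is 0". A minimal model of the calling convention (not the
ZFS functions themselves):

    #include <errno.h>
    #include <stddef.h>
    #include <stdint.h>

    static int
    lookup_value(const uint64_t *stored, uint64_t *out)
    {
            if (stored == NULL)
                    return (ENOENT);  /* distinct from a stored value of 0 */
            *out = *stored;
            return (0);
    }
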
@@ -1567,10 +1608,10 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
                 * Assert nothing in flight -- ms_*tree is empty.
                 */
                for (int i = 0; i < TXG_SIZE; i++)
-                       ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+                       ASSERT0(range_tree_space(msp->ms_allocating[i]));
                for (int i = 0; i < TXG_DEFER_SIZE; i++)
-                       ASSERT0(range_tree_space(msp->ms_defertree[i]));
-               ASSERT0(range_tree_space(msp->ms_freedtree));
+                       ASSERT0(range_tree_space(msp->ms_defer[i]));
+               ASSERT0(range_tree_space(msp->ms_freed));
 
                if (msp->ms_sm != NULL) {
                        /*
@@ -1586,7 +1627,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
                        mutex_enter(&svr->svr_lock);
                        VERIFY0(space_map_load(msp->ms_sm,
                            svr->svr_allocd_segs, SM_ALLOC));
-                       range_tree_walk(msp->ms_freeingtree,
+                       range_tree_walk(msp->ms_freeing,
                            range_tree_remove, svr->svr_allocd_segs);
 
                        /*
@@ -1662,7 +1703,8 @@ spa_vdev_remove_cancel(spa_t *spa)
        uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
 
        int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
-           spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE);
+           spa_vdev_remove_cancel_sync, NULL, 0,
+           ZFS_SPACE_CHECK_EXTRA_RESERVED);
 
        if (error == 0) {
                spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
@@ -1828,15 +1870,31 @@ spa_vdev_remove_top_check(vdev_t *vd)
        if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
                return (SET_ERROR(ENOTSUP));
 
+       /* available space in the pool's normal class */
+       uint64_t available = dsl_dir_space_available(
+           spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
+
+       metaslab_class_t *mc = vd->vdev_mg->mg_class;
+
+       /*
+        * When removing a vdev from an allocation class that has
+        * remaining vdevs, include available space from the class.
+        */
+       if (mc != spa_normal_class(spa) && mc->mc_groups > 1) {
+               uint64_t class_avail = metaslab_class_get_space(mc) -
+                   metaslab_class_get_alloc(mc);
+
+               /* add class space, adjusted for overhead */
+               available += (class_avail * 94) / 100;
+       }
+
        /*
         * There has to be enough free space to remove the
         * device and leave double the "slop" space (i.e. we
         * must leave at least 3% of the pool free, in addition to
         * the normal slop space).
         */
-       if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir,
-           NULL, 0, B_TRUE) <
-           vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
+       if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
                return (SET_ERROR(ENOSPC));
        }
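
A condensed sketch of the capacity test above, including the 94%
haircut that approximates allocation-class overhead (names and the
parameter split are illustrative only):

    #include <stdint.h>

    static int
    removal_space_ok(uint64_t normal_avail, uint64_t class_avail,
        int class_has_other_vdevs, uint64_t vdev_used, uint64_t slop)
    {
            uint64_t available = normal_avail;

            /* count a surviving class's free space, less overhead */
            if (class_has_other_vdevs)
                    available += (class_avail * 94) / 100;

            /* must cover the evacuated data plus the extra slop reserve */
            return (available >= vdev_used + slop);
    }
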
 
@@ -1999,6 +2057,17 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
        if (!locked)
                txg = spa_vdev_enter(spa);
 
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+               error = (spa_has_checkpoint(spa)) ?
+                   ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+
+               if (!locked)
+                       return (spa_vdev_exit(spa, NULL, txg, error));
+
+               return (error);
+       }
+
        vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
        if (spa->spa_spares.sav_vdevs != NULL &&
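
The early return above distinguishes two refusal reasons: a live
checkpoint versus one still being discarded. A toy model of the gate,
with placeholder error values standing in for ZFS_ERR_CHECKPOINT_EXISTS
and ZFS_ERR_DISCARDING_CHECKPOINT:

    #include <stdbool.h>

    enum {
            ERR_CHECKPOINT_EXISTS = 1,      /* placeholder values */
            ERR_DISCARDING_CHECKPOINT = 2
    };

    static int
    removal_checkpoint_gate(bool feature_active, bool has_checkpoint)
    {
            if (!feature_active)
                    return (0);     /* no checkpoint state at all */
            /* feature active without a checkpoint: a discard is running */
            return (has_checkpoint ?
                ERR_CHECKPOINT_EXISTS : ERR_DISCARDING_CHECKPOINT);
    }
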
@@ -2102,7 +2171,7 @@ spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
        return (0);
 }
 
-#if defined(_KERNEL) && defined(HAVE_SPL)
+#if defined(_KERNEL)
 module_param(zfs_remove_max_segment, int, 0644);
 MODULE_PARM_DESC(zfs_remove_max_segment,
        "Largest contiguous segment to allocate when removing device");
@@ -2111,6 +2180,13 @@ module_param(vdev_removal_max_span, int, 0644);
 MODULE_PARM_DESC(vdev_removal_max_span,
        "Largest span of free chunks a remap segment can span");
 
+/* BEGIN CSTYLED */
+module_param(zfs_remove_max_bytes_pause, ulong, 0644);
+MODULE_PARM_DESC(zfs_remove_max_bytes_pause,
+       "Pause device removal after this many bytes are copied "
+       "(debug use only - causes removal to hang)");
+/* END CSTYLED */
+
 EXPORT_SYMBOL(free_from_removing_vdev);
 EXPORT_SYMBOL(spa_removal_get_stats);
 EXPORT_SYMBOL(spa_remove_init);