/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
*/
int vdev_removal_max_span = 32 * 1024;
+/*
+ * This is used by the test suite so that it can ensure that certain
+ * actions happen while in the middle of a removal.
+ */
+unsigned long zfs_remove_max_bytes_pause = -1UL;
+
#define VDEV_REMOVAL_ZAP_OBJS "lzap"
static void spa_vdev_remove_thread(void *arg);
VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
&one, tx));
- ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0);
+ ASSERTV(boolean_t are_precise);
+ ASSERT0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+ ASSERT3B(are_precise, ==, B_TRUE);
}
vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
* be copied.
*/
spa->spa_removing_phys.sr_to_copy -=
- range_tree_space(ms->ms_freeingtree);
+ range_tree_space(ms->ms_freeing);
- ASSERT0(range_tree_space(ms->ms_freedtree));
+ ASSERT0(range_tree_space(ms->ms_freed));
for (int t = 0; t < TXG_SIZE; t++)
- ASSERT0(range_tree_space(ms->ms_alloctree[t]));
+ ASSERT0(range_tree_space(ms->ms_allocating[t]));
}
/*
* and we correctly free already-copied data.
*/
void
-free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
- uint64_t txg)
+free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
{
spa_t *spa = vd->vdev_spa;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t txg = spa_syncing_txg(spa);
uint64_t max_offset_yet = 0;
ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
vdev_indirect_mapping_object(vim));
ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
- ASSERT3U(spa_syncing_txg(spa), ==, txg);
mutex_enter(&svr->svr_lock);
* held, so that the remove_thread can not load this metaslab and then
* visit this offset between the time that we metaslab_free_concrete()
* and when we check to see if it has been visited.
+ *
+ * Note: The checkpoint flag is set to false as having/taking
+ * a checkpoint and removing a device can't happen at the same
+ * time.
*/
- metaslab_free_concrete(vd, offset, size, txg);
+ ASSERT(!spa_has_checkpoint(spa));
+ metaslab_free_concrete(vd, offset, size, B_FALSE);
uint64_t synced_size = 0;
uint64_t synced_offset = 0;
* of this free.
*/
if (synced_size > 0) {
- vdev_indirect_mark_obsolete(vd, synced_offset, synced_size,
- txg);
+ vdev_indirect_mark_obsolete(vd, synced_offset, synced_size);
+
/*
* Note: this can only be called from syncing context,
* and the vdev_indirect_mapping is only changed from the
* sync thread, so we don't need svr_lock while doing
* metaslab_free_impl_cb.
*/
+ boolean_t checkpoint = B_FALSE;
vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
- metaslab_free_impl_cb, &txg);
+ metaslab_free_impl_cb, &checkpoint);
}
}
free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
{
vdev_t *vd = arg;
- vdev_indirect_mark_obsolete(vd, offset, size,
- vd->vdev_spa->spa_syncing_txg);
+ vdev_indirect_mark_obsolete(vd, offset, size);
+ boolean_t checkpoint = B_FALSE;
vdev_indirect_ops.vdev_op_remap(vd, offset, size,
- metaslab_free_impl_cb, &vd->vdev_spa->spa_syncing_txg);
+ metaslab_free_impl_cb, &checkpoint);
}
/*
}
ASSERT3U(size, <=, maxalloc);
- int error = metaslab_alloc_dva(spa, mg->mg_class, size,
- &dst, 0, NULL, txg, 0, zal);
+ /*
+ * An allocation class might not have any remaining vdevs or space
+ */
+ metaslab_class_t *mc = mg->mg_class;
+ if (mc != spa_normal_class(spa) && mc->mc_groups <= 1)
+ mc = spa_normal_class(spa);
+ int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0,
+ zal, 0);
+ if (error == ENOSPC && mc != spa_normal_class(spa)) {
+ error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
+ &dst, 0, NULL, txg, 0, zal, 0);
+ }
if (error != 0)
return (error);
* Assert nothing in flight -- ms_*tree is empty.
*/
for (int i = 0; i < TXG_SIZE; i++) {
- ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+ ASSERT0(range_tree_space(msp->ms_allocating[i]));
}
/*
SM_ALLOC));
space_map_close(sm);
- range_tree_walk(msp->ms_freeingtree,
+ range_tree_walk(msp->ms_freeing,
range_tree_remove, svr->svr_allocd_segs);
/*
msp->ms_id);
while (!svr->svr_thread_exit &&
- range_tree_space(svr->svr_allocd_segs) != 0) {
+ !range_tree_is_empty(svr->svr_allocd_segs)) {
mutex_exit(&svr->svr_lock);
*/
spa_config_exit(spa, SCL_CONFIG, FTAG);
+ /*
+ * This delay will pause the removal around the point
+ * specified by zfs_remove_max_bytes_pause. We do this
+ * solely from the test suite or during debugging.
+ */
+ uint64_t bytes_copied =
+ spa->spa_removing_phys.sr_copied;
+ for (int i = 0; i < TXG_SIZE; i++)
+ bytes_copied += svr->svr_bytes_done[i];
+ while (zfs_remove_max_bytes_pause <= bytes_copied &&
+ !svr->svr_thread_exit)
+ delay(hz);
+
mutex_enter(&vca.vca_lock);
while (vca.vca_outstanding_bytes >
zfs_remove_max_copy_bytes) {
ASSERT3P(svr->svr_thread, ==, NULL);
spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
- if (vdev_obsolete_counts_are_precise(vd)) {
+
+ boolean_t are_precise;
+ VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+ if (are_precise) {
spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx));
}
- if (vdev_obsolete_sm_object(vd) != 0) {
+ uint64_t obsolete_sm_object;
+ VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+ if (obsolete_sm_object != 0) {
ASSERT(vd->vdev_obsolete_sm != NULL);
- ASSERT3U(vdev_obsolete_sm_object(vd), ==,
+ ASSERT3U(obsolete_sm_object, ==,
space_map_object(vd->vdev_obsolete_sm));
space_map_free(vd->vdev_obsolete_sm, tx);
* Assert nothing in flight -- ms_*tree is empty.
*/
for (int i = 0; i < TXG_SIZE; i++)
- ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+ ASSERT0(range_tree_space(msp->ms_allocating[i]));
for (int i = 0; i < TXG_DEFER_SIZE; i++)
- ASSERT0(range_tree_space(msp->ms_defertree[i]));
- ASSERT0(range_tree_space(msp->ms_freedtree));
+ ASSERT0(range_tree_space(msp->ms_defer[i]));
+ ASSERT0(range_tree_space(msp->ms_freed));
if (msp->ms_sm != NULL) {
/*
mutex_enter(&svr->svr_lock);
VERIFY0(space_map_load(msp->ms_sm,
svr->svr_allocd_segs, SM_ALLOC));
- range_tree_walk(msp->ms_freeingtree,
+ range_tree_walk(msp->ms_freeing,
range_tree_remove, svr->svr_allocd_segs);
/*
uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
- spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE);
+ spa_vdev_remove_cancel_sync, NULL, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED);
if (error == 0) {
spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
return (SET_ERROR(ENOTSUP));
+ /* available space in the pool's normal class */
+ uint64_t available = dsl_dir_space_available(
+ spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
+
+ metaslab_class_t *mc = vd->vdev_mg->mg_class;
+
+ /*
+ * When removing a vdev from an allocation class that has
+ * remaining vdevs, include available space from the class.
+ */
+ if (mc != spa_normal_class(spa) && mc->mc_groups > 1) {
+ uint64_t class_avail = metaslab_class_get_space(mc) -
+ metaslab_class_get_alloc(mc);
+
+ /* add class space, adjusted for overhead */
+ available += (class_avail * 94) / 100;
+ }
+
/*
* There has to be enough free space to remove the
* device and leave double the "slop" space (i.e. we
* must leave at least 3% of the pool free, in addition to
* the normal slop space).
*/
- if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir,
- NULL, 0, B_TRUE) <
- vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
+ if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
return (SET_ERROR(ENOSPC));
}
if (!locked)
txg = spa_vdev_enter(spa);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+
+ if (!locked)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ return (error);
+ }
+
vd = spa_lookup_by_guid(spa, guid, B_FALSE);
if (spa->spa_spares.sav_vdevs != NULL &&
return (0);
}
-#if defined(_KERNEL) && defined(HAVE_SPL)
+#if defined(_KERNEL)
module_param(zfs_remove_max_segment, int, 0644);
MODULE_PARM_DESC(zfs_remove_max_segment,
"Largest contiguous segment to allocate when removing device");
MODULE_PARM_DESC(vdev_removal_max_span,
"Largest span of free chunks a remap segment can span");
+/* BEGIN CSTYLED */
+module_param(zfs_remove_max_bytes_pause, ulong, 0644);
+MODULE_PARM_DESC(zfs_remove_max_bytes_pause,
+ "Pause device removal after this many bytes are copied "
+ "(debug use only - causes removal to hang)");
+/* END CSTYLED */
+
EXPORT_SYMBOL(free_from_removing_vdev);
EXPORT_SYMBOL(spa_removal_get_stats);
EXPORT_SYMBOL(spa_remove_init);