/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
typedef struct vdev_copy_arg {
metaslab_t *vca_msp;
uint64_t vca_outstanding_bytes;
+ uint64_t vca_read_error_bytes; /* total size of reads that failed */
+ uint64_t vca_write_error_bytes; /* total size of writes that failed */
kcondvar_t vca_cv;
kmutex_t vca_lock;
} vdev_copy_arg_t;
-typedef struct vdev_copy_seg_arg {
- vdev_copy_arg_t *vcsa_copy_arg;
- uint64_t vcsa_txg;
- dva_t *vcsa_dest_dva;
- blkptr_t *vcsa_dest_bp;
-} vdev_copy_seg_arg_t;
-
/*
- * The maximum amount of allowed data we're allowed to copy from a device
- * at a time when removing it.
+ * The maximum amount of memory we can use for outstanding i/o while
+ * doing a device removal. This determines how much i/o we can have
+ * in flight concurrently.
*/
-int zfs_remove_max_copy_bytes = 8 * 1024 * 1024;
+int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
/*
* The largest contiguous segment that we will attempt to allocate when
*/
int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
+/*
+ * Ignore hard IO errors during device removal. When set, if a device
+ * encounters a hard IO error during the removal process, the removal will
+ * not be cancelled. This can result in a normally recoverable block
+ * becoming permanently damaged and is not recommended.
+ */
+int zfs_removal_ignore_errors = 0;
+
+/*
+ * Allow a remap segment to span free chunks of at most this size. The main
+ * impact of a larger span is that we will read and write larger, more
+ * contiguous chunks, with more "unnecessary" data -- trading off bandwidth
+ * for iops. The value here was chosen to align with
+ * zfs_vdev_read_gap_limit, which is a similar concept when doing regular
+ * reads (but there's no reason it has to be the same).
+ *
+ * Additionally, a higher span will have the following relatively minor
+ * effects:
+ * - the mapping will be smaller, since one entry can cover more allocated
+ * segments
+ * - more of the fragmentation in the removing device will be preserved
+ * - we'll do larger allocations, which may fail and fall back on smaller
+ * allocations
+ */
+int vdev_removal_max_span = 32 * 1024;
+
+/*
+ * This is used by the test suite so that it can ensure that certain
+ * actions happen while in the middle of a removal.
+ */
+int zfs_removal_suspend_progress = 0;
+
#define VDEV_REMOVAL_ZAP_OBJS "lzap"
static void spa_vdev_remove_thread(void *arg);
+static int spa_vdev_remove_cancel_impl(spa_t *spa);
static void
spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
svr->svr_allocd_segs = range_tree_create(NULL, NULL);
- svr->svr_vdev = vd;
+ svr->svr_vdev_id = vd->vdev_id;
for (int i = 0; i < TXG_SIZE; i++) {
svr->svr_frees[i] = range_tree_create(NULL, NULL);
static void
vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
{
- vdev_t *vd = arg;
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
- spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
spa_vdev_removal_t *svr = NULL;
ASSERTV(uint64_t txg = dmu_tx_get_txg(tx));
VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
&one, tx));
- ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0);
+ ASSERTV(boolean_t are_precise);
+ ASSERT0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+ ASSERT3B(are_precise, ==, B_TRUE);
}
vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
* be copied.
*/
spa->spa_removing_phys.sr_to_copy -=
- range_tree_space(ms->ms_freeingtree);
+ range_tree_space(ms->ms_freeing);
- ASSERT0(range_tree_space(ms->ms_freedtree));
+ ASSERT0(range_tree_space(ms->ms_freed));
for (int t = 0; t < TXG_SIZE; t++)
- ASSERT0(range_tree_space(ms->ms_alloctree[t]));
+ ASSERT0(range_tree_space(ms->ms_allocating[t]));
}
/*
ASSERT3P(spa->spa_vdev_removal, ==, NULL);
spa->spa_vdev_removal = svr;
svr->svr_thread = thread_create(NULL, 0,
- spa_vdev_remove_thread, vd, 0, &p0, TS_RUN, minclsyspri);
+ spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri);
}
/*
spa->spa_removing_phys.sr_state = DSS_NONE;
spa->spa_removing_phys.sr_removing_vdev = -1;
spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
+ spa->spa_indirect_vdevs_loaded = B_TRUE;
return (0);
} else if (error != 0) {
return (error);
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
vdev_t *vd = vdev_lookup_top(spa,
spa->spa_removing_phys.sr_removing_vdev);
- spa_config_exit(spa, SCL_STATE, FTAG);
- if (vd == NULL)
+ if (vd == NULL) {
+ spa_config_exit(spa, SCL_STATE, FTAG);
return (EINVAL);
+ }
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
ASSERT(vdev_is_concrete(vd));
spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
- ASSERT(svr->svr_vdev->vdev_removing);
+ ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id);
+ ASSERT(vd->vdev_removing);
vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
spa->spa_meta_objset, vic->vic_mapping_object);
vd->vdev_indirect_births = vdev_indirect_births_open(
spa->spa_meta_objset, vic->vic_births_object);
+ spa_config_exit(spa, SCL_STATE, FTAG);
spa->spa_vdev_removal = svr;
}
if (!spa_writeable(spa))
return;
- vdev_t *vd = svr->svr_vdev;
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
-
- ASSERT3P(vd, !=, NULL);
- ASSERT(vd->vdev_removing);
-
- zfs_dbgmsg("restarting removal of %llu at count=%llu",
- vd->vdev_id, vdev_indirect_mapping_num_entries(vim));
- svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, vd,
+ zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id);
+ svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa,
0, &p0, TS_RUN, minclsyspri);
}
* and we correctly free already-copied data.
*/
void
-free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
- uint64_t txg)
+free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
{
spa_t *spa = vd->vdev_spa;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t txg = spa_syncing_txg(spa);
uint64_t max_offset_yet = 0;
ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
vdev_indirect_mapping_object(vim));
- ASSERT3P(vd, ==, svr->svr_vdev);
- ASSERT3U(spa_syncing_txg(spa), ==, txg);
+ ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
mutex_enter(&svr->svr_lock);
* held, so that the remove_thread can not load this metaslab and then
* visit this offset between the time that we metaslab_free_concrete()
* and when we check to see if it has been visited.
+ *
+ * Note: The checkpoint flag is set to false as having/taking
+ * a checkpoint and removing a device can't happen at the same
+ * time.
*/
- metaslab_free_concrete(vd, offset, size, txg);
+ ASSERT(!spa_has_checkpoint(spa));
+ metaslab_free_concrete(vd, offset, size, B_FALSE);
uint64_t synced_size = 0;
uint64_t synced_offset = 0;
* of this free.
*/
if (synced_size > 0) {
- vdev_indirect_mark_obsolete(vd, synced_offset, synced_size,
- txg);
+ vdev_indirect_mark_obsolete(vd, synced_offset, synced_size);
+
/*
* Note: this can only be called from syncing context,
* and the vdev_indirect_mapping is only changed from the
* sync thread, so we don't need svr_lock while doing
* metaslab_free_impl_cb.
*/
+ boolean_t checkpoint = B_FALSE;
vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
- metaslab_free_impl_cb, &txg);
+ metaslab_free_impl_cb, &checkpoint);
}
}
if (state == DSS_FINISHED) {
spa_removing_phys_t *srp = &spa->spa_removing_phys;
- vdev_t *vd = svr->svr_vdev;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
- if (srp->sr_prev_indirect_vdev != UINT64_MAX) {
+ if (srp->sr_prev_indirect_vdev != -1) {
vdev_t *pvd;
pvd = vdev_lookup_top(spa,
srp->sr_prev_indirect_vdev);
free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
{
vdev_t *vd = arg;
- vdev_indirect_mark_obsolete(vd, offset, size,
- vd->vdev_spa->spa_syncing_txg);
+ vdev_indirect_mark_obsolete(vd, offset, size);
+ boolean_t checkpoint = B_FALSE;
vdev_indirect_ops.vdev_op_remap(vd, offset, size,
- metaslab_free_impl_cb, &vd->vdev_spa->spa_syncing_txg);
+ metaslab_free_impl_cb, &checkpoint);
}
/*
{
spa_vdev_removal_t *svr = arg;
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- vdev_t *vd = svr->svr_vdev;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
ASSERTV(vdev_indirect_config_t *vic = &vd->vdev_indirect_config);
uint64_t txg = dmu_tx_get_txg(tx);
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
spa_sync_removing_state(spa, tx);
}
+typedef struct vdev_copy_segment_arg { /* state for one copied segment */
+ spa_t *vcsa_spa; /* pool handed to zio_free() */
+ dva_t *vcsa_dest_dva; /* destination DVA of the copy */
+ uint64_t vcsa_txg; /* txg handed to zio_free() */
+ range_tree_t *vcsa_obsolete_segs; /* unneeded ranges, segment-relative */
+} vdev_copy_segment_arg_t;
+
+static void
+unalloc_seg(void *arg, uint64_t start, uint64_t size)
+{ /* range_tree_vacate() callback: free one unneeded range of the copy */
+ vdev_copy_segment_arg_t *vcsa = arg;
+ spa_t *spa = vcsa->vcsa_spa;
+ blkptr_t bp = { { { {0} } } }; /* synthetic bp describing the range */
+
+ BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL);
+ BP_SET_LSIZE(&bp, size);
+ BP_SET_PSIZE(&bp, size);
+ BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF);
+ BP_SET_TYPE(&bp, DMU_OT_NONE);
+ BP_SET_LEVEL(&bp, 0);
+ BP_SET_DEDUP(&bp, 0);
+ BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
+
+ DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva));
+ DVA_SET_OFFSET(&bp.blk_dva[0], /* start is relative to the dest DVA */
+ DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start);
+ DVA_SET_ASIZE(&bp.blk_dva[0], size);
+
+ zio_free(spa, vcsa->vcsa_txg, &bp); /* release the range in vcsa_txg */
+}
+
+/*
+ * All reads and writes associated with a call to spa_vdev_copy_segment()
+ * are done. Free every range recorded in vcsa_obsolete_segs (each one is
+ * handed to unalloc_seg()), destroy that tree and the argument structure,
+ * and release the SCL_STATE config lock. The lock is released with the
+ * spa itself as the tag rather than FTAG.
+ */
+static void
+spa_vdev_copy_segment_done(zio_t *zio)
+{
+ vdev_copy_segment_arg_t *vcsa = zio->io_private;
+
+ range_tree_vacate(vcsa->vcsa_obsolete_segs,
+ unalloc_seg, vcsa);
+ range_tree_destroy(vcsa->vcsa_obsolete_segs);
+ kmem_free(vcsa, sizeof (*vcsa));
+
+ spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
+}
+
+/*
+ * The write of the new location is done. Free the data buffer and, under
+ * vca_lock, deduct this zio's size from vca_outstanding_bytes and signal
+ * vca_cv. A failed write is only recorded here: its size is added to
+ * vca_write_error_bytes.
+ */
static void
spa_vdev_copy_segment_write_done(zio_t *zio)
{
- vdev_copy_seg_arg_t *vcsa = zio->io_private;
- vdev_copy_arg_t *vca = vcsa->vcsa_copy_arg;
- spa_config_exit(zio->io_spa, SCL_STATE, FTAG);
+ vdev_copy_arg_t *vca = zio->io_private;
+
abd_free(zio->io_abd);
mutex_enter(&vca->vca_lock);
vca->vca_outstanding_bytes -= zio->io_size;
+
+ if (zio->io_error != 0)
+ vca->vca_write_error_bytes += zio->io_size;
+
cv_signal(&vca->vca_cv);
mutex_exit(&vca->vca_lock);
-
- ASSERT0(zio->io_error);
- kmem_free(vcsa->vcsa_dest_bp, sizeof (blkptr_t));
- kmem_free(vcsa, sizeof (vdev_copy_seg_arg_t));
}
+/*
+ * The read of the old location is done. The parent zio is the write to
+ * the new location. Allow it to start. A failed read is recorded by
+ * adding its size to vca_read_error_bytes (under vca_lock); the write is
+ * started whether or not the read succeeded.
+ */
static void
spa_vdev_copy_segment_read_done(zio_t *zio)
{
- vdev_copy_seg_arg_t *vcsa = zio->io_private;
- dva_t *dest_dva = vcsa->vcsa_dest_dva;
- uint64_t txg = vcsa->vcsa_txg;
- spa_t *spa = zio->io_spa;
- ASSERTV(vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dest_dva)));
- blkptr_t *bp = NULL;
- dva_t *dva = NULL;
- uint64_t size = zio->io_size;
-
- ASSERT3P(dest_vd, !=, NULL);
- ASSERT0(zio->io_error);
-
- vcsa->vcsa_dest_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
- bp = vcsa->vcsa_dest_bp;
- dva = bp->blk_dva;
-
- BP_ZERO(bp);
-
- /* initialize with dest_dva */
- bcopy(dest_dva, dva, sizeof (dva_t));
- BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
-
- BP_SET_LSIZE(bp, size);
- BP_SET_PSIZE(bp, size);
- BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
- BP_SET_TYPE(bp, DMU_OT_NONE);
- BP_SET_LEVEL(bp, 0);
- BP_SET_DEDUP(bp, 0);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
- zio_nowait(zio_rewrite(spa->spa_txg_zio[txg & TXG_MASK], spa,
- txg, bp, zio->io_abd, size,
- spa_vdev_copy_segment_write_done, vcsa,
- ZIO_PRIORITY_REMOVAL, 0, NULL));
+ vdev_copy_arg_t *vca = zio->io_private;
+
+ if (zio->io_error != 0) {
+ mutex_enter(&vca->vca_lock);
+ vca->vca_read_error_bytes += zio->io_size;
+ mutex_exit(&vca->vca_lock);
+ }
+
+ zio_nowait(zio_unique_parent(zio));
+}
+
+/*
+ * If the old and new vdevs are mirrors, we will read both sides of the old
+ * mirror, and write each copy to the corresponding side of the new mirror.
+ * If the old and new vdevs have a different number of children, we will do
+ * this as best as possible. Since we aren't verifying checksums, this
+ * ensures that as long as there's a good copy of the data, we'll have a
+ * good copy after the removal, even if there's silent damage to one side
+ * of the mirror. If we're removing a mirror that has some silent damage,
+ * we'll have exactly the same damage in the new location (assuming that
+ * the new location is also a mirror).
+ *
+ * We accomplish this by creating a tree of zio_t's, with as many writes as
+ * there are "children" of the new vdev (a non-redundant vdev counts as one
+ * child, a 2-way mirror has 2 children, etc). Each write has an associated
+ * read from a child of the old vdev. Typically there will be the same
+ * number of children of the old and new vdevs. However, if there are more
+ * children of the new vdev, some child(ren) of the old vdev will be issued
+ * multiple reads. If there are more children of the old vdev, some copies
+ * will be dropped.
+ *
+ * For example, the tree of zio_t's for a 2-way mirror is:
+ *
+ *                            null
+ *                           /    \
+ *    write(new vdev, child 0)    write(new vdev, child 1)
+ *      |                           |
+ *    read(old vdev, child 0)     read(old vdev, child 1)
+ *
+ * Child zio's complete before their parents complete. However, zio's
+ * created with zio_vdev_child_io() may be issued before their children
+ * complete. In this case we need to make sure that the children (reads)
+ * complete before the parents (writes) are *issued*. We do this by not
+ * calling zio_nowait() on each write until its corresponding read has
+ * completed.
+ *
+ * The spa_config_lock must be held while zio's created by
+ * zio_vdev_child_io() are in progress, to ensure that the vdev tree does
+ * not change (e.g. due to a concurrent "zpool attach/detach"). The "null"
+ * zio is needed to release the spa_config_lock after all the reads and
+ * writes complete. (Note that we can't grab the config lock for each read,
+ * because it is not reentrant - we could deadlock with a thread waiting
+ * for a write lock.)
+ *
+ * Arguments: nzio is the "null" zio that becomes the parent of the write.
+ * size bytes are read from source_vd (or from one of its children, when it
+ * is a mirror) at source_offset and written to dest_child_vd at
+ * dest_offset. dest_id is the index of dest_child_vd within a mirror
+ * destination, or -1 if the destination is not a mirror. vca accumulates
+ * the outstanding and failed byte counts for the copy.
+ */
+static void
+spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
+ vdev_t *source_vd, uint64_t source_offset,
+ vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size)
+{
+ ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
+
+ /*
+ * If the destination child is unwritable then there is no point
+ * in issuing the source reads which cannot be written.
+ */
+ if (!vdev_writeable(dest_child_vd))
+ return;
+
+ mutex_enter(&vca->vca_lock);
+ vca->vca_outstanding_bytes += size;
+ mutex_exit(&vca->vca_lock);
+
+ abd_t *abd = abd_alloc_for_io(size, B_FALSE);
+
+ vdev_t *source_child_vd = NULL;
+ if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
+ /*
+ * Source and dest are both mirrors. Copy from the same
+ * child id as we are copying to (wrapping around if there
+ * are more dest children than source children). If the
+ * preferred source child is unreadable select another.
+ */
+ for (int i = 0; i < source_vd->vdev_children; i++) {
+ source_child_vd = source_vd->vdev_child[
+ (dest_id + i) % source_vd->vdev_children];
+ if (vdev_readable(source_child_vd))
+ break;
+ }
+ } else {
+ source_child_vd = source_vd;
+ }
+
+ /*
+ * There should always be at least one readable source child or
+ * the pool would be in a suspended state. If an unreadable child
+ * were somehow selected, the result would be IO errors, the removal
+ * process being cancelled, and the pool reverting to its pre-removal
+ * state.
+ */
+ ASSERT3P(source_child_vd, !=, NULL);
+
+ zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
+ dest_child_vd, dest_offset, abd, size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ spa_vdev_copy_segment_write_done, vca);
+
+ zio_nowait(zio_vdev_child_io(write_zio, NULL,
+ source_child_vd, source_offset, abd, size,
+ ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ spa_vdev_copy_segment_read_done, vca));
}
+/*
+ * Allocate a new location for this segment, and create the zio_t's to
+ * read from the old location and write to the new location.
+ */
static int
-spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
+spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
+ uint64_t maxalloc, uint64_t txg,
vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
{
metaslab_group_t *mg = vd->vdev_mg;
spa_t *spa = vd->vdev_spa;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
vdev_indirect_mapping_entry_t *entry;
- vdev_copy_seg_arg_t *private;
dva_t dst = {{ 0 }};
- blkptr_t blk, *bp = &blk;
- dva_t *dva = bp->blk_dva;
+ uint64_t start = range_tree_min(segs);
- ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
- int error = metaslab_alloc_dva(spa, mg->mg_class, size,
- &dst, 0, NULL, txg, 0, zal);
+ uint64_t size = range_tree_span(segs);
+ if (range_tree_span(segs) > maxalloc) {
+ /*
+ * We can't allocate all the segments. Prefer to end
+ * the allocation at the end of a segment, thus avoiding
+ * additional split blocks.
+ */
+ range_seg_t search;
+ avl_index_t where;
+ search.rs_start = start + maxalloc;
+ search.rs_end = search.rs_start;
+ range_seg_t *rs = avl_find(&segs->rt_root, &search, &where);
+ if (rs == NULL) {
+ rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE);
+ } else {
+ rs = AVL_PREV(&segs->rt_root, rs);
+ }
+ if (rs != NULL) {
+ size = rs->rs_end - start;
+ } else {
+ /*
+ * There are no segments that end before maxalloc.
+ * I.e. the first segment is larger than maxalloc,
+ * so we must split it.
+ */
+ size = maxalloc;
+ }
+ }
+ ASSERT3U(size, <=, maxalloc);
+
+ /*
+ * An allocation class might not have any remaining vdevs or space
+ */
+ metaslab_class_t *mc = mg->mg_class;
+ if (mc != spa_normal_class(spa) && mc->mc_groups <= 1)
+ mc = spa_normal_class(spa);
+ int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0,
+ zal, 0);
+ if (error == ENOSPC && mc != spa_normal_class(spa)) {
+ error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
+ &dst, 0, NULL, txg, 0, zal, 0);
+ }
if (error != 0)
return (error);
+ /*
+ * Determine the ranges that are not actually needed. Offsets are
+ * relative to the start of the range to be copied (i.e. relative to the
+ * local variable "start").
+ */
+ range_tree_t *obsolete_segs = range_tree_create(NULL, NULL);
+
+ range_seg_t *rs = avl_first(&segs->rt_root);
+ ASSERT3U(rs->rs_start, ==, start);
+ uint64_t prev_seg_end = rs->rs_end;
+ while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) {
+ if (rs->rs_start >= start + size) {
+ break;
+ } else {
+ range_tree_add(obsolete_segs,
+ prev_seg_end - start,
+ rs->rs_start - prev_seg_end);
+ }
+ prev_seg_end = rs->rs_end;
+ }
+ /* We don't end in the middle of an obsolete range */
+ ASSERT3U(start + size, <=, prev_seg_end);
+
+ range_tree_clear(segs, start, size);
+
/*
* We can't have any padding of the allocated size, otherwise we will
* misunderstand what's allocated, and the size of the mapping.
*/
ASSERT3U(DVA_GET_ASIZE(&dst), ==, size);
- mutex_enter(&vca->vca_lock);
- vca->vca_outstanding_bytes += size;
- mutex_exit(&vca->vca_lock);
-
entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
entry->vime_mapping.vimep_dst = dst;
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ entry->vime_obsolete_count = range_tree_space(obsolete_segs);
+ }
- private = kmem_alloc(sizeof (vdev_copy_seg_arg_t), KM_SLEEP);
- private->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
- private->vcsa_txg = txg;
- private->vcsa_copy_arg = vca;
-
- /*
- * This lock is eventually released by the donefunc for the
- * zio_write_phys that finishes copying the data.
- */
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP);
+ vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
+ vcsa->vcsa_obsolete_segs = obsolete_segs;
+ vcsa->vcsa_spa = spa;
+ vcsa->vcsa_txg = txg;
/*
- * Do logical I/O, letting the redundancy vdevs (like mirror)
- * handle their own I/O instead of duplicating that code here.
+ * See comment before spa_vdev_copy_one_child().
*/
- BP_ZERO(bp);
-
- DVA_SET_VDEV(&dva[0], vd->vdev_id);
- DVA_SET_OFFSET(&dva[0], start);
- DVA_SET_GANG(&dva[0], 0);
- DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, size));
-
- BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
-
- BP_SET_LSIZE(bp, size);
- BP_SET_PSIZE(bp, size);
- BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
- BP_SET_TYPE(bp, DMU_OT_NONE);
- BP_SET_LEVEL(bp, 0);
- BP_SET_DEDUP(bp, 0);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
- zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa,
- bp, abd_alloc_for_io(size, B_FALSE), size,
- spa_vdev_copy_segment_read_done, private,
- ZIO_PRIORITY_REMOVAL, 0, NULL));
+ spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+ zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
+ spa_vdev_copy_segment_done, vcsa, 0);
+ vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
+ if (dest_vd->vdev_ops == &vdev_mirror_ops) {
+ for (int i = 0; i < dest_vd->vdev_children; i++) {
+ vdev_t *child = dest_vd->vdev_child[i];
+ spa_vdev_copy_one_child(vca, nzio, vd, start,
+ child, DVA_GET_OFFSET(&dst), i, size);
+ }
+ } else {
+ spa_vdev_copy_one_child(vca, nzio, vd, start,
+ dest_vd, DVA_GET_OFFSET(&dst), -1, size);
+ }
+ zio_nowait(nzio);
list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry);
ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
{
spa_vdev_removal_t *svr = arg;
- vdev_t *vd = svr->svr_vdev;
- spa_t *spa = vd->vdev_spa;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
"%s vdev %llu", spa_name(spa), vd->vdev_id);
}
-static void
-vdev_indirect_state_transfer(vdev_t *ivd, vdev_t *vd)
-{
- ivd->vdev_indirect_config = vd->vdev_indirect_config;
-
- ASSERT3P(ivd->vdev_indirect_mapping, ==, NULL);
- ASSERT(vd->vdev_indirect_mapping != NULL);
- ivd->vdev_indirect_mapping = vd->vdev_indirect_mapping;
- vd->vdev_indirect_mapping = NULL;
-
- ASSERT3P(ivd->vdev_indirect_births, ==, NULL);
- ASSERT(vd->vdev_indirect_births != NULL);
- ivd->vdev_indirect_births = vd->vdev_indirect_births;
- vd->vdev_indirect_births = NULL;
-
- ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
- ASSERT0(range_tree_space(ivd->vdev_obsolete_segments));
-
- if (vd->vdev_obsolete_sm != NULL) {
- ASSERT3U(ivd->vdev_asize, ==, vd->vdev_asize);
-
- /*
- * We cannot use space_map_{open,close} because we hold all
- * the config locks as writer.
- */
- ASSERT3P(ivd->vdev_obsolete_sm, ==, NULL);
- ivd->vdev_obsolete_sm = vd->vdev_obsolete_sm;
- vd->vdev_obsolete_sm = NULL;
- }
-}
-
static void
vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
{
vdev_remove_enlist_zaps(vd, svr->svr_zaplist);
ivd = vdev_add_parent(vd, &vdev_indirect_ops);
+ ivd->vdev_removing = 0;
vd->vdev_leaf_zap = 0;
vdev_remove_child(ivd, vd);
vdev_compact_children(ivd);
- vdev_indirect_state_transfer(ivd, vd);
-
- svr->svr_vdev = ivd;
-
- ASSERT(!ivd->vdev_removing);
ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
- dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr,
- 0, ZFS_SPACE_CHECK_NONE, tx);
- dmu_tx_commit(tx);
-
- /*
- * Indicate that this thread has exited.
- * After this, we can not use svr.
- */
mutex_enter(&svr->svr_lock);
svr->svr_thread = NULL;
cv_broadcast(&svr->svr_cv);
mutex_exit(&svr->svr_lock);
+
+ /* After this, we can not use svr. */
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr,
+ 0, ZFS_SPACE_CHECK_NONE, tx);
+ dmu_tx_commit(tx);
}
/*
* context by the removal thread after we have copied all vdev's data.
*/
static void
-vdev_remove_complete(vdev_t *vd)
+vdev_remove_complete(spa_t *spa)
{
- spa_t *spa = vd->vdev_spa;
uint64_t txg;
/*
* vdev_metaslab_fini()
*/
txg_wait_synced(spa->spa_dsl_pool, 0);
-
txg = spa_vdev_enter(spa);
+ vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
+
+ sysevent_t *ev = spa_event_create(spa, vd, NULL,
+ ESC_ZFS_VDEV_REMOVE_DEV);
+
zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
vd->vdev_id, txg);
/*
* We now release the locks, allowing spa_sync to run and finish the
* removal via vdev_remove_complete_sync in syncing context.
+ *
+ * Note that we hold on to the vdev_t that has been replaced. Since
+ * it isn't part of the vdev tree any longer, it can't be concurrently
+ * manipulated, even while we don't have the config lock.
*/
(void) spa_vdev_exit(spa, NULL, txg, 0);
*/
vdev_config_dirty(spa->spa_root_vdev);
(void) spa_vdev_exit(spa, vd, txg, 0);
+
+ if (ev != NULL)
+ spa_event_post(ev);
}
/*
* this size again this txg.
*/
static void
-spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
+spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
uint64_t *max_alloc, dmu_tx_t *tx)
{
uint64_t txg = dmu_tx_get_txg(tx);
mutex_enter(&svr->svr_lock);
- range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
- if (rs == NULL) {
+ /*
+ * Determine how big of a chunk to copy. We can allocate up
+ * to max_alloc bytes, and we can span up to vdev_removal_max_span
+ * bytes of unallocated space at a time. "segs" will track the
+ * allocated segments that we are copying. We may also be copying
+ * free segments (of up to vdev_removal_max_span bytes).
+ */
+ range_tree_t *segs = range_tree_create(NULL, NULL);
+ for (;;) {
+ range_seg_t *rs = range_tree_first(svr->svr_allocd_segs);
+
+ if (rs == NULL)
+ break;
+
+ uint64_t seg_length;
+
+ if (range_tree_is_empty(segs)) {
+ /* need to truncate the first seg based on max_alloc */
+ seg_length =
+ MIN(rs->rs_end - rs->rs_start, *max_alloc);
+ } else {
+ if (rs->rs_start - range_tree_max(segs) >
+ vdev_removal_max_span) {
+ /*
+ * Including this segment would cause us to
+ * copy a larger unneeded chunk than is allowed.
+ */
+ break;
+ } else if (rs->rs_end - range_tree_min(segs) >
+ *max_alloc) {
+ /*
+ * This additional segment would extend past
+ * max_alloc. Rather than splitting this
+ * segment, leave it for the next mapping.
+ */
+ break;
+ } else {
+ seg_length = rs->rs_end - rs->rs_start;
+ }
+ }
+
+ range_tree_add(segs, rs->rs_start, seg_length);
+ range_tree_remove(svr->svr_allocd_segs,
+ rs->rs_start, seg_length);
+ }
+
+ if (range_tree_is_empty(segs)) {
mutex_exit(&svr->svr_lock);
+ range_tree_destroy(segs);
return;
}
- uint64_t offset = rs->rs_start;
- uint64_t length = MIN(rs->rs_end - rs->rs_start, *max_alloc);
-
- range_tree_remove(svr->svr_allocd_segs, offset, length);
if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
svr, 0, ZFS_SPACE_CHECK_NONE, tx);
}
- svr->svr_max_offset_to_sync[txg & TXG_MASK] = offset + length;
+ svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs);
/*
* Note: this is the amount of *allocated* space
* that we are taking care of each txg.
*/
- svr->svr_bytes_done[txg & TXG_MASK] += length;
+ svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs);
mutex_exit(&svr->svr_lock);
zio_alloc_list_t zal;
metaslab_trace_init(&zal);
- uint64_t thismax = *max_alloc;
- while (length > 0) {
- uint64_t mylen = MIN(length, thismax);
-
- int error = spa_vdev_copy_segment(svr->svr_vdev,
- offset, mylen, txg, vca, &zal);
+ uint64_t thismax = SPA_MAXBLOCKSIZE;
+ while (!range_tree_is_empty(segs)) {
+ int error = spa_vdev_copy_segment(vd,
+ segs, thismax, txg, vca, &zal);
if (error == ENOSPC) {
/*
*/
ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
- thismax = P2ROUNDUP(mylen / 2,
+ uint64_t attempted =
+ MIN(range_tree_span(segs), thismax);
+ thismax = P2ROUNDUP(attempted / 2,
1 << spa->spa_max_ashift);
- ASSERT3U(thismax, <, mylen);
/*
* The minimum-size allocation can not fail.
*/
- ASSERT3U(mylen, >, 1 << spa->spa_max_ashift);
- *max_alloc = mylen - (1 << spa->spa_max_ashift);
+ ASSERT3U(attempted, >, 1 << spa->spa_max_ashift);
+ *max_alloc = attempted - (1 << spa->spa_max_ashift);
} else {
ASSERT0(error);
- length -= mylen;
- offset += mylen;
/*
* We've performed an allocation, so reset the
}
}
metaslab_trace_fini(&zal);
+ range_tree_destroy(segs);
}
/*
static void
spa_vdev_remove_thread(void *arg)
{
- vdev_t *vd = arg;
- spa_t *spa = vd->vdev_spa;
+ spa_t *spa = arg;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
vdev_copy_arg_t vca;
uint64_t max_alloc = zfs_remove_max_segment;
uint64_t last_txg = 0;
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);
ASSERT(vdev_is_concrete(vd));
ASSERT(vd->vdev_removing);
ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
- ASSERT3P(svr->svr_vdev, ==, vd);
ASSERT(vim != NULL);
mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
vca.vca_outstanding_bytes = 0;
+ vca.vca_read_error_bytes = 0;
+ vca.vca_write_error_bytes = 0;
mutex_enter(&svr->svr_lock);
* Assert nothing in flight -- ms_*tree is empty.
*/
for (int i = 0; i < TXG_SIZE; i++) {
- ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+ ASSERT0(range_tree_space(msp->ms_allocating[i]));
}
/*
SM_ALLOC));
space_map_close(sm);
- range_tree_walk(msp->ms_freeingtree,
+ range_tree_walk(msp->ms_freeing,
range_tree_remove, svr->svr_allocd_segs);
/*
msp->ms_id);
while (!svr->svr_thread_exit &&
- range_tree_space(svr->svr_allocd_segs) != 0) {
+ !range_tree_is_empty(svr->svr_allocd_segs)) {
mutex_exit(&svr->svr_lock);
+ /*
+ * We need to periodically drop the config lock so that
+ * writers can get in. Additionally, we can't wait
+ * for a txg to sync while holding a config lock
+ * (since a waiting writer could cause a 3-way deadlock
+ * with the sync thread, which also gets a config
+ * lock for reader). So we can't hold the config lock
+ * while calling dmu_tx_assign().
+ */
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
+ * This delay will pause the removal around the point
+ * specified by zfs_removal_suspend_progress. This is used
+ * solely by the test suite or during debugging.
+ */
+ uint64_t bytes_copied =
+ spa->spa_removing_phys.sr_copied;
+ for (int i = 0; i < TXG_SIZE; i++)
+ bytes_copied += svr->svr_bytes_done[i];
+ while (zfs_removal_suspend_progress &&
+ !svr->svr_thread_exit)
+ delay(hz);
+
mutex_enter(&vca.vca_lock);
while (vca.vca_outstanding_bytes >
zfs_remove_max_copy_bytes) {
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
uint64_t txg = dmu_tx_get_txg(tx);
+ /*
+ * Reacquire the config lock. The vdev_t
+ * that we're removing may have changed, e.g. due
+ * to a vdev_attach or vdev_detach.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+
if (txg != last_txg)
max_alloc = zfs_remove_max_segment;
last_txg = txg;
- spa_vdev_copy_impl(svr, &vca, &max_alloc, tx);
+ spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);
dmu_tx_commit(tx);
mutex_enter(&svr->svr_lock);
}
+
+ mutex_enter(&vca.vca_lock);
+ if (zfs_removal_ignore_errors == 0 &&
+ (vca.vca_read_error_bytes > 0 ||
+ vca.vca_write_error_bytes > 0)) {
+ svr->svr_thread_exit = B_TRUE;
+ }
+ mutex_exit(&vca.vca_lock);
}
mutex_exit(&svr->svr_lock);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
/*
* Wait for all copies to finish before cleaning up the vca.
*/
svr->svr_thread = NULL;
cv_broadcast(&svr->svr_cv);
mutex_exit(&svr->svr_lock);
+
+ /*
+ * During the removal process an unrecoverable read or write
+ * error was encountered. The removal process must be
+ * cancelled or this damage may become permanent.
+ */
+ if (zfs_removal_ignore_errors == 0 &&
+ (vca.vca_read_error_bytes > 0 ||
+ vca.vca_write_error_bytes > 0)) {
+ zfs_dbgmsg("canceling removal due to IO errors: "
+ "[read_error_bytes=%llu] [write_error_bytes=%llu]",
+ vca.vca_read_error_bytes,
+ vca.vca_write_error_bytes);
+ spa_vdev_remove_cancel_impl(spa);
+ }
} else {
ASSERT0(range_tree_space(svr->svr_allocd_segs));
- vdev_remove_complete(vd);
+ vdev_remove_complete(spa);
}
}
{
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
- vdev_t *vd = svr->svr_vdev;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
objset_t *mos = spa->spa_meta_objset;
ASSERT3P(svr->svr_thread, ==, NULL);
spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
- if (vdev_obsolete_counts_are_precise(vd)) {
+
+ boolean_t are_precise;
+ VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+ if (are_precise) {
spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx));
}
- if (vdev_obsolete_sm_object(vd) != 0) {
+ uint64_t obsolete_sm_object;
+ VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+ if (obsolete_sm_object != 0) {
ASSERT(vd->vdev_obsolete_sm != NULL);
- ASSERT3U(vdev_obsolete_sm_object(vd), ==,
+ ASSERT3U(obsolete_sm_object, ==,
space_map_object(vd->vdev_obsolete_sm));
space_map_free(vd->vdev_obsolete_sm, tx);
* Assert nothing in flight -- ms_*tree is empty.
*/
for (int i = 0; i < TXG_SIZE; i++)
- ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+ ASSERT0(range_tree_space(msp->ms_allocating[i]));
for (int i = 0; i < TXG_DEFER_SIZE; i++)
- ASSERT0(range_tree_space(msp->ms_defertree[i]));
- ASSERT0(range_tree_space(msp->ms_freedtree));
+ ASSERT0(range_tree_space(msp->ms_defer[i]));
+ ASSERT0(range_tree_space(msp->ms_freed));
if (msp->ms_sm != NULL) {
/*
mutex_enter(&svr->svr_lock);
VERIFY0(space_map_load(msp->ms_sm,
svr->svr_allocd_segs, SM_ALLOC));
- range_tree_walk(msp->ms_freeingtree,
+ range_tree_walk(msp->ms_freeing,
range_tree_remove, svr->svr_allocd_segs);
/*
* because we have not allocated mappings for it yet.
*/
uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
- range_tree_clear(svr->svr_allocd_segs, syncd,
- msp->ms_sm->sm_start + msp->ms_sm->sm_size - syncd);
+ uint64_t sm_end = msp->ms_sm->sm_start +
+ msp->ms_sm->sm_size;
+ if (sm_end > syncd)
+ range_tree_clear(svr->svr_allocd_segs,
+ syncd, sm_end - syncd);
mutex_exit(&svr->svr_lock);
}
vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-");
}
-int
-spa_vdev_remove_cancel(spa_t *spa)
+static int
+spa_vdev_remove_cancel_impl(spa_t *spa)
{
- spa_vdev_remove_suspend(spa);
-
- if (spa->spa_vdev_removal == NULL)
- return (ENOTACTIVE);
-
- uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id;
+ uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
- spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE);
+ spa_vdev_remove_cancel_sync, NULL, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED);
if (error == 0) {
spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
return (error);
}
+/*
+ * Cancel a device removal on the given pool.
+ *
+ * spa_vdev_remove_suspend() is always called first. If the pool then has
+ * no spa_vdev_removal state, ENOTACTIVE is returned; otherwise the result
+ * of spa_vdev_remove_cancel_impl() is passed back to the caller.
+ */
+int
+spa_vdev_remove_cancel(spa_t *spa)
+{
+	int error = ENOTACTIVE;
+
+	spa_vdev_remove_suspend(spa);
+
+	if (spa->spa_vdev_removal != NULL)
+		error = spa_vdev_remove_cancel_impl(spa);
+
+	return (error);
+}
+
/*
* Called every sync pass of every txg if there's a svr.
*/
vdev_dirty_leaves(vd, VDD_DTL, *txg);
vdev_config_dirty(vd);
- spa_history_log_internal(spa, "vdev remove", NULL,
- "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
- (vd->vdev_path != NULL) ? vd->vdev_path : "-");
-
spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
*txg = spa_vdev_config_enter(spa);
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
return (SET_ERROR(ENOTSUP));
+ /* available space in the pool's normal class */
+ uint64_t available = dsl_dir_space_available(
+ spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
+
+ metaslab_class_t *mc = vd->vdev_mg->mg_class;
+
+ /*
+ * When removing a vdev from an allocation class that has
+ * remaining vdevs, include available space from the class.
+ */
+ if (mc != spa_normal_class(spa) && mc->mc_groups > 1) {
+ uint64_t class_avail = metaslab_class_get_space(mc) -
+ metaslab_class_get_alloc(mc);
+
+ /* add class space, adjusted for overhead */
+ available += (class_avail * 94) / 100;
+ }
+
/*
* There has to be enough free space to remove the
* device and leave double the "slop" space (i.e. we
* must leave at least 3% of the pool free, in addition to
* the normal slop space).
*/
- if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir,
- NULL, 0, B_TRUE) <
- vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
+ if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
return (SET_ERROR(ENOSPC));
}
dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
dsl_sync_task_nowait(spa->spa_dsl_pool,
vdev_remove_initiate_sync,
- vd, 0, ZFS_SPACE_CHECK_NONE, tx);
+ (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx);
dmu_tx_commit(tx);
return (0);
int error = 0;
boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
sysevent_t *ev = NULL;
+ char *vd_type = NULL, *vd_path = NULL;
ASSERT(spa_writeable(spa));
if (!locked)
txg = spa_vdev_enter(spa);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+
+ if (!locked)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ return (error);
+ }
+
vd = spa_lookup_by_guid(spa, guid, B_FALSE);
if (spa->spa_spares.sav_vdevs != NULL &&
ev = spa_event_create(spa, vd, NULL,
ESC_ZFS_VDEV_REMOVE_AUX);
- char *nvstr = fnvlist_lookup_string(nv,
- ZPOOL_CONFIG_PATH);
- spa_history_log_internal(spa, "vdev remove", NULL,
- "%s vdev (%s) %s", spa_name(spa),
- VDEV_TYPE_SPARE, nvstr);
+ vd_type = VDEV_TYPE_SPARE;
+ vd_path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH);
spa_vdev_remove_aux(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, spares, nspares, nv);
spa_load_spares(spa);
nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
(nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
- char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH);
- spa_history_log_internal(spa, "vdev remove", NULL,
- "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_L2CACHE, nvstr);
+ vd_type = VDEV_TYPE_L2CACHE;
+ vd_path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH);
/*
* Cache devices can always be removed.
*/
spa->spa_l2cache.sav_sync = B_TRUE;
} else if (vd != NULL && vd->vdev_islog) {
ASSERT(!locked);
+ vd_type = "log";
+ vd_path = (vd->vdev_path != NULL) ? vd->vdev_path : "-";
error = spa_vdev_remove_log(vd, &txg);
} else if (vd != NULL) {
ASSERT(!locked);
if (!locked)
error = spa_vdev_exit(spa, NULL, txg, error);
+ /*
+ * Logging must be done outside the spa config lock. Otherwise,
+ * this code path could end up holding the spa config lock while
+ * waiting for a txg_sync so it can write to the internal log.
+ * Doing that would prevent the txg sync from actually happening,
+ * causing a deadlock.
+ */
+ if (error == 0 && vd_type != NULL && vd_path != NULL) {
+ spa_history_log_internal(spa, "vdev remove", NULL,
+ "%s vdev (%s) %s", spa_name(spa), vd_type, vd_path);
+ }
+
if (ev != NULL)
spa_event_post(ev);
prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy;
prs->prs_copied = spa->spa_removing_phys.sr_copied;
- if (spa->spa_vdev_removal != NULL) {
- for (int i = 0; i < TXG_SIZE; i++) {
- prs->prs_copied +=
- spa->spa_vdev_removal->svr_bytes_done[i];
- }
- }
-
prs->prs_mapping_memory = 0;
uint64_t indirect_vdev_id =
spa->spa_removing_phys.sr_prev_indirect_vdev;
return (0);
}
-#if defined(_KERNEL) && defined(HAVE_SPL)
+#if defined(_KERNEL)
+module_param(zfs_removal_ignore_errors, int, 0644);
+MODULE_PARM_DESC(zfs_removal_ignore_errors,
+ "Ignore hard IO errors when removing device");
+
module_param(zfs_remove_max_segment, int, 0644);
MODULE_PARM_DESC(zfs_remove_max_segment,
"Largest contiguous segment to allocate when removing device");
+module_param(vdev_removal_max_span, int, 0644);
+MODULE_PARM_DESC(vdev_removal_max_span,
+ "Largest span of free chunks a remap segment can span");
+
+/* BEGIN CSTYLED */
+module_param(zfs_removal_suspend_progress, int, 0644);
+MODULE_PARM_DESC(zfs_removal_suspend_progress,
+ "Pause device removal after this many bytes are copied "
+ "(debug use only - causes removal to hang)");
+/* END CSTYLED */
+
EXPORT_SYMBOL(free_from_removing_vdev);
EXPORT_SYMBOL(spa_removal_get_stats);
EXPORT_SYMBOL(spa_remove_init);