#include <sys/vdev_indirect_births.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/abd.h>
+#include <sys/vdev_initialize.h>
#include <sys/trace_vdev.h>
typedef struct vdev_copy_arg {
metaslab_t *vca_msp;
uint64_t vca_outstanding_bytes;
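+ /* Bytes of failed reads and writes, updated under vca_lock. */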
+ uint64_t vca_read_error_bytes;
+ uint64_t vca_write_error_bytes;
kcondvar_t vca_cv;
kmutex_t vca_lock;
} vdev_copy_arg_t;
int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
+/*
+ * Ignore hard IO errors during device removal. When set, if a device
+ * encounters a hard IO error during the removal process, the removal will
+ * not be cancelled. This can result in a normally recoverable block
+ * becoming permanently damaged and is not recommended.
+ */
+int zfs_removal_ignore_errors = 0;
+
/*
* Allow a remap segment to span free chunks of at most this size. The main
* impact of a larger span is that we will read and write larger, more
* contiguous chunks, with more "unnecessary" data, trading off bandwidth
* for iops.
*/

/*
* This is used by the test suite so that it can ensure that certain
* actions happen while in the middle of a removal.
*/
-unsigned long zfs_remove_max_bytes_pause = -1UL;
+int zfs_removal_suspend_progress = 0;
#define VDEV_REMOVAL_ZAP_OBJS "lzap"
static void spa_vdev_remove_thread(void *arg);
+static int spa_vdev_remove_cancel_impl(spa_t *spa);
static void
spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
&one, tx));
- ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0);
+ ASSERTV(boolean_t are_precise);
+ ASSERT0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+ ASSERT3B(are_precise, ==, B_TRUE);
}
vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
- if (srp->sr_prev_indirect_vdev != UINT64_MAX) {
+ if (srp->sr_prev_indirect_vdev != -1) {
vdev_t *pvd;
pvd = vdev_lookup_top(spa,
srp->sr_prev_indirect_vdev);
mutex_enter(&vca->vca_lock);
vca->vca_outstanding_bytes -= zio->io_size;
+
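+ /* Tally bytes that failed to write; checked later by the removal thread. */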
+ if (zio->io_error != 0)
+ vca->vca_write_error_bytes += zio->io_size;
+
cv_signal(&vca->vca_cv);
mutex_exit(&vca->vca_lock);
}
static void
spa_vdev_copy_segment_read_done(zio_t *zio)
{
+ vdev_copy_arg_t *vca = zio->io_private;
+
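+ /* Tally bytes that could not be read; checked later by the removal thread. */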
+ if (zio->io_error != 0) {
+ mutex_enter(&vca->vca_lock);
+ vca->vca_read_error_bytes += zio->io_size;
+ mutex_exit(&vca->vca_lock);
+ }
+
zio_nowait(zio_unique_parent(zio));
}
{
ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
+ /*
+ * If the destination child is unwritable then there is no point
+ * in issuing the source reads, since they cannot be written.
+ */
+ if (!vdev_writeable(dest_child_vd))
+ return;
+
mutex_enter(&vca->vca_lock);
vca->vca_outstanding_bytes += size;
mutex_exit(&vca->vca_lock);
abd_t *abd = abd_alloc_for_io(size, B_FALSE);
- vdev_t *source_child_vd;
+ vdev_t *source_child_vd = NULL;
if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
/*
* Source and dest are both mirrors. Copy from the same
* child id as we are copying to (wrapping around if there
- * are more dest children than source children).
+ * are more dest children than source children). If the
+ * preferred source child is unreadable select another.
*/
- source_child_vd =
- source_vd->vdev_child[dest_id % source_vd->vdev_children];
+ for (int i = 0; i < source_vd->vdev_children; i++) {
+ source_child_vd = source_vd->vdev_child[
+ (dest_id + i) % source_vd->vdev_children];
+ if (vdev_readable(source_child_vd))
+ break;
+ }
} else {
source_child_vd = source_vd;
}
+ /*
+ * There should always be at least one readable source child or
+ * the pool would be in a suspended state. If an unreadable child
+ * were somehow selected, the resulting IO errors would cause the
+ * removal to be cancelled and the pool to revert to its
+ * pre-removal state.
+ */
+ ASSERT3P(source_child_vd, !=, NULL);
+
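+ /*
+ * The source read is issued as a child of this write zio;
+ * spa_vdev_copy_segment_read_done() starts the write (its unique
+ * parent) once the read completes.
+ */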
zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
dest_child_vd, dest_offset, abd, size,
ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
- dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr,
- 0, ZFS_SPACE_CHECK_NONE, tx);
- dmu_tx_commit(tx);
-
- /*
- * Indicate that this thread has exited.
- * After this, we can not use svr.
- */
mutex_enter(&svr->svr_lock);
svr->svr_thread = NULL;
cv_broadcast(&svr->svr_cv);
mutex_exit(&svr->svr_lock);
+
+ /* After this, we cannot use svr. */
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr,
+ 0, ZFS_SPACE_CHECK_NONE, tx);
+ dmu_tx_commit(tx);
}
/*
txg_wait_synced(spa->spa_dsl_pool, 0);
txg = spa_vdev_enter(spa);
vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
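+ /* Initializing was stopped before the removal thread started. */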
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
sysevent_t *ev = spa_event_create(spa, vd, NULL,
ESC_ZFS_VDEV_REMOVE_DEV);
mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
vca.vca_outstanding_bytes = 0;
+ vca.vca_read_error_bytes = 0;
+ vca.vca_write_error_bytes = 0;
mutex_enter(&svr->svr_lock);
/*
- * This delay will pause the removal around the point
- * specified by zfs_remove_max_bytes_pause. We do this
+ * This delay will pause the removal while
+ * zfs_removal_suspend_progress is set. We do this
* solely from the test suite or during debugging.
*/
uint64_t bytes_copied =
spa->spa_removing_phys.sr_copied;
for (int i = 0; i < TXG_SIZE; i++)
bytes_copied += svr->svr_bytes_done[i];
- while (zfs_remove_max_bytes_pause <= bytes_copied &&
+ while (zfs_removal_suspend_progress &&
!svr->svr_thread_exit)
delay(hz);
dmu_tx_commit(tx);
mutex_enter(&svr->svr_lock);
}
+
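+ /* Stop copying early if IO errors were seen and are not being ignored. */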
+ mutex_enter(&vca.vca_lock);
+ if (zfs_removal_ignore_errors == 0 &&
+ (vca.vca_read_error_bytes > 0 ||
+ vca.vca_write_error_bytes > 0)) {
+ svr->svr_thread_exit = B_TRUE;
+ }
+ mutex_exit(&vca.vca_lock);
}
mutex_exit(&svr->svr_lock);
svr->svr_thread = NULL;
cv_broadcast(&svr->svr_cv);
mutex_exit(&svr->svr_lock);
+
+ /*
+ * If an unrecoverable read or write error was encountered during
+ * the removal process, the removal must be cancelled or the
+ * damage may become permanent.
+ */
+ if (zfs_removal_ignore_errors == 0 &&
+ (vca.vca_read_error_bytes > 0 ||
+ vca.vca_write_error_bytes > 0)) {
+ zfs_dbgmsg("canceling removal due to IO errors: "
+ "[read_error_bytes=%llu] [write_error_bytes=%llu]",
+ (u_longlong_t)vca.vca_read_error_bytes,
+ (u_longlong_t)vca.vca_write_error_bytes);
+ spa_vdev_remove_cancel_impl(spa);
+ }
} else {
ASSERT0(range_tree_space(svr->svr_allocd_segs));
vdev_remove_complete(spa);
ASSERT3P(svr->svr_thread, ==, NULL);
spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
- if (vdev_obsolete_counts_are_precise(vd)) {
+
+ boolean_t are_precise;
+ VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+ if (are_precise) {
spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx));
}
- if (vdev_obsolete_sm_object(vd) != 0) {
+ uint64_t obsolete_sm_object;
+ VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+ if (obsolete_sm_object != 0) {
ASSERT(vd->vdev_obsolete_sm != NULL);
- ASSERT3U(vdev_obsolete_sm_object(vd), ==,
+ ASSERT3U(obsolete_sm_object, ==,
space_map_object(vd->vdev_obsolete_sm));
space_map_free(vd->vdev_obsolete_sm, tx);
vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-");
}
-int
-spa_vdev_remove_cancel(spa_t *spa)
+static int
+spa_vdev_remove_cancel_impl(spa_t *spa)
{
- spa_vdev_remove_suspend(spa);
-
- if (spa->spa_vdev_removal == NULL)
- return (ENOTACTIVE);
-
uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
return (error);
}
+int
+spa_vdev_remove_cancel(spa_t *spa)
+{
+ spa_vdev_remove_suspend(spa);
+
+ if (spa->spa_vdev_removal == NULL)
+ return (ENOTACTIVE);
+
+ return (spa_vdev_remove_cancel_impl(spa));
+}
+
/*
* Called every sync pass of every txg if there's a svr.
*/
vdev_dirty_leaves(vd, VDD_DTL, *txg);
vdev_config_dirty(vd);
- spa_history_log_internal(spa, "vdev remove", NULL,
- "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
- (vd->vdev_path != NULL) ? vd->vdev_path : "-");
-
spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
+ /* Stop initializing; the log device is being removed, so cancel outright. */
+ (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
+
*txg = spa_vdev_config_enter(spa);
sysevent_t *ev = spa_event_create(spa, vd, NULL,
*/
error = spa_reset_logs(spa);
+ /*
+ * We stop any initializing that is currently in progress but leave
+ * the state as "active". This will allow the initializing to resume
+ * if the removal is canceled sometime later.
+ */
+ vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
+
*txg = spa_vdev_config_enter(spa);
/*
if (error != 0) {
metaslab_group_activate(mg);
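+ /* The removal failed to start; resume the initializing left active above. */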
+ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
return (error);
}
int error = 0;
boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
sysevent_t *ev = NULL;
+ char *vd_type = NULL, *vd_path = NULL;
ASSERT(spa_writeable(spa));
ev = spa_event_create(spa, vd, NULL,
ESC_ZFS_VDEV_REMOVE_AUX);
- char *nvstr = fnvlist_lookup_string(nv,
- ZPOOL_CONFIG_PATH);
- spa_history_log_internal(spa, "vdev remove", NULL,
- "%s vdev (%s) %s", spa_name(spa),
- VDEV_TYPE_SPARE, nvstr);
+ vd_type = VDEV_TYPE_SPARE;
+ vd_path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH);
spa_vdev_remove_aux(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, spares, nspares, nv);
spa_load_spares(spa);
nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
(nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
- char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH);
- spa_history_log_internal(spa, "vdev remove", NULL,
- "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_L2CACHE, nvstr);
+ vd_type = VDEV_TYPE_L2CACHE;
+ vd_path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH);
/*
* Cache devices can always be removed.
*/
spa->spa_l2cache.sav_sync = B_TRUE;
} else if (vd != NULL && vd->vdev_islog) {
ASSERT(!locked);
+ vd_type = "log";
+ vd_path = (vd->vdev_path != NULL) ? vd->vdev_path : "-";
error = spa_vdev_remove_log(vd, &txg);
} else if (vd != NULL) {
ASSERT(!locked);
if (!locked)
error = spa_vdev_exit(spa, NULL, txg, error);
+ /*
+ * Logging must be done outside the spa config lock. Otherwise,
+ * this code path could end up holding the spa config lock while
+ * waiting for a txg_sync so it can write to the internal log.
+ * Doing that would prevent the txg sync from actually happening,
+ * causing a deadlock.
+ */
+ if (error == 0 && vd_type != NULL && vd_path != NULL) {
+ spa_history_log_internal(spa, "vdev remove", NULL,
+ "%s vdev (%s) %s", spa_name(spa), vd_type, vd_path);
+ }
+
if (ev != NULL)
spa_event_post(ev);
prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy;
prs->prs_copied = spa->spa_removing_phys.sr_copied;
- if (spa->spa_vdev_removal != NULL) {
- for (int i = 0; i < TXG_SIZE; i++) {
- prs->prs_copied +=
- spa->spa_vdev_removal->svr_bytes_done[i];
- }
- }
-
prs->prs_mapping_memory = 0;
uint64_t indirect_vdev_id =
spa->spa_removing_phys.sr_prev_indirect_vdev;
}
#if defined(_KERNEL)
+module_param(zfs_removal_ignore_errors, int, 0644);
+MODULE_PARM_DESC(zfs_removal_ignore_errors,
+ "Ignore hard IO errors when removing device");
+
module_param(zfs_remove_max_segment, int, 0644);
MODULE_PARM_DESC(zfs_remove_max_segment,
"Largest contiguous segment to allocate when removing device");
"Largest span of free chunks a remap segment can span");
/* BEGIN CSTYLED */
-module_param(zfs_remove_max_bytes_pause, ulong, 0644);
-MODULE_PARM_DESC(zfs_remove_max_bytes_pause,
+module_param(zfs_removal_suspend_progress, int, 0644);
+MODULE_PARM_DESC(zfs_removal_suspend_progress,
"Pause device removal after this many bytes are copied "
"(debug use only - causes removal to hang)");
/* END CSTYLED */