#include <sys/zil.h>
#include <sys/dsl_scan.h>
#include <sys/abd.h>
+#include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
#include <sys/zvol.h>
#include <sys/zfs_ratelimit.h>
-/* target number of metaslabs per top-level vdev */
-int vdev_max_ms_count = 200;
+/* default target for number of metaslabs per top-level vdev */
+int zfs_vdev_default_ms_count = 200;
/* minimum number of metaslabs per top-level vdev */
-int vdev_min_ms_count = 16;
+int zfs_vdev_min_ms_count = 16;
/* practical upper limit of total metaslabs per top-level vdev */
-int vdev_ms_count_limit = 1ULL << 17;
+int zfs_vdev_ms_count_limit = 1ULL << 17;
/* lower limit for metaslab size (512M) */
-int vdev_default_ms_shift = 29;
+int zfs_vdev_default_ms_shift = 29;
-/* upper limit for metaslab size (256G) */
-int vdev_max_ms_shift = 38;
+/* upper limit for metaslab size (16G) */
+int zfs_vdev_max_ms_shift = 34;
int vdev_validate_skip = B_FALSE;
int vdev_dtl_sm_blksz = (1 << 12);
/*
- * Rate limit delay events to this many IO delays per second.
+ * Rate limit slow IO (delay) events to this many per second.
*/
-unsigned int zfs_delays_per_second = 20;
+unsigned int zfs_slow_io_events_per_second = 20;
/*
* Rate limit checksum events after this many checksum errors per second.
*/
-unsigned int zfs_checksums_per_second = 20;
+unsigned int zfs_checksum_events_per_second = 20;
/*
* Ignore errors during scrub/resilver. Allows working around resilver
* failures when there are pool errors.
*/
int vdev_standard_sm_blksz = (1 << 17);
+/*
+ * Tunable parameter for debugging or performance analysis. Setting this
+ * will cause pool corruption on power loss if a volatile out-of-order
+ * write cache is enabled.
+ */
+int zfs_nocacheflush = 0;
+
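+/*
+ * A minimal sketch of how this tunable is consumed (illustrative only;
+ * the actual check lives in the ZIO layer rather than in this file):
+ * callers simply skip issuing the disk write cache flush when it is set.
+ *
+ *	if (!zfs_nocacheflush)
+ *		zio_flush(zio, vd);
+ */
+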
/*PRINTFLIKE2*/
void
vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
return (ops);
}
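+
+/*
+ * Default range translation: the physical range is taken to be identical
+ * to the logical range, i.e. the identity translation used by vdev types
+ * that do not remap offsets. See vdev_xlate() below for how per-vdev
+ * translations compose up the tree.
+ */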
+/* ARGSUSED */
+void
+vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res)
+{
+ res->rs_start = in->rs_start;
+ res->rs_end = in->rs_end;
+}
+
/*
* Derive the enumerated allocation bias from string input.
* String origin is either the per-vdev zap or zpool(1M).
*/
for (; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum += cvd->vdev_guid_sum;
+
+ if (cvd->vdev_ops->vdev_op_leaf) {
+ list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
+ cvd->vdev_spa->spa_leaf_list_gen++;
+ }
}
void
pvd->vdev_children = 0;
}
+ if (cvd->vdev_ops->vdev_op_leaf) {
+ spa_t *spa = cvd->vdev_spa;
+ list_remove(&spa->spa_leaf_list, cvd);
+ spa->spa_leaf_list_gen++;
+ }
+
/*
* Walk up all ancestors to update guid sum.
*/
* and checksum events so that we don't overwhelm ZED with thousands
* of events when a disk is acting up.
*/
- zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_delays_per_second, 1);
- zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksums_per_second, 1);
+ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
+ 1);
+ zfs_ratelimit_init(&vd->vdev_checksum_rl,
+ &zfs_checksum_events_per_second, 1);
list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node);
+ list_link_init(&vd->vdev_initialize_node);
+ list_link_init(&vd->vdev_leaf_node);
+ list_link_init(&vd->vdev_trim_node);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
for (int t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
&vd->vdev_resilver_txg);
+ if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
+ vdev_set_deferred_resilver(spa, vd);
+
/*
* In general, when importing a pool we want to ignore the
* persistent fault state, as the diagnosis made on another
{
spa_t *spa = vd->vdev_spa;
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+ ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
+
/*
* Scan queues are normally destroyed at the end of a scan. If the
* queue exists here, that implies the vdev is being removed while
vdev_remove_child(vd->vdev_parent, vd);
ASSERT(vd->vdev_parent == NULL);
+ ASSERT(!list_link_active(&vd->vdev_leaf_node));
/*
* Clean up vdev structure.
rw_destroy(&vd->vdev_indirect_rwlock);
mutex_destroy(&vd->vdev_obsolete_lock);
- mutex_destroy(&vd->vdev_queue_lock);
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
mutex_destroy(&vd->vdev_scan_io_queue_lock);
+ mutex_destroy(&vd->vdev_initialize_lock);
+ mutex_destroy(&vd->vdev_initialize_io_lock);
+ cv_destroy(&vd->vdev_initialize_io_cv);
+ cv_destroy(&vd->vdev_initialize_cv);
+ mutex_destroy(&vd->vdev_trim_lock);
+ mutex_destroy(&vd->vdev_autotrim_lock);
+ mutex_destroy(&vd->vdev_trim_io_lock);
+ cv_destroy(&vd->vdev_trim_cv);
+ cv_destroy(&vd->vdev_autotrim_cv);
+ cv_destroy(&vd->vdev_trim_io_cv);
zfs_ratelimit_fini(&vd->vdev_delay_rl);
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
}
if (vd->vdev_ms != NULL) {
- uint64_t count = vd->vdev_ms_count;
+ metaslab_group_t *mg = vd->vdev_mg;
+ metaslab_group_passivate(mg);
- metaslab_group_passivate(vd->vdev_mg);
+ uint64_t count = vd->vdev_ms_count;
for (uint64_t m = 0; m < count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
-
if (msp != NULL)
metaslab_fini(msp);
}
vd->vdev_ms = NULL;
vd->vdev_ms_count = 0;
+
+ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ ASSERT0(mg->mg_histogram[i]);
}
ASSERT0(vd->vdev_ms_count);
ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);
+ /*
+ * Physical volume size should never be larger than its max size, unless
+ * the disk has shrunk while we were reading it or the device is buggy
+ * or damaged: either way it's not safe for use, so bail out of the open.
+ */
+ if (osize > max_osize) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_OPEN_FAILED);
+ return (SET_ERROR(ENXIO));
+ }
+
/*
* Reset the vdev_reopening flag so that we actually close
* the vdev on error.
* since this would just restart the scrub we are already doing.
*/
if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
- vdev_resilver_needed(vd, NULL, NULL))
- spa_async_request(spa, SPA_ASYNC_RESILVER);
+ vdev_resilver_needed(vd, NULL, NULL)) {
+ if (dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
+ vdev_set_deferred_resilver(spa, vd);
+ else
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+ }
return (0);
}
vdev_metaslab_set_size(vdev_t *vd)
{
uint64_t asize = vd->vdev_asize;
- uint64_t ms_count = asize >> vdev_default_ms_shift;
+ uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
uint64_t ms_shift;
/*
* There are two dimensions to the metaslab sizing calculation:
* the size of the metaslab and the count of metaslabs per vdev.
- * In general, we aim for vdev_max_ms_count (200) metaslabs. The
- * range of the dimensions are as follows:
*
- * 2^29 <= ms_size <= 2^38
+ * The default values used below are a good balance between memory
+ * usage (larger metaslab size means more memory needed for loaded
+ * metaslabs; more metaslabs means more memory needed for the
+ * metaslab_t structs), metaslab load time (larger metaslabs take
+ * longer to load), and metaslab sync time (more metaslabs means
+ * more time spent syncing all of them).
+ *
+ * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
+ * The ranges of the dimensions are as follows:
+ *
+ * 2^29 <= ms_size <= 2^34
* 16 <= ms_count <= 131,072
*
* On the lower end of vdev sizes, we aim for metaslab sizes of at
* least 512MB (2^29). However, the count constraint of at least
* 16 metaslabs will override this minimum size goal.
*
* On the upper end of vdev sizes, we aim for a maximum metaslab
- * size of 256GB. However, we will cap the total count to 2^17
- * metaslabs to keep our memory footprint in check.
+ * size of 16GB. However, we will cap the total count to 2^17
+ * metaslabs to keep our memory footprint in check and let the
+ * metaslab size grow from there if that limit is hit.
*
* The net effect of applying the above constraints is summarized below.
*
- * vdev size metaslab count
- * -------------|-----------------
- * < 8GB ~16
- * 8GB - 100GB one per 512MB
- * 100GB - 50TB ~200
- * 50TB - 32PB one per 256GB
- * > 32PB ~131,072
- * -------------------------------
+ * vdev size metaslab count
+ * --------------|-----------------
+ * < 8GB ~16
+ * 8GB - 100GB one per 512MB
+ * 100GB - 3TB ~200
+ * 3TB - 2PB one per 16GB
+ * > 2PB ~131,072
+ * --------------------------------
+ *
+ * Finally, note that all of the above calculate the initial
+ * number of metaslabs. Expanding a top-level vdev will result
+ * in additional metaslabs being allocated, making it possible
+ * to exceed the zfs_vdev_ms_count_limit.
*/
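+
+	/*
+	 * Worked example of the logic below (illustrative arithmetic
+	 * only): for a 1TB (2^40 byte) vdev, ms_count = 2^40 >> 29 = 2048,
+	 * which exceeds the default target of 200, so ms_shift becomes
+	 * highbit64(2^40 / 200) = 33. That yields 8GB metaslabs, and
+	 * 2^40 >> 33 = 128 of them, consistent with the table above.
+	 */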
- if (ms_count < vdev_min_ms_count)
- ms_shift = highbit64(asize / vdev_min_ms_count);
- else if (ms_count > vdev_max_ms_count)
- ms_shift = highbit64(asize / vdev_max_ms_count);
+ if (ms_count < zfs_vdev_min_ms_count)
+ ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
+ else if (ms_count > zfs_vdev_default_ms_count)
+ ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
else
- ms_shift = vdev_default_ms_shift;
+ ms_shift = zfs_vdev_default_ms_shift;
if (ms_shift < SPA_MAXBLOCKSHIFT) {
ms_shift = SPA_MAXBLOCKSHIFT;
- } else if (ms_shift > vdev_max_ms_shift) {
- ms_shift = vdev_max_ms_shift;
+ } else if (ms_shift > zfs_vdev_max_ms_shift) {
+ ms_shift = zfs_vdev_max_ms_shift;
/* cap the total count to constrain memory footprint */
- if ((asize >> ms_shift) > vdev_ms_count_limit)
- ms_shift = highbit64(asize / vdev_ms_count_limit);
+ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
+ ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
}
vd->vdev_ms_shift = ms_shift;
if (vd->vdev_state < VDEV_STATE_DEGRADED)
return (B_FALSE);
+ if (vd->vdev_resilver_deferred)
+ return (B_FALSE);
+
if (vd->vdev_resilver_txg == 0 ||
range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
return (B_TRUE);
}
/*
- * Reassess DTLs after a config change or scrub completion.
+ * Reassess DTLs after a config change or scrub completion. If txg == 0, no
+ * write operations will be issued to the pool.
*/
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
* DTLs then reset its resilvering flag and dirty
* the top level so that we persist the change.
*/
- if (vd->vdev_resilver_txg != 0 &&
+ if (txg != 0 && vd->vdev_resilver_txg != 0 &&
range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
vd->vdev_resilver_txg = 0;
ASSERT(vd->vdev_dtl_sm != NULL);
mutex_enter(&vd->vdev_dtl_lock);
-
- /*
- * Now that we've opened the space_map we need to update
- * the in-core DTL.
- */
- space_map_update(vd->vdev_dtl_sm);
-
error = space_map_load(vd->vdev_dtl_sm,
vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
mutex_exit(&vd->vdev_dtl_lock);
}
dmu_tx_commit(tx);
-
- mutex_enter(&vd->vdev_dtl_lock);
- space_map_update(vd->vdev_dtl_sm);
- mutex_exit(&vd->vdev_dtl_lock);
}
/*
}
/*
- * Gets the checkpoint space map object from the vdev's ZAP.
- * Returns the spacemap object, or 0 if it wasn't in the ZAP
- * or the ZAP doesn't exist yet.
+ * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj
+ * will contain either the checkpoint spacemap object or zero if none exists.
+ * All other errors are returned to the caller.
*/
int
-vdev_checkpoint_sm_object(vdev_t *vd)
+vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj)
{
ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
if (vd->vdev_top_zap == 0) {
+ *sm_obj = 0;
return (0);
}
- uint64_t sm_obj = 0;
- int err = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap,
- VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &sm_obj);
-
- if (err != 0)
- VERIFY3S(err, ==, ENOENT);
+ int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj);
+ if (error == ENOENT) {
+ *sm_obj = 0;
+ error = 0;
+ }
- return (sm_obj);
+ return (error);
}
int
"asize=%llu", (u_longlong_t)vd->vdev_ashift,
(u_longlong_t)vd->vdev_asize);
return (SET_ERROR(ENXIO));
- } else if ((error = vdev_metaslab_init(vd, 0)) != 0) {
+ }
+
+ error = vdev_metaslab_init(vd, 0);
+ if (error != 0) {
vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
"[error=%d]", error);
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
return (error);
}
- uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd);
- if (checkpoint_sm_obj != 0) {
+ uint64_t checkpoint_sm_obj;
+ error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj);
+ if (error == 0 && checkpoint_sm_obj != 0) {
objset_t *mos = spa_meta_objset(vd->vdev_spa);
ASSERT(vd->vdev_asize != 0);
ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
- if ((error = space_map_open(&vd->vdev_checkpoint_sm,
+ error = space_map_open(&vd->vdev_checkpoint_sm,
mos, checkpoint_sm_obj, 0, vd->vdev_asize,
- vd->vdev_ashift))) {
+ vd->vdev_ashift);
+ if (error != 0) {
vdev_dbgmsg(vd, "vdev_load: space_map_open "
"failed for checkpoint spacemap (obj %llu) "
"[error=%d]",
return (error);
}
ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
- space_map_update(vd->vdev_checkpoint_sm);
/*
* Since the checkpoint_sm contains free entries
- * exclusively we can use sm_alloc to indicate the
- * culmulative checkpointed space that has been freed.
+ * exclusively, we can use space_map_allocated() to
+ * indicate the cumulative checkpointed space that
+ * has been freed.
*/
vd->vdev_stat.vs_checkpoint_space =
- -vd->vdev_checkpoint_sm->sm_alloc;
+ -space_map_allocated(vd->vdev_checkpoint_sm);
vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
vd->vdev_stat.vs_checkpoint_space;
+ } else if (error != 0) {
+ vdev_dbgmsg(vd, "vdev_load: failed to retrieve "
+ "checkpoint space map object from vdev ZAP "
+ "[error=%d]", error);
+ return (error);
}
}
return (error);
}
- uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
- if (obsolete_sm_object != 0) {
+ uint64_t obsolete_sm_object;
+ error = vdev_obsolete_sm_object(vd, &obsolete_sm_object);
+ if (error == 0 && obsolete_sm_object != 0) {
objset_t *mos = vd->vdev_spa->spa_meta_objset;
ASSERT(vd->vdev_asize != 0);
ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
(u_longlong_t)obsolete_sm_object, error);
return (error);
}
- space_map_update(vd->vdev_obsolete_sm);
+ } else if (error != 0) {
+ vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete "
+ "space map object from vdev ZAP [error=%d]", error);
+ return (error);
}
return (0);
}
static void
-vdev_remove_empty(vdev_t *vd, uint64_t txg)
+vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
- dmu_tx_t *tx;
+ ASSERT(vd->vdev_islog);
ASSERT(vd == vd->vdev_top);
ASSERT3U(txg, ==, spa_syncing_txg(spa));
- if (vd->vdev_ms != NULL) {
- metaslab_group_t *mg = vd->vdev_mg;
-
- metaslab_group_histogram_verify(mg);
- metaslab_class_histogram_verify(mg->mg_class);
-
- for (int m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
- if (msp == NULL || msp->ms_sm == NULL)
- continue;
-
- mutex_enter(&msp->ms_lock);
- /*
- * If the metaslab was not loaded when the vdev
- * was removed then the histogram accounting may
- * not be accurate. Update the histogram information
- * here so that we ensure that the metaslab group
- * and metaslab class are up-to-date.
- */
- metaslab_group_histogram_remove(mg, msp);
-
- VERIFY0(space_map_allocated(msp->ms_sm));
- space_map_close(msp->ms_sm);
- msp->ms_sm = NULL;
- mutex_exit(&msp->ms_lock);
- }
-
- if (vd->vdev_checkpoint_sm != NULL) {
- ASSERT(spa_has_checkpoint(spa));
- space_map_close(vd->vdev_checkpoint_sm);
- vd->vdev_checkpoint_sm = NULL;
- }
-
- metaslab_group_histogram_verify(mg);
- metaslab_class_histogram_verify(mg->mg_class);
-
- for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
- ASSERT0(mg->mg_histogram[i]);
- }
-
- tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
vdev_destroy_spacemaps(vd, tx);
-
- if (vd->vdev_islog && vd->vdev_top_zap != 0) {
+ if (vd->vdev_top_zap != 0) {
vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
vd->vdev_top_zap = 0;
}
+
dmu_tx_commit(tx);
}
ASSERT(vdev_is_concrete(vd));
- while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))))
+ while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
+ != NULL)
metaslab_sync_done(msp, txg);
if (reassess)
spa_t *spa = vd->vdev_spa;
vdev_t *lvd;
metaslab_t *msp;
- dmu_tx_t *tx;
+ ASSERT3U(txg, ==, spa->spa_syncing_txg);
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
- dmu_tx_t *tx;
-
ASSERT(vd->vdev_removing ||
vd->vdev_ops == &vdev_indirect_ops);
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
vdev_indirect_sync_obsolete(vd, tx);
- dmu_tx_commit(tx);
/*
* If the vdev is indirect, it can't have dirty
if (vd->vdev_ops == &vdev_indirect_ops) {
ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
+ dmu_tx_commit(tx);
return;
}
}
!vd->vdev_removing) {
ASSERT(vd == vd->vdev_top);
ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
ASSERT(vd->vdev_ms_array != 0);
vdev_config_dirty(vd);
- dmu_tx_commit(tx);
}
while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
vdev_dtl_sync(lvd, txg);
/*
- * Remove the metadata associated with this vdev once it's empty.
- * Note that this is typically used for log/cache device removal;
- * we don't empty toplevel vdevs when removing them. But if
- * a toplevel happens to be emptied, this is not harmful.
+ * If this is an empty log device being removed, destroy the
+ * metadata associated with it.
*/
- if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) {
- vdev_remove_empty(vd, txg);
- }
+ if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
+ vdev_remove_empty_log(vd, txg);
(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
+ dmu_tx_commit(tx);
}
uint64_t
for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) ||
spa->spa_autoexpand);
+ vd->vdev_expansion_time = gethrestime_sec();
}
vdev_reopen(tvd);
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
+ /* Restart initializing if necessary */
+ mutex_enter(&vd->vdev_initialize_lock);
+ if (vdev_writeable(vd) &&
+ vd->vdev_initialize_thread == NULL &&
+ vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
+ (void) vdev_initialize(vd);
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ /* Restart trimming if necessary */
+ mutex_enter(&vd->vdev_trim_lock);
+ if (vdev_writeable(vd) &&
+ vd->vdev_trim_thread == NULL &&
+ vd->vdev_trim_state == VDEV_TRIM_ACTIVE) {
+ (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial,
+ vd->vdev_trim_secure);
+ }
+ mutex_exit(&vd->vdev_trim_lock);
+
if (wasoffline ||
(oldstate < VDEV_STATE_DEGRADED &&
vd->vdev_state >= VDEV_STATE_DEGRADED))
*/
if (error == 0 &&
tvd->vdev_checkpoint_sm != NULL) {
- ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc,
- !=, 0);
+ ASSERT3U(space_map_allocated(
+ tvd->vdev_checkpoint_sm), !=, 0);
error = ZFS_ERR_CHECKPOINT_EXISTS;
}
vd->vdev_stat.vs_read_errors = 0;
vd->vdev_stat.vs_write_errors = 0;
vd->vdev_stat.vs_checksum_errors = 0;
+ vd->vdev_stat.vs_slow_ios = 0;
for (int c = 0; c < vd->vdev_children; c++)
vdev_clear(spa, vd->vdev_child[c]);
if (vd != rvd && vdev_writeable(vd->vdev_top))
vdev_state_dirty(vd->vdev_top);
- if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
- spa_async_request(spa, SPA_ASYNC_RESILVER);
+ if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) {
+ if (dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ spa_feature_is_enabled(spa,
+ SPA_FEATURE_RESILVER_DEFER))
+ vdev_set_deferred_resilver(spa, vd);
+ else
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+ }
spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
}
static void
vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
{
- int t;
- for (t = 0; t < ZIO_TYPES; t++) {
+ for (int t = 0; t < VS_ZIO_TYPES; t++) {
vs->vs_ops[t] += cvs->vs_ops[t];
vs->vs_bytes[t] += cvs->vs_bytes[t];
}
boolean_t
vdev_is_spacemap_addressable(vdev_t *vd)
{
+ if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
+ return (B_TRUE);
+
/*
- * Assuming 47 bits of the space map entry dedicated for the entry's
- * offset (see description in space_map.h), we calculate the maximum
- * address that can be described by a space map entry for the given
- * device.
+ * If double-word space map entries are not enabled we assume
+ * 47 bits of the space map entry are dedicated to the entry's
+ * offset (see SM_OFFSET_BITS in space_map.h). We then use that
+ * to calculate the maximum address that can be described by a
+ * space map entry for the given device.
*/
- uint64_t shift = vd->vdev_ashift + 47;
+ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
if (shift >= 63) /* detect potential overflow */
return (B_TRUE);
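+
+	/*
+	 * Illustrative arithmetic: with ashift=12, shift = 12 + 47 = 59,
+	 * so a single-word space map entry can describe any offset below
+	 * 2^59 bytes (512PiB) on that device.
+	 */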
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state;
vs->vs_rsize = vdev_get_min_asize(vd);
- if (vd->vdev_ops->vdev_op_leaf)
+ if (vd->vdev_ops->vdev_op_leaf) {
vs->vs_rsize += VDEV_LABEL_START_SIZE +
VDEV_LABEL_END_SIZE;
+ /*
+ * Report initializing progress. Since we don't
+ * have the initializing locks held, this is only
+ * an estimate (although a fairly accurate one).
+ */
+ vs->vs_initialize_bytes_done =
+ vd->vdev_initialize_bytes_done;
+ vs->vs_initialize_bytes_est =
+ vd->vdev_initialize_bytes_est;
+ vs->vs_initialize_state = vd->vdev_initialize_state;
+ vs->vs_initialize_action_time =
+ vd->vdev_initialize_action_time;
+
+ /*
+ * Report manual TRIM progress. Since we don't have
+ * the manual TRIM locks held, this is only an
+ * estimate (although a fairly accurate one).
+ */
+ vs->vs_trim_notsup = !vd->vdev_has_trim;
+ vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done;
+ vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
+ vs->vs_trim_state = vd->vdev_trim_state;
+ vs->vs_trim_action_time = vd->vdev_trim_action_time;
+ }
/*
- * Report expandable space on top-level, non-auxillary devices
+ * Report expandable space on top-level, non-auxiliary devices
* only. The expandable space is reported in terms of metaslab
* sized units since that determines how much space the pool
* can expand.
vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
vd->vdev_mg->mg_fragmentation : 0;
}
+ if (vd->vdev_ops->vdev_op_leaf)
+ vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
}
- ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_READER) != 0);
vdev_get_stats_ex_impl(vd, vs, vsx);
mutex_exit(&vd->vdev_stat_lock);
}
*/
if (vd->vdev_ops->vdev_op_leaf &&
(zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
+ zio_type_t vs_type = type;
+
+ /*
+ * TRIM ops and bytes are reported to user space as
+ * ZIO_TYPE_IOCTL. This is done to preserve the
+ * vdev_stat_t structure layout for user space.
+ */
+ if (type == ZIO_TYPE_TRIM)
+ vs_type = ZIO_TYPE_IOCTL;
- vs->vs_ops[type]++;
- vs->vs_bytes[type] += psize;
+ vs->vs_ops[vs_type]++;
+ vs->vs_bytes[vs_type] += psize;
if (flags & ZIO_FLAG_DELEGATED) {
vsx->vsx_agg_histo[zio->io_priority]
if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
return;
- mutex_enter(&vd->vdev_stat_lock);
- if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
- if (zio->io_error == ECKSUM)
- vs->vs_checksum_errors++;
- else
- vs->vs_read_errors++;
- }
- if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
- vs->vs_write_errors++;
- mutex_exit(&vd->vdev_stat_lock);
-
if (spa->spa_load_state == SPA_LOAD_NONE &&
type == ZIO_TYPE_WRITE && txg != 0 &&
(!(flags & ZIO_FLAG_IO_REPAIR) ||
}
/*
- * Update the in-core space usage stats for this vdev and the root vdev.
+ * Update the in-core space usage stats for this vdev, its metaslab class,
+ * and the root vdev.
*/
void
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
dspace_delta = vdev_deflated_space(vd, space_delta);
mutex_enter(&vd->vdev_stat_lock);
+ /* ensure we won't underflow */
+ if (alloc_delta < 0) {
+ ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta);
+ }
+
vd->vdev_stat.vs_alloc += alloc_delta;
vd->vdev_stat.vs_space += space_delta;
vd->vdev_stat.vs_dspace += dspace_delta;
/* every class but log contributes to root space stats */
if (vd->vdev_mg != NULL && !vd->vdev_islog) {
+ ASSERT(!vd->vdev_isl2cache);
mutex_enter(&rvd->vdev_stat_lock);
rvd->vdev_stat.vs_alloc += alloc_delta;
rvd->vdev_stat.vs_space += space_delta;
}
}
+void
+vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd)
+{
+ for (uint64_t i = 0; i < vd->vdev_children; i++)
+ vdev_set_deferred_resilver(spa, vd->vdev_child[i]);
+
+ if (!vd->vdev_ops->vdev_op_leaf || !vdev_writeable(vd) ||
+ range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
+ return;
+ }
+
+ vd->vdev_resilver_deferred = B_TRUE;
+ spa->spa_resilver_deferred = B_TRUE;
+}
+
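+/*
+ * Note: vdev_open() and vdev_clear() above only defer a resilver when a
+ * scan is already in progress and SPA_FEATURE_RESILVER_DEFER is enabled;
+ * the config-load path instead restores a deferral that was persisted in
+ * the vdev config. In all cases the marked vdevs are resilvered once the
+ * current scan completes.
+ */
+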
+/*
+ * Translate a logical range to the physical range for the specified vdev_t.
+ * This function is initially called with a leaf vdev and will walk each
+ * parent vdev until it reaches a top-level vdev. Once the top-level is
+ * reached the physical range is initialized and the recursive function
+ * begins to unwind. As it unwinds it calls the parent's vdev specific
+ * translation function to do the real conversion.
+ */
+void
+vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
+{
+ /*
+ * Walk up the vdev tree
+ */
+ if (vd != vd->vdev_top) {
+ vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
+ } else {
+ /*
+ * We've reached the top-level vdev, initialize the
+ * physical range to the logical range and start to
+ * unwind.
+ */
+ physical_rs->rs_start = logical_rs->rs_start;
+ physical_rs->rs_end = logical_rs->rs_end;
+ return;
+ }
+
+ vdev_t *pvd = vd->vdev_parent;
+ ASSERT3P(pvd, !=, NULL);
+ ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
+
+ /*
+ * As this recursive function unwinds, translate the logical
+ * range into its physical components by calling the
+ * vdev specific translate function.
+ */
+ range_seg_t intermediate = { { { 0, 0 } } };
+ pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
+
+ physical_rs->rs_start = intermediate.rs_start;
+ physical_rs->rs_end = intermediate.rs_end;
+}
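+
+/*
+ * A minimal usage sketch (hypothetical caller; in this change the real
+ * consumers are the initializing and TRIM threads): translate a leaf
+ * vdev's logical range into its physical range before issuing I/O.
+ *
+ *	range_seg_t logical = { .rs_start = start, .rs_end = start + size };
+ *	range_seg_t physical = { { { 0, 0 } } };
+ *	vdev_xlate(leaf_vd, &logical, &physical);
+ */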
+
#if defined(_KERNEL)
EXPORT_SYMBOL(vdev_fault);
EXPORT_SYMBOL(vdev_degrade);
EXPORT_SYMBOL(vdev_online);
EXPORT_SYMBOL(vdev_offline);
EXPORT_SYMBOL(vdev_clear);
+
/* BEGIN CSTYLED */
-module_param(vdev_max_ms_count, int, 0644);
-MODULE_PARM_DESC(vdev_max_ms_count,
+module_param(zfs_vdev_default_ms_count, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_default_ms_count,
"Target number of metaslabs per top-level vdev");
-module_param(vdev_min_ms_count, int, 0644);
-MODULE_PARM_DESC(vdev_min_ms_count,
+module_param(zfs_vdev_min_ms_count, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_min_ms_count,
"Minimum number of metaslabs per top-level vdev");
-module_param(vdev_ms_count_limit, int, 0644);
-MODULE_PARM_DESC(vdev_ms_count_limit,
+module_param(zfs_vdev_ms_count_limit, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_ms_count_limit,
"Practical upper limit of total metaslabs per top-level vdev");
-module_param(zfs_delays_per_second, uint, 0644);
-MODULE_PARM_DESC(zfs_delays_per_second, "Rate limit delay events to this many "
- "IO delays per second");
+module_param(zfs_slow_io_events_per_second, uint, 0644);
+MODULE_PARM_DESC(zfs_slow_io_events_per_second,
+ "Rate limit slow IO (delay) events to this many per second");
-module_param(zfs_checksums_per_second, uint, 0644);
- MODULE_PARM_DESC(zfs_checksums_per_second, "Rate limit checksum events "
+module_param(zfs_checksum_events_per_second, uint, 0644);
+MODULE_PARM_DESC(zfs_checksum_events_per_second, "Rate limit checksum events "
"to this many checksum errors per second (do not set below zed"
"threshold).");
module_param(vdev_validate_skip, int, 0644);
MODULE_PARM_DESC(vdev_validate_skip,
"Bypass vdev_validate()");
+
+module_param(zfs_nocacheflush, int, 0644);
+MODULE_PARM_DESC(zfs_nocacheflush, "Disable cache flushes");
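+
+/*
+ * The writable (0644) parameters above may also be tuned at runtime
+ * through sysfs, e.g.:
+ *
+ *	echo 10 > /sys/module/zfs/parameters/zfs_slow_io_events_per_second
+ */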
/* END CSTYLED */
#endif