#include <sys/dsl_scan.h>
#include <sys/abd.h>
#include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
#include <sys/zvol.h>
#include <sys/zfs_ratelimit.h>
*/
for (; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum += cvd->vdev_guid_sum;
+
+ if (cvd->vdev_ops->vdev_op_leaf) {
+ list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
+ cvd->vdev_spa->spa_leaf_list_gen++;
+ }
}
void
pvd->vdev_children = 0;
}
+ if (cvd->vdev_ops->vdev_op_leaf) {
+ spa_t *spa = cvd->vdev_spa;
+ list_remove(&spa->spa_leaf_list, cvd);
+ spa->spa_leaf_list_gen++;
+ }
+
/*
* Walk up all ancestors to update guid sum.
*/
list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node);
list_link_init(&vd->vdev_initialize_node);
+ list_link_init(&vd->vdev_leaf_node);
+ list_link_init(&vd->vdev_trim_node);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
for (int t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
vdev_free(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
+
ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+ ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
/*
* Scan queues are normally destroyed at the end of a scan. If the
ASSERT(vd->vdev_child == NULL);
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
- ASSERT(vd->vdev_initialize_thread == NULL);
/*
* Discard allocation state.
vdev_remove_child(vd->vdev_parent, vd);
ASSERT(vd->vdev_parent == NULL);
+ ASSERT(!list_link_active(&vd->vdev_leaf_node));
/*
* Clean up vdev structure.
mutex_destroy(&vd->vdev_initialize_io_lock);
cv_destroy(&vd->vdev_initialize_io_cv);
cv_destroy(&vd->vdev_initialize_cv);
+ mutex_destroy(&vd->vdev_trim_lock);
+ mutex_destroy(&vd->vdev_autotrim_lock);
+ mutex_destroy(&vd->vdev_trim_io_lock);
+ cv_destroy(&vd->vdev_trim_cv);
+ cv_destroy(&vd->vdev_autotrim_cv);
+ cv_destroy(&vd->vdev_trim_io_cv);
zfs_ratelimit_fini(&vd->vdev_delay_rl);
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
}
if (vd->vdev_ms != NULL) {
- uint64_t count = vd->vdev_ms_count;
+ metaslab_group_t *mg = vd->vdev_mg;
+ metaslab_group_passivate(mg);
- metaslab_group_passivate(vd->vdev_mg);
+ uint64_t count = vd->vdev_ms_count;
for (uint64_t m = 0; m < count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
-
if (msp != NULL)
metaslab_fini(msp);
}
vd->vdev_ms = NULL;
vd->vdev_ms_count = 0;
+
+ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ ASSERT0(mg->mg_histogram[i]);
}
ASSERT0(vd->vdev_ms_count);
ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);
+ /*
+ * Physical volume size should never be larger than its max size, unless
+ * the disk has shrunk while we were reading it or the device is buggy
+ * or damaged: either way it's not safe for use, bail out of the open.
+ */
+ if (osize > max_osize) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_OPEN_FAILED);
+ return (SET_ERROR(ENXIO));
+ }
+
/*
* Reset the vdev_reopening flag so that we actually close
* the vdev on error.
"asize=%llu", (u_longlong_t)vd->vdev_ashift,
(u_longlong_t)vd->vdev_asize);
return (SET_ERROR(ENXIO));
- } else if ((error = vdev_metaslab_init(vd, 0)) != 0) {
+ }
+
+ error = vdev_metaslab_init(vd, 0);
+ if (error != 0) {
vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
"[error=%d]", error);
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
ASSERT(vd->vdev_asize != 0);
ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
- if ((error = space_map_open(&vd->vdev_checkpoint_sm,
+ error = space_map_open(&vd->vdev_checkpoint_sm,
mos, checkpoint_sm_obj, 0, vd->vdev_asize,
- vd->vdev_ashift))) {
+ vd->vdev_ashift);
+ if (error != 0) {
vdev_dbgmsg(vd, "vdev_load: space_map_open "
"failed for checkpoint spacemap (obj %llu) "
"[error=%d]",
}
mutex_exit(&vd->vdev_initialize_lock);
+ /* Restart trimming if necessary */
+ mutex_enter(&vd->vdev_trim_lock);
+ if (vdev_writeable(vd) &&
+ vd->vdev_trim_thread == NULL &&
+ vd->vdev_trim_state == VDEV_TRIM_ACTIVE) {
+ (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial,
+ vd->vdev_trim_secure);
+ }
+ mutex_exit(&vd->vdev_trim_lock);
+
if (wasoffline ||
(oldstate < VDEV_STATE_DEGRADED &&
vd->vdev_state >= VDEV_STATE_DEGRADED))
static void
vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
{
- int t;
- for (t = 0; t < ZIO_TYPES; t++) {
+ for (int t = 0; t < VS_ZIO_TYPES; t++) {
vs->vs_ops[t] += cvs->vs_ops[t];
vs->vs_bytes[t] += cvs->vs_bytes[t];
}
vs->vs_rsize += VDEV_LABEL_START_SIZE +
VDEV_LABEL_END_SIZE;
/*
- * Report intializing progress. Since we don't
+ * Report initializing progress. Since we don't
* have the initializing locks held, this is only
* an estimate (although a fairly accurate one).
*/
vs->vs_initialize_state = vd->vdev_initialize_state;
vs->vs_initialize_action_time =
vd->vdev_initialize_action_time;
+
+ /*
+ * Report manual TRIM progress. Since we don't have
+ * the manual TRIM locks held, this is only an
+ * estimate (although fairly accurate one).
+ */
+ vs->vs_trim_notsup = !vd->vdev_has_trim;
+ vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done;
+ vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
+ vs->vs_trim_state = vd->vdev_trim_state;
+ vs->vs_trim_action_time = vd->vdev_trim_action_time;
}
/*
- * Report expandable space on top-level, non-auxillary devices
+ * Report expandable space on top-level, non-auxiliary devices
* only. The expandable space is reported in terms of metaslab
* sized units since that determines how much space the pool
* can expand.
vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
}
- ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_READER) != 0);
vdev_get_stats_ex_impl(vd, vs, vsx);
mutex_exit(&vd->vdev_stat_lock);
}
*/
if (vd->vdev_ops->vdev_op_leaf &&
(zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
+ zio_type_t vs_type = type;
- vs->vs_ops[type]++;
- vs->vs_bytes[type] += psize;
+ /*
+ * TRIM ops and bytes are reported to user space as
+ * ZIO_TYPE_IOCTL. This is done to preserve the
+ * vdev_stat_t structure layout for user space.
+ */
+ if (type == ZIO_TYPE_TRIM)
+ vs_type = ZIO_TYPE_IOCTL;
+
+ vs->vs_ops[vs_type]++;
+ vs->vs_bytes[vs_type] += psize;
if (flags & ZIO_FLAG_DELEGATED) {
vsx->vsx_agg_histo[zio->io_priority]
if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
return;
- mutex_enter(&vd->vdev_stat_lock);
- if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
- if (zio->io_error == ECKSUM)
- vs->vs_checksum_errors++;
- else
- vs->vs_read_errors++;
- }
- if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
- vs->vs_write_errors++;
- mutex_exit(&vd->vdev_stat_lock);
-
if (spa->spa_load_state == SPA_LOAD_NONE &&
type == ZIO_TYPE_WRITE && txg != 0 &&
(!(flags & ZIO_FLAG_IO_REPAIR) ||
}
/*
- * Update the in-core space usage stats for this vdev and the root vdev.
+ * Update the in-core space usage stats for this vdev, its metaslab class,
+ * and the root vdev.
*/
void
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
spa->spa_resilver_deferred = B_TRUE;
}
+/*
+ * Translate a logical range to the physical range for the specified vdev_t.
+ * This function is initially called with a leaf vdev and will walk each
+ * parent vdev until it reaches a top-level vdev. Once the top-level is
+ * reached the physical range is initialized and the recursive function
+ * begins to unwind. As it unwinds it calls the parent's vdev specific
+ * translation function to do the real conversion.
+ */
+void
+vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
+{
+ /*
+ * Walk up the vdev tree
+ */
+ if (vd != vd->vdev_top) {
+ vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
+ } else {
+ /*
+ * We've reached the top-level vdev, initialize the
+ * physical range to the logical range and start to
+ * unwind.
+ */
+ physical_rs->rs_start = logical_rs->rs_start;
+ physical_rs->rs_end = logical_rs->rs_end;
+ return;
+ }
+
+ vdev_t *pvd = vd->vdev_parent;
+ ASSERT3P(pvd, !=, NULL);
+ ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
+
+ /*
+ * As this recursive function unwinds, translate the logical
+ * range into its physical components by calling the
+ * vdev specific translate function.
+ */
+ range_seg_t intermediate = { { { 0, 0 } } };
+ pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
+
+ physical_rs->rs_start = intermediate.rs_start;
+ physical_rs->rs_end = intermediate.rs_end;
+}
+
#if defined(_KERNEL)
EXPORT_SYMBOL(vdev_fault);
EXPORT_SYMBOL(vdev_degrade);
EXPORT_SYMBOL(vdev_online);
EXPORT_SYMBOL(vdev_offline);
EXPORT_SYMBOL(vdev_clear);
+
/* BEGIN CSTYLED */
module_param(zfs_vdev_default_ms_count, int, 0644);
MODULE_PARM_DESC(zfs_vdev_default_ms_count,