* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, Datto Inc. All rights reserved.
* Copyright (c) 2021, Klara Inc.
- * Copyright [2021] Hewlett Packard Enterprise Development LP
+ * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
*/
#include <sys/zfs_context.h>
#include <sys/abd.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
+#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zfs_ratelimit.h>
#include "zfs_prop.h"
* 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced
* (by more than 1<<spa_slop_shift) due to the embedded slog metaslab.
*/
-static int zfs_embedded_slog_min_ms = 64;
+static uint_t zfs_embedded_slog_min_ms = 64;
/* default target for number of metaslabs per top-level vdev */
-static int zfs_vdev_default_ms_count = 200;
+static uint_t zfs_vdev_default_ms_count = 200;
/* minimum number of metaslabs per top-level vdev */
-static int zfs_vdev_min_ms_count = 16;
+static uint_t zfs_vdev_min_ms_count = 16;
/* practical upper limit of total metaslabs per top-level vdev */
-static int zfs_vdev_ms_count_limit = 1ULL << 17;
+static uint_t zfs_vdev_ms_count_limit = 1ULL << 17;
/* lower limit for metaslab size (512M) */
-static int zfs_vdev_default_ms_shift = 29;
+static uint_t zfs_vdev_default_ms_shift = 29;
/* upper limit for metaslab size (16G) */
-static const int zfs_vdev_max_ms_shift = 34;
+static uint_t zfs_vdev_max_ms_shift = 34;
int vdev_validate_skip = B_FALSE;
*/
int zfs_nocacheflush = 0;
-uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX;
-uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
+/*
+ * Maximum and minimum ashift values that can be automatically set based on
+ * vdev's physical ashift (disk's physical sector size). While ASHIFT_MAX
+ * is higher than the maximum value, it is intentionally limited here to not
+ * excessively impact pool space efficiency. Higher ashift values may still
+ * be forced by vdev logical ashift or by user via ashift property, but won't
+ * be set automatically as a performance optimization.
+ */
+uint_t zfs_vdev_max_auto_ashift = 14;
+uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
void
vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
* Virtual device management.
*/
-static const vdev_ops_t *const vdev_ops_table[] = {
+static vdev_ops_t *const vdev_ops_table[] = {
&vdev_root_ops,
&vdev_raidz_ops,
&vdev_draid_ops,
static vdev_ops_t *
vdev_getops(const char *type)
{
- const vdev_ops_t *ops, *const *opspp;
+ vdev_ops_t *ops, *const *opspp;
for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
if (strcmp(ops->vdev_op_type, type) == 0)
* all children. This is what's used by anything other than RAID-Z.
*/
uint64_t
-vdev_default_asize(vdev_t *vd, uint64_t psize)
+vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
{
uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
uint64_t csize;
for (int c = 0; c < vd->vdev_children; c++) {
- csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
+ csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg);
asize = MAX(asize, csize);
}
return (nparity);
}
+static int
+vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ uint64_t objid;
+ int err;
+
+ if (vd->vdev_root_zap != 0) {
+ objid = vd->vdev_root_zap;
+ } else if (vd->vdev_top_zap != 0) {
+ objid = vd->vdev_top_zap;
+ } else if (vd->vdev_leaf_zap != 0) {
+ objid = vd->vdev_leaf_zap;
+ } else {
+ return (EINVAL);
+ }
+
+ err = zap_lookup(mos, objid, vdev_prop_to_name(prop),
+ sizeof (uint64_t), 1, value);
+
+ if (err == ENOENT)
+ *value = vdev_prop_default_numeric(prop);
+
+ return (err);
+}
+
/*
* Get the number of data disks for a top-level vdev.
*/
newchild = kmem_alloc(newsize, KM_SLEEP);
if (pvd->vdev_child != NULL) {
- bcopy(pvd->vdev_child, newchild, oldsize);
+ memcpy(newchild, pvd->vdev_child, oldsize);
kmem_free(pvd->vdev_child, oldsize);
}
zfs_ratelimit_init(&vd->vdev_checksum_rl,
&zfs_checksum_events_per_second, 1);
+ /*
+ * Default Thresholds for tuning ZED
+ */
+ vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N);
+ vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T);
+ vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N);
+ vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T);
+
list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node);
list_link_init(&vd->vdev_initialize_node);
mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL);
cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
offsetof(struct vdev, vdev_dtl_node));
vd->vdev_stat.vs_timestamp = gethrtime();
vdev_queue_init(vd);
- vdev_cache_init(vd);
return (vd);
}
int alloctype)
{
vdev_ops_t *ops;
- char *type;
+ const char *type;
uint64_t guid = 0, islog;
vdev_t *vd;
vdev_indirect_config_t *vic;
- char *tmp = NULL;
+ const char *tmp = NULL;
int rc;
vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
boolean_t top_level = (parent && !parent->vdev_parent);
return (SET_ERROR(ENOTSUP));
if (top_level && alloctype == VDEV_ALLOC_ADD) {
- char *bias;
+ const char *bias;
/*
* If creating a top-level vdev, check for allocation
if (top_level && alloc_bias != VDEV_BIAS_NONE)
vd->vdev_alloc_bias = alloc_bias;
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
- vd->vdev_path = spa_strdup(vd->vdev_path);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0)
+ vd->vdev_path = spa_strdup(tmp);
/*
* ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a
vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
}
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
- vd->vdev_devid = spa_strdup(vd->vdev_devid);
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
- &vd->vdev_physpath) == 0)
- vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0)
+ vd->vdev_devid = spa_strdup(tmp);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0)
+ vd->vdev_physpath = spa_strdup(tmp);
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
- &vd->vdev_enc_sysfs_path) == 0)
- vd->vdev_enc_sysfs_path = spa_strdup(vd->vdev_enc_sysfs_path);
+ &tmp) == 0)
+ vd->vdev_enc_sysfs_path = spa_strdup(tmp);
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
- vd->vdev_fru = spa_strdup(vd->vdev_fru);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0)
+ vd->vdev_fru = spa_strdup(tmp);
/*
* Set the whole_disk property. If it's not specified, leave the value
&vd->vdev_not_present);
/*
- * Get the alignment requirement.
+ * Get the alignment requirement. Ignore pool ashift for vdev
+ * attach case.
*/
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
+ if (alloctype != VDEV_ALLOC_ATTACH) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
+ &vd->vdev_ashift);
+ } else {
+ vd->vdev_attaching = B_TRUE;
+ }
/*
* Retrieve the vdev creation time.
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
&vd->vdev_crtxg);
+ if (vd->vdev_ops == &vdev_root_ops &&
+ (alloctype == VDEV_ALLOC_LOAD ||
+ alloctype == VDEV_ALLOC_SPLIT ||
+ alloctype == VDEV_ALLOC_ROOTPOOL)) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP,
+ &vd->vdev_root_zap);
+ }
+
/*
* If we're a top-level vdev, try to load the allocation parameters.
*/
&vd->vdev_removing);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
&vd->vdev_top_zap);
+ vd->vdev_rz_expanding = nvlist_exists(nv,
+ ZPOOL_CONFIG_RAIDZ_EXPANDING);
} else {
ASSERT0(vd->vdev_top_zap);
}
&vd->vdev_removed);
if (vd->vdev_faulted || vd->vdev_degraded) {
- char *aux;
+ const char *aux;
vd->vdev_label_aux =
VDEV_AUX_ERR_EXCEEDED;
* Clean up vdev structure.
*/
vdev_queue_fini(vd);
- vdev_cache_fini(vd);
if (vd->vdev_path)
spa_strfree(vd->vdev_path);
mutex_destroy(&vd->vdev_trim_io_lock);
cv_destroy(&vd->vdev_trim_cv);
cv_destroy(&vd->vdev_autotrim_cv);
+ cv_destroy(&vd->vdev_autotrim_kick_cv);
cv_destroy(&vd->vdev_trim_io_cv);
mutex_destroy(&vd->vdev_rebuild_lock);
ASSERT(tvd == tvd->vdev_top);
- tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite;
tvd->vdev_ms_array = svd->vdev_ms_array;
tvd->vdev_ms_shift = svd->vdev_ms_shift;
tvd->vdev_ms_count = svd->vdev_ms_count;
vdev_free(mvd);
}
+/*
+ * Choose GCD for spa_gcd_alloc.
+ */
+static uint64_t
+vdev_gcd(uint64_t a, uint64_t b)
+{
+ while (b != 0) {
+ uint64_t t = b;
+ b = a % b;
+ a = t;
+ }
+ return (a);
+}
+
+/*
+ * Set spa_min_alloc and spa_gcd_alloc.
+ */
+static void
+vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc)
+{
+ if (min_alloc < spa->spa_min_alloc)
+ spa->spa_min_alloc = min_alloc;
+ if (spa->spa_gcd_alloc == INT_MAX) {
+ spa->spa_gcd_alloc = min_alloc;
+ } else {
+ spa->spa_gcd_alloc = vdev_gcd(min_alloc,
+ spa->spa_gcd_alloc);
+ }
+}
+
void
vdev_metaslab_group_create(vdev_t *vd)
{
spa->spa_min_ashift = vd->vdev_ashift;
uint64_t min_alloc = vdev_get_min_alloc(vd);
- if (min_alloc < spa->spa_min_alloc)
- spa->spa_min_alloc = min_alloc;
+ vdev_spa_set_alloc(spa, min_alloc);
}
}
}
mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
if (expanding) {
- bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
+ memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp));
vmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
}
if (txg == 0)
spa_config_exit(spa, SCL_ALLOC, FTAG);
- /*
- * Regardless whether this vdev was just added or it is being
- * expanded, the metaslab count has changed. Recalculate the
- * block limit.
- */
- spa_log_sm_set_blocklimit(spa);
-
return (0);
}
}
}
ASSERT0(vd->vdev_ms_count);
- ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
}
typedef struct vdev_probe_stats {
vd->vdev_cant_read |= !vps->vps_readable;
vd->vdev_cant_write |= !vps->vps_writeable;
+ vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u",
+ vd->vdev_cant_read, vd->vdev_cant_write);
if (vdev_readable(vd) &&
(vdev_writeable(vd) || !spa_writeable(spa))) {
vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
- ZIO_FLAG_TRYHARD;
+ ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
}
/*
- * Compute the raidz-deflation ratio. Note, we hard-code
- * in 128k (1 << 17) because it is the "typical" blocksize.
- * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
- * otherwise it would inconsistently account for existing bp's.
+ * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17)
+ * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE
+ * changed, this algorithm can not change, otherwise it would inconsistently
+ * account for existing bp's. We also hard-code txg 0 for the same reason
+ * since expanded RAIDZ vdevs can use a different asize for different birth
+ * txg's.
*/
static void
vdev_set_deflate_ratio(vdev_t *vd)
{
if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
vd->vdev_deflate_ratio = (1 << 17) /
- (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
+ (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >>
+ SPA_MINBLOCKSHIFT);
}
}
+/*
+ * Choose the best of two ashifts, preferring one between logical ashift
+ * (absolute minimum) and administrator defined maximum, otherwise take
+ * the biggest of the two.
+ */
+uint64_t
+vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b)
+{
+ if (a > logical && a <= zfs_vdev_max_auto_ashift) {
+ if (b <= logical || b > zfs_vdev_max_auto_ashift)
+ return (a);
+ else
+ return (MAX(a, b));
+ } else if (b <= logical || b > zfs_vdev_max_auto_ashift)
+ return (MAX(a, b));
+ return (b);
+}
+
/*
* Maximize performance by inflating the configured ashift for top level
* vdevs to be as close to the physical ashift as possible while maintaining
{
ASSERT(vd == vd->vdev_top);
- if (vd->vdev_ashift < vd->vdev_physical_ashift) {
+ if (vd->vdev_ashift < vd->vdev_physical_ashift &&
+ vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) {
vd->vdev_ashift = MIN(
MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
MAX(zfs_vdev_min_auto_ashift,
error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
&logical_ashift, &physical_ashift);
+
+ /* Keep the device in removed state if unplugged */
+ if (error == ENOENT && vd->vdev_removed) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED,
+ VDEV_AUX_NONE);
+ return (error);
+ }
+
/*
* Physical volume size should never be larger than its max size, unless
* the disk has shrunk while we were reading it or the device is buggy
return (SET_ERROR(EDOM));
}
- if (vd->vdev_top == vd) {
+ if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE)
vdev_ashift_optimize(vd);
- }
+ vd->vdev_attaching = B_FALSE;
}
if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
vd->vdev_ashift > ASHIFT_MAX)) {
if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
uint64_t min_alloc = vdev_get_min_alloc(vd);
- if (min_alloc < spa->spa_min_alloc)
- spa->spa_min_alloc = min_alloc;
+ vdev_spa_set_alloc(spa, min_alloc);
}
/*
}
static void
-vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
-{
- char *old, *new;
- if (svd->vdev_path != NULL && dvd->vdev_path != NULL) {
- if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) {
- zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed "
- "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
- dvd->vdev_path, svd->vdev_path);
- spa_strfree(dvd->vdev_path);
- dvd->vdev_path = spa_strdup(svd->vdev_path);
+vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid)
+{
+ if (svd != NULL && *dvd != NULL) {
+ if (strcmp(svd, *dvd) != 0) {
+ zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed "
+ "from '%s' to '%s'", (u_longlong_t)guid, prefix,
+ *dvd, svd);
+ spa_strfree(*dvd);
+ *dvd = spa_strdup(svd);
}
- } else if (svd->vdev_path != NULL) {
- dvd->vdev_path = spa_strdup(svd->vdev_path);
+ } else if (svd != NULL) {
+ *dvd = spa_strdup(svd);
zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
- (u_longlong_t)dvd->vdev_guid, dvd->vdev_path);
+ (u_longlong_t)guid, *dvd);
}
+}
+
+static void
+vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
+{
+ char *old, *new;
+
+ vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path,
+ dvd->vdev_guid);
+
+ vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid,
+ dvd->vdev_guid);
+
+ vdev_update_path("vdev_physpath", svd->vdev_physpath,
+ &dvd->vdev_physpath, dvd->vdev_guid);
/*
* Our enclosure sysfs path may have changed between imports
vd->vdev_ops->vdev_op_close(vd);
- vdev_cache_purge(vd);
-
/*
* We record the previous state before we close it, so that if we are
* doing a reopen(), we don't generate FMA ereports if we notice that
(void) vdev_validate(vd);
}
+ /*
+ * Recheck if resilver is still needed and cancel any
+ * scheduled resilver if resilver is unneeded.
+ */
+ if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) &&
+ spa->spa_async_tasks & SPA_ASYNC_RESILVER) {
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER;
+ mutex_exit(&spa->spa_async_lock);
+ }
+
/*
* Reassess parent vdev's health.
*/
if (txg != 0)
vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
- return;
+ } else {
+ mutex_enter(&vd->vdev_dtl_lock);
+ for (int t = 0; t < DTL_TYPES; t++) {
+ /* account for child's outage in parent's missing map */
+ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
+ if (t == DTL_SCRUB) {
+ /* leaf vdevs only */
+ continue;
+ }
+ if (t == DTL_PARTIAL) {
+ /* i.e. non-zero */
+ minref = 1;
+ } else if (vdev_get_nparity(vd) != 0) {
+ /* RAIDZ, DRAID */
+ minref = vdev_get_nparity(vd) + 1;
+ } else {
+ /* any kind of mirror */
+ minref = vd->vdev_children;
+ }
+ space_reftree_create(&reftree);
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ mutex_enter(&cvd->vdev_dtl_lock);
+ space_reftree_add_map(&reftree,
+ cvd->vdev_dtl[s], 1);
+ mutex_exit(&cvd->vdev_dtl_lock);
+ }
+ space_reftree_generate_map(&reftree,
+ vd->vdev_dtl[t], minref);
+ space_reftree_destroy(&reftree);
+ }
+ mutex_exit(&vd->vdev_dtl_lock);
}
- mutex_enter(&vd->vdev_dtl_lock);
- for (int t = 0; t < DTL_TYPES; t++) {
- /* account for child's outage in parent's missing map */
- int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
- if (t == DTL_SCRUB)
- continue; /* leaf vdevs only */
- if (t == DTL_PARTIAL)
- minref = 1; /* i.e. non-zero */
- else if (vdev_get_nparity(vd) != 0)
- minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */
- else
- minref = vd->vdev_children; /* any kind of mirror */
- space_reftree_create(&reftree);
- for (int c = 0; c < vd->vdev_children; c++) {
- vdev_t *cvd = vd->vdev_child[c];
- mutex_enter(&cvd->vdev_dtl_lock);
- space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
- mutex_exit(&cvd->vdev_dtl_lock);
- }
- space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
- space_reftree_destroy(&reftree);
+ if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) {
+ raidz_dtl_reassessed(vd);
}
- mutex_exit(&vd->vdev_dtl_lock);
+}
+
+/*
+ * Iterate over all the vdevs except spare, and post kobj events
+ */
+void
+vdev_post_kobj_evt(vdev_t *vd)
+{
+ if (vd->vdev_ops->vdev_op_kobj_evt_post &&
+ vd->vdev_kobj_flag == B_FALSE) {
+ vd->vdev_kobj_flag = B_TRUE;
+ vd->vdev_ops->vdev_op_kobj_evt_post(vd);
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_post_kobj_evt(vd->vdev_child[c]);
+}
+
+/*
+ * Iterate over all the vdevs except spare, and clear kobj events
+ */
+void
+vdev_clear_kobj_evt(vdev_t *vd)
+{
+ vd->vdev_kobj_flag = B_FALSE;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_clear_kobj_evt(vd->vdev_child[c]);
}
int
vdev_zap_allocation_data(vd, tx);
}
}
+ if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap == 0 &&
+ spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) {
+ if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2))
+ spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx);
+ vd->vdev_root_zap = vdev_create_link_zap(vd, tx);
+ }
for (uint64_t i = 0; i < vd->vdev_children; i++) {
vdev_construct_zaps(vd->vdev_child[i], tx);
vdev_set_deflate_ratio(vd);
+ if (vd->vdev_ops == &vdev_raidz_ops) {
+ error = vdev_raidz_load(vd);
+ if (error != 0)
+ return (error);
+ }
+
/*
* On spa_load path, grab the allocation bias from our zap
*/
}
}
+ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
+ spa_t *spa = vd->vdev_spa;
+ uint64_t failfast;
+
+ error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
+ vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast),
+ 1, &failfast);
+ if (error == 0) {
+ vd->vdev_failfast = failfast & 1;
+ } else if (error == ENOENT) {
+ vd->vdev_failfast = vdev_prop_default_numeric(
+ VDEV_PROP_FAILFAST);
+ } else {
+ vdev_dbgmsg(vd,
+ "vdev_load: zap_lookup(top_zap=%llu) "
+ "failed [error=%d]",
+ (u_longlong_t)vd->vdev_top_zap, error);
+ }
+ }
+
/*
* Load any rebuild state from the top-level vdev zap.
*/
}
}
+ if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) {
+ uint64_t zapobj;
+
+ if (vd->vdev_top_zap != 0)
+ zapobj = vd->vdev_top_zap;
+ else
+ zapobj = vd->vdev_leaf_zap;
+
+ error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N,
+ &vd->vdev_checksum_n);
+ if (error && error != ENOENT)
+ vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+ "failed [error=%d]", (u_longlong_t)zapobj, error);
+
+ error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T,
+ &vd->vdev_checksum_t);
+ if (error && error != ENOENT)
+ vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+ "failed [error=%d]", (u_longlong_t)zapobj, error);
+
+ error = vdev_prop_get_int(vd, VDEV_PROP_IO_N,
+ &vd->vdev_io_n);
+ if (error && error != ENOENT)
+ vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+ "failed [error=%d]", (u_longlong_t)zapobj, error);
+
+ error = vdev_prop_get_int(vd, VDEV_PROP_IO_T,
+ &vd->vdev_io_t);
+ if (error && error != ENOENT)
+ vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+ "failed [error=%d]", (u_longlong_t)zapobj, error);
+ }
+
/*
* If this is a top-level vdev, initialize its metaslabs.
*/
dmu_tx_commit(tx);
}
+/*
+ * Return the amount of space that should be (or was) allocated for the given
+ * psize (compressed block size) in the given TXG. Note that for expanded
+ * RAIDZ vdevs, the size allocated for older BP's may be larger. See
+ * vdev_raidz_asize().
+ */
+uint64_t
+vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg)
+{
+ return (vd->vdev_ops->vdev_op_asize(vd, psize, txg));
+}
+
uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
- return (vd->vdev_ops->vdev_op_asize(vd, psize));
+ return (vdev_psize_to_asize_txg(vd, psize, 0));
}
/*
return (spa_vdev_state_exit(spa, vd, 0));
}
+int
+vdev_remove_wanted(spa_t *spa, uint64_t guid)
+{
+ vdev_t *vd;
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
+
+ /*
+ * If the vdev is already removed, or expanding which can trigger
+ * repartition add/remove events, then don't do anything.
+ */
+ if (vd->vdev_removed || vd->vdev_expanding)
+ return (spa_vdev_state_exit(spa, NULL, 0));
+
+ /*
+ * Confirm the vdev has been removed, otherwise don't do anything.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL)))
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST)));
+
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(spa, SPA_ASYNC_REMOVE);
+
+ return (spa_vdev_state_exit(spa, vd, 0));
+}
+
+
/*
* Online the given vdev.
*
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
- if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
-
wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
oldstate = vd->vdev_state;
/* XXX - L2ARC 1.0 does not support expansion */
if (vd->vdev_aux)
return (spa_vdev_state_exit(spa, vd, ENOTSUP));
+ spa->spa_ccw_fail_time = 0;
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
if (wasoffline ||
(oldstate < VDEV_STATE_DEGRADED &&
- vd->vdev_state >= VDEV_STATE_DEGRADED))
+ vd->vdev_state >= VDEV_STATE_DEGRADED)) {
spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
+ /*
+ * Asynchronously detach spare vdev if resilver or
+ * rebuild is not required
+ */
+ if (vd->vdev_unspare &&
+ !dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) &&
+ !vdev_rebuild_active(tvd))
+ spa_async_request(spa, SPA_ASYNC_DETACH_SPARE);
+ }
return (spa_vdev_state_exit(spa, vd, 0));
}
vdev_clear(spa, vd->vdev_child[c]);
/*
- * It makes no sense to "clear" an indirect vdev.
+ * It makes no sense to "clear" an indirect or removed vdev.
*/
- if (!vdev_is_concrete(vd))
+ if (!vdev_is_concrete(vd) || vd->vdev_removed)
return;
/*
memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
- for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
- vsx->vsx_active_queue[t] =
- vd->vdev_queue.vq_class[t].vqc_active;
- vsx->vsx_pend_queue[t] = avl_numnodes(
- &vd->vdev_queue.vq_class[t].vqc_queued_tree);
+ for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
+ vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
+ vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
}
}
}
vdev_t *tvd = vd->vdev_top;
mutex_enter(&vd->vdev_stat_lock);
if (vs) {
- bcopy(&vd->vdev_stat, vs, sizeof (*vs));
+ memcpy(vs, &vd->vdev_stat, sizeof (*vs));
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state;
vs->vs_rsize = vdev_get_min_asize(vd);
vs->vs_configured_ashift = vd->vdev_top != NULL
? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
vs->vs_logical_ashift = vd->vdev_logical_ashift;
- vs->vs_physical_ashift = vd->vdev_physical_ashift;
+ if (vd->vdev_physical_ashift <= ASHIFT_MAX)
+ vs->vs_physical_ashift = vd->vdev_physical_ashift;
+ else
+ vs->vs_physical_ashift = 0;
/*
* Report fragmentation and rebuild progress for top-level,
vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
vdev_t *pvd;
uint64_t txg = zio->io_txg;
+/* Suppress ASAN false positive */
+#ifdef __SANITIZE_ADDRESS__
vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL;
vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL;
+#else
+ vdev_stat_t *vs = &vd->vdev_stat;
+ vdev_stat_ex_t *vsx = &vd->vdev_stat_ex;
+#endif
zio_type_t type = zio->io_type;
int flags = zio->io_flags;
vdev_set_deflate_ratio(vd);
- if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
+ if ((vd->vdev_spa->spa_raidz_expand == NULL ||
+ vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) &&
+ (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
vdev_is_concrete(vd)) {
vdev_metaslab_group_create(vd);
VERIFY(vdev_metaslab_init(vd, txg) == 0);
{
vdev_t *cvd, *pvd = vd->vdev_parent;
+ VERIFY3U(pvd->vdev_children, >, 1);
+
vdev_remove_child(pvd, vd);
vdev_compact_children(pvd);
+ ASSERT3P(pvd->vdev_child, !=, NULL);
+
cvd = pvd->vdev_child[0];
if (pvd->vdev_children == 1) {
vdev_remove_parent(cvd);
}
void
-vdev_deadman(vdev_t *vd, char *tag)
+vdev_deadman(vdev_t *vd, const char *tag)
{
for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
vdev_queue_t *vq = &vd->vdev_queue;
mutex_enter(&vq->vq_lock);
- if (avl_numnodes(&vq->vq_active_tree) > 0) {
+ if (vq->vq_active > 0) {
spa_t *spa = vd->vdev_spa;
zio_t *fio;
uint64_t delta;
- zfs_dbgmsg("slow vdev: %s has %lu active IOs",
- vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
+ zfs_dbgmsg("slow vdev: %s has %u active IOs",
+ vd->vdev_path, vq->vq_active);
/*
* Look at the head of all the pending queues,
* if any I/O has been outstanding for longer than
* the spa_deadman_synctime invoke the deadman logic.
*/
- fio = avl_first(&vq->vq_active_tree);
+ fio = list_head(&vq->vq_active_list);
delta = gethrtime() - fio->io_timestamp;
if (delta > spa_deadman_synctime(spa))
zio_deadman(fio, tag);
* Add a (source=src, propname=propval) list to an nvlist.
*/
static void
-vdev_prop_add_list(nvlist_t *nvl, const char *propname, char *strval,
+vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval,
uint64_t intval, zprop_source_t src)
{
nvlist_t *propval;
objset_t *mos = spa->spa_meta_objset;
nvpair_t *elem = NULL;
uint64_t vdev_guid;
+ uint64_t objid;
nvlist_t *nvprops;
vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV);
if (vd == NULL)
return;
+ /*
+ * Set vdev property values in the vdev props mos object.
+ */
+ if (vd->vdev_root_zap != 0) {
+ objid = vd->vdev_root_zap;
+ } else if (vd->vdev_top_zap != 0) {
+ objid = vd->vdev_top_zap;
+ } else if (vd->vdev_leaf_zap != 0) {
+ objid = vd->vdev_leaf_zap;
+ } else {
+ panic("unexpected vdev type");
+ }
+
mutex_enter(&spa->spa_props_lock);
while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
- uint64_t intval, objid = 0;
- char *strval;
+ uint64_t intval;
+ const char *strval;
vdev_prop_t prop;
const char *propname = nvpair_name(elem);
zprop_type_t proptype;
- /*
- * Set vdev property values in the vdev props mos object.
- */
- if (vd->vdev_top_zap != 0) {
- objid = vd->vdev_top_zap;
- } else if (vd->vdev_leaf_zap != 0) {
- objid = vd->vdev_leaf_zap;
- } else {
- panic("vdev not top or leaf");
- }
-
switch (prop = vdev_name_to_prop(propname)) {
- case VDEV_PROP_USER:
+ case VDEV_PROP_USERPROP:
if (vdev_prop_user(propname)) {
strval = fnvpair_value_string(elem);
if (strlen(strval) == 0) {
nvpair_t *elem = NULL;
uint64_t vdev_guid;
nvlist_t *nvprops;
- int error;
+ int error = 0;
ASSERT(vd != NULL);
+ /* Check that vdev has a zap we can use */
+ if (vd->vdev_root_zap == 0 &&
+ vd->vdev_top_zap == 0 &&
+ vd->vdev_leaf_zap == 0)
+ return (SET_ERROR(EINVAL));
+
if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV,
&vdev_guid) != 0)
return (SET_ERROR(EINVAL));
return (SET_ERROR(EINVAL));
while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
- char *propname = nvpair_name(elem);
+ const char *propname = nvpair_name(elem);
vdev_prop_t prop = vdev_name_to_prop(propname);
uint64_t intval = 0;
- char *strval = NULL;
+ const char *strval = NULL;
- if (prop == VDEV_PROP_USER && !vdev_prop_user(propname)) {
+ if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) {
error = EINVAL;
goto end;
}
else
error = spa_vdev_alloc(spa, vdev_guid);
break;
+ case VDEV_PROP_FAILFAST:
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ vd->vdev_failfast = intval & 1;
+ break;
+ case VDEV_PROP_CHECKSUM_N:
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ vd->vdev_checksum_n = intval;
+ break;
+ case VDEV_PROP_CHECKSUM_T:
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ vd->vdev_checksum_t = intval;
+ break;
+ case VDEV_PROP_IO_N:
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ vd->vdev_io_n = intval;
+ break;
+ case VDEV_PROP_IO_T:
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ vd->vdev_io_t = intval;
+ break;
default:
/* Most processing is done in vdev_props_set_sync */
break;
nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops);
- if (vd->vdev_top_zap != 0) {
+ if (vd->vdev_root_zap != 0) {
+ objid = vd->vdev_root_zap;
+ } else if (vd->vdev_top_zap != 0) {
objid = vd->vdev_top_zap;
} else if (vd->vdev_leaf_zap != 0) {
objid = vd->vdev_leaf_zap;
KM_SLEEP);
for (uint64_t i = 0; i < vd->vdev_children;
i++) {
- char *vname;
+ const char *vname;
vname = vdev_name(vd->vdev_child[i],
namebuf, sizeof (namebuf));
vdev_prop_add_list(outnvl, propname, NULL,
vd->vdev_removing, ZPROP_SRC_NONE);
continue;
+ case VDEV_PROP_RAIDZ_EXPANDING:
+ /* Only expose this for raidz */
+ if (vd->vdev_ops == &vdev_raidz_ops) {
+ vdev_prop_add_list(outnvl, propname,
+ NULL, vd->vdev_rz_expanding,
+ ZPROP_SRC_NONE);
+ }
+ continue;
/* Numeric Properites */
case VDEV_PROP_ALLOCATING:
+ /* Leaf vdevs cannot have this property */
+ if (vd->vdev_mg == NULL &&
+ vd->vdev_top != NULL) {
+ src = ZPROP_SRC_NONE;
+ intval = ZPROP_BOOLEAN_NA;
+ } else {
+ err = vdev_prop_get_int(vd, prop,
+ &intval);
+ if (err && err != ENOENT)
+ break;
+
+ if (intval ==
+ vdev_prop_default_numeric(prop))
+ src = ZPROP_SRC_DEFAULT;
+ else
+ src = ZPROP_SRC_LOCAL;
+ }
+
+ vdev_prop_add_list(outnvl, propname, NULL,
+ intval, src);
+ break;
+ case VDEV_PROP_FAILFAST:
src = ZPROP_SRC_LOCAL;
strval = NULL;
err = zap_lookup(mos, objid, nvpair_name(elem),
sizeof (uint64_t), 1, &intval);
if (err == ENOENT) {
- intval =
- vdev_prop_default_numeric(prop);
+ intval = vdev_prop_default_numeric(
+ prop);
err = 0;
- } else if (err)
+ } else if (err) {
break;
+ }
if (intval == vdev_prop_default_numeric(prop))
src = ZPROP_SRC_DEFAULT;
- /* Leaf vdevs cannot have this property */
- if (vd->vdev_mg == NULL &&
- vd->vdev_top != NULL) {
- src = ZPROP_SRC_NONE;
- intval = ZPROP_BOOLEAN_NA;
- }
-
vdev_prop_add_list(outnvl, propname, strval,
intval, src);
break;
+ case VDEV_PROP_CHECKSUM_N:
+ case VDEV_PROP_CHECKSUM_T:
+ case VDEV_PROP_IO_N:
+ case VDEV_PROP_IO_T:
+ err = vdev_prop_get_int(vd, prop, &intval);
+ if (err && err != ENOENT)
+ break;
+
+ if (intval == vdev_prop_default_numeric(prop))
+ src = ZPROP_SRC_DEFAULT;
+ else
+ src = ZPROP_SRC_LOCAL;
+
+ vdev_prop_add_list(outnvl, propname, NULL,
+ intval, src);
+ break;
/* Text Properties */
case VDEV_PROP_COMMENT:
/* Exists in the ZAP below */
/* FALLTHRU */
- case VDEV_PROP_USER:
+ case VDEV_PROP_USERPROP:
/* User Properites */
src = ZPROP_SRC_LOCAL;
strval = NULL;
zprop_source_t src = ZPROP_SRC_DEFAULT;
propname = za.za_name;
- prop = vdev_name_to_prop(propname);
switch (za.za_integer_length) {
case 8:
EXPORT_SYMBOL(vdev_offline);
EXPORT_SYMBOL(vdev_clear);
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW,
"Target number of metaslabs per top-level vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, INT, ZMOD_RW,
- "Default limit for metaslab size");
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW,
+ "Default lower limit for metaslab size");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW,
+ "Default upper limit for metaslab size");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW,
"Minimum number of metaslabs per top-level vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW,
"Practical upper limit of total metaslabs per top-level vdev");
ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
"Disable cache flushes");
-ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW,
"Minimum number of metaslabs required to dedicate one for log blocks");
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
- param_set_min_auto_ashift, param_get_ulong, ZMOD_RW,
+ param_set_min_auto_ashift, param_get_uint, ZMOD_RW,
"Minimum ashift used when creating new top-level vdevs");
ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift,
- param_set_max_auto_ashift, param_get_ulong, ZMOD_RW,
+ param_set_max_auto_ashift, param_get_uint, ZMOD_RW,
"Maximum ashift used when optimizing for logical -> physical sector "
"size on new top-level vdevs");
/* END CSTYLED */