*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
+#include <sys/zil.h>
+#include <sys/dsl_scan.h>
/*
* Virtual device management.
&vdev_disk_ops,
&vdev_file_ops,
&vdev_missing_ops,
+ &vdev_hole_ops,
NULL
};
{
uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
uint64_t csize;
- uint64_t c;
+ int c;
for (c = 0; c < vd->vdev_children; c++) {
csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
}
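+	/*
+	 * e.g. (illustrative numbers): with ashift 12 (4K sectors),
+	 * P2ROUNDUP(6000, 1ULL << 12) rounds a 6000-byte psize up to
+	 * 8192 bytes of asize.
+	 */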
/*
- * Get the replaceable or attachable device size.
- * If the parent is a mirror or raidz, the replaceable size is the minimum
- * psize of all its children. For the rest, just return our own psize.
- *
- * e.g.
- * psize rsize
- * root - -
- * mirror/raidz - -
- * disk1 20g 20g
- * disk2 40g 20g
- * disk3 80g 80g
+ * Get the minimum allocatable size. We define the allocatable size as
+ * the vdev's asize rounded to the nearest metaslab. This allows us to
+ * replace or attach devices which don't have the same physical size but
+ * can still satisfy the same number of allocations.
*/
uint64_t
-vdev_get_rsize(vdev_t *vd)
+vdev_get_min_asize(vdev_t *vd)
{
- vdev_t *pvd, *cvd;
- uint64_t c, rsize;
+ vdev_t *pvd = vd->vdev_parent;
- pvd = vd->vdev_parent;
+ /*
+	 * If our parent is NULL (inactive spare or cache) or is the root,
+ * just return our own asize.
+ */
+ if (pvd == NULL)
+ return (vd->vdev_asize);
/*
- * If our parent is NULL or the root, just return our own psize.
+ * The top-level vdev just returns the allocatable size rounded
+ * to the nearest metaslab.
*/
- if (pvd == NULL || pvd->vdev_parent == NULL)
- return (vd->vdev_psize);
+ if (vd == vd->vdev_top)
+ return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
- rsize = 0;
+ /*
+ * The allocatable space for a raidz vdev is N * sizeof(smallest child),
+ * so each child must provide at least 1/Nth of its asize.
+ */
+ if (pvd->vdev_ops == &vdev_raidz_ops)
+ return (pvd->vdev_min_asize / pvd->vdev_children);
- for (c = 0; c < pvd->vdev_children; c++) {
- cvd = pvd->vdev_child[c];
- rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
- }
+ return (pvd->vdev_min_asize);
+}
- return (rsize);
+void
+vdev_set_min_asize(vdev_t *vd)
+{
+ int c;
+ vd->vdev_min_asize = vdev_get_min_asize(vd);
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_set_min_asize(vd->vdev_child[c]);
}
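+/*
+ * e.g. (illustrative numbers): a 5-child raidz top-level vdev with a
+ * min_asize of 500g requires 500g / 5 = 100g from each child, so a
+ * replacement disk only needs to cover 1/Nth of the top-level asize
+ * rather than match the original disk's physical size.
+ */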
vdev_t *
vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
- int c;
vdev_t *mvd;
+ int c;
if (vd->vdev_guid == guid)
return (vd);
*/
for (; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum += cvd->vdev_guid_sum;
-
- if (cvd->vdev_ops->vdev_op_leaf)
- cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit;
}
void
*/
for (; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
-
- if (cvd->vdev_ops->vdev_op_leaf)
- cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit;
}
/*
{
vdev_t **newchild, *cvd;
int oldc = pvd->vdev_children;
- int newc, c;
+ int newc;
+ int c;
ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
/*
* Allocate and minimally initialize a vdev_t.
*/
-static vdev_t *
+vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
vdev_t *vd;
+ int t;
vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
spa->spa_root_vdev = vd;
}
- if (guid == 0) {
+ if (guid == 0 && ops != &vdev_hole_ops) {
if (spa->spa_root_vdev == vd) {
/*
* The root vdev's guid will also be the pool guid,
* which must be unique among all pools.
*/
- while (guid == 0 || spa_guid_exists(guid, 0))
- guid = spa_get_random(-1ULL);
+ guid = spa_generate_guid(NULL);
} else {
/*
* Any other vdev's guid must be unique within the pool.
*/
- while (guid == 0 ||
- spa_guid_exists(spa_guid(spa), guid))
- guid = spa_get_random(-1ULL);
+ guid = spa_generate_guid(spa);
}
ASSERT(!spa_guid_exists(spa_guid(spa), guid));
}
vd->vdev_guid_sum = guid;
vd->vdev_ops = ops;
vd->vdev_state = VDEV_STATE_CLOSED;
+ vd->vdev_ishole = (ops == &vdev_hole_ops);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
- for (int t = 0; t < DTL_TYPES; t++) {
+ for (t = 0; t < DTL_TYPES; t++) {
space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
&vd->vdev_dtl_lock);
}
} else if (alloctype == VDEV_ALLOC_L2CACHE) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
return (EINVAL);
+ } else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (EINVAL);
}
/*
if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
return (ENOTSUP);
+ if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
+ return (ENOTSUP);
+
/*
* Set the nparity property for RAID-Z vdevs.
*/
if (ops == &vdev_raidz_ops) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
&nparity) == 0) {
- /*
- * Currently, we can only support 2 parity devices.
- */
- if (nparity == 0 || nparity > 2)
+ if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
return (EINVAL);
/*
- * Older versions can only support 1 parity device.
+ * Previous versions could only support 1 or 2 parity
+			 * devices.
*/
- if (nparity == 2 &&
- spa_version(spa) < SPA_VERSION_RAID6)
+ if (nparity > 1 &&
+ spa_version(spa) < SPA_VERSION_RAIDZ2)
+ return (ENOTSUP);
+ if (nparity > 2 &&
+ spa_version(spa) < SPA_VERSION_RAIDZ3)
return (ENOTSUP);
} else {
/*
* We require the parity to be specified for SPAs that
* support multiple parity levels.
*/
- if (spa_version(spa) >= SPA_VERSION_RAID6)
+ if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
return (EINVAL);
/*
* Otherwise, we default to 1 parity device for RAID-Z.
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
&vd->vdev_physpath) == 0)
vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
+ vd->vdev_fru = spa_strdup(vd->vdev_fru);
/*
* Set the whole_disk property. If it's not specified, leave the value
* Look for the 'not present' flag. This will only be set if the device
* was not present at the time of import.
*/
- if (!spa->spa_import_faulted)
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
- &vd->vdev_not_present);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+ &vd->vdev_not_present);
/*
* Get the alignment requirement.
*/
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
+ /*
+ * Retrieve the vdev creation time.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+ &vd->vdev_crtxg);
+
/*
* If we're a top-level vdev, try to load the allocation parameters.
*/
- if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
+ if (parent && !parent->vdev_parent &&
+ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
&vd->vdev_ms_array);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
&vd->vdev_ms_shift);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
&vd->vdev_asize);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
+ &vd->vdev_removing);
+ }
+
+ if (parent && !parent->vdev_parent) {
+ ASSERT(alloctype == VDEV_ALLOC_LOAD ||
+ alloctype == VDEV_ALLOC_ADD ||
+ alloctype == VDEV_ALLOC_SPLIT ||
+ alloctype == VDEV_ALLOC_ROOTPOOL);
+ vd->vdev_mg = metaslab_group_create(islog ?
+ spa_log_class(spa) : spa_normal_class(spa), vd);
}
/*
* If we're a leaf vdev, try to load the DTL object and other state.
*/
if (vd->vdev_ops->vdev_op_leaf &&
- (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) {
+ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
+ alloctype == VDEV_ALLOC_ROOTPOOL)) {
if (alloctype == VDEV_ALLOC_LOAD) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
&vd->vdev_dtl_smo.smo_object);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
&vd->vdev_unspare);
}
+
+ if (alloctype == VDEV_ALLOC_ROOTPOOL) {
+ uint64_t spare = 0;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
+ &spare) == 0 && spare)
+ spa_spare_add(vd);
+ }
+
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
&vd->vdev_offline);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING,
+ &vd->vdev_resilvering);
+
/*
* When importing a pool, we want to ignore the persistent fault
* state, as the diagnosis made on another system may not be
- * valid in the current context.
+ * valid in the current context. Local vdevs will
+ * remain in the faulted state.
*/
- if (spa->spa_load_state == SPA_LOAD_OPEN) {
+ if (spa_load_state(spa) == SPA_LOAD_OPEN) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
&vd->vdev_faulted);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
&vd->vdev_degraded);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
&vd->vdev_removed);
+
+ if (vd->vdev_faulted || vd->vdev_degraded) {
+ char *aux;
+
+ vd->vdev_label_aux =
+ VDEV_AUX_ERR_EXCEEDED;
+ if (nvlist_lookup_string(nv,
+ ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
+ strcmp(aux, "external") == 0)
+ vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
+ }
}
}
void
vdev_free(vdev_t *vd)
{
- int c;
+ int c, t;
spa_t *spa = vd->vdev_spa;
/*
vdev_close(vd);
ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
+ ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
/*
* Free all children.
/*
* Discard allocation state.
*/
- if (vd == vd->vdev_top)
+ if (vd->vdev_mg != NULL) {
vdev_metaslab_fini(vd);
+ metaslab_group_destroy(vd->vdev_mg);
+ }
ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
spa_strfree(vd->vdev_devid);
if (vd->vdev_physpath)
spa_strfree(vd->vdev_physpath);
+ if (vd->vdev_fru)
+ spa_strfree(vd->vdev_fru);
if (vd->vdev_isspare)
spa_spare_remove(vd);
txg_list_destroy(&vd->vdev_dtl_list);
mutex_enter(&vd->vdev_dtl_lock);
- for (int t = 0; t < DTL_TYPES; t++) {
+ for (t = 0; t < DTL_TYPES; t++) {
space_map_unload(&vd->vdev_dtl[t]);
space_map_destroy(&vd->vdev_dtl[t]);
}
mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
mvd->vdev_asize = cvd->vdev_asize;
+ mvd->vdev_min_asize = cvd->vdev_min_asize;
mvd->vdev_ashift = cvd->vdev_ashift;
mvd->vdev_state = cvd->vdev_state;
+ mvd->vdev_crtxg = cvd->vdev_crtxg;
vdev_remove_child(pvd, cvd);
vdev_add_child(pvd, mvd);
*/
if (mvd->vdev_top == mvd) {
uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
+ cvd->vdev_orig_guid = cvd->vdev_guid;
cvd->vdev_guid += guid_delta;
cvd->vdev_guid_sum += guid_delta;
}
{
spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_meta_objset;
- metaslab_class_t *mc;
uint64_t m;
uint64_t oldc = vd->vdev_ms_count;
uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
metaslab_t **mspp;
int error;
- if (vd->vdev_ms_shift == 0) /* not being allocated from yet */
+ ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+ /*
+ * This vdev is not being allocated from yet or is a hole.
+ */
+ if (vd->vdev_ms_shift == 0)
return (0);
- ASSERT(oldc <= newc);
+ ASSERT(!vd->vdev_ishole);
- if (vd->vdev_islog)
- mc = spa->spa_log_class;
- else
- mc = spa->spa_normal_class;
+ /*
+ * Compute the raidz-deflation ratio. Note, we hard-code
+ * in 128k (1 << 17) because it is the current "typical" blocksize.
+ * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
+ * or we will inconsistently account for existing bp's.
+ */
+ vd->vdev_deflate_ratio = (1 << 17) /
+ (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
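+	/*
+	 * e.g. on a plain disk where a 128k psize maps to a 128k asize,
+	 * this yields 131072 / (131072 >> 9) = 512, i.e. no deflation;
+	 * a raidz vdev gets a smaller ratio that discounts parity when
+	 * raw asize is later converted to dspace.
+	 */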
- if (vd->vdev_mg == NULL)
- vd->vdev_mg = metaslab_group_create(mc, vd);
+ ASSERT(oldc <= newc);
mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
if (txg == 0) {
uint64_t object = 0;
error = dmu_read(mos, vd->vdev_ms_array,
- m * sizeof (uint64_t), sizeof (uint64_t), &object);
+ m * sizeof (uint64_t), sizeof (uint64_t), &object,
+ DMU_READ_PREFETCH);
if (error)
return (error);
if (object != 0) {
m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
}
+ if (txg == 0)
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
+
+ /*
+ * If the vdev is being removed we don't activate
+ * the metaslabs since we want to ensure that no new
+ * allocations are performed on this device.
+ */
+ if (oldc == 0 && !vd->vdev_removing)
+ metaslab_group_activate(vd->vdev_mg);
+
+ if (txg == 0)
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+
return (0);
}
uint64_t count = vd->vdev_ms_count;
if (vd->vdev_ms != NULL) {
+ metaslab_group_passivate(vd->vdev_mg);
for (m = 0; m < count; m++)
if (vd->vdev_ms[m] != NULL)
metaslab_fini(vd->vdev_ms[m]);
spa_t *spa = vd->vdev_spa;
vdev_probe_stats_t *vps = NULL;
zio_t *pio;
+ int l;
ASSERT(vd->vdev_ops->vdev_op_leaf);
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
- ZIO_FLAG_DONT_RETRY;
+ ZIO_FLAG_TRYHARD;
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
vdev_probe_done, vps,
vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
+ /*
+ * We can't change the vdev state in this context, so we
+ * kick off an async task to do it on our behalf.
+ */
if (zio != NULL) {
vd->vdev_probe_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_PROBE);
return (NULL);
}
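+	/*
+	 * Read the vl_pad2 region of labels 1 through 3 with checksums
+	 * off; the contents are not interpreted, a successful read just
+	 * shows the device can answer I/O.
+	 */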
- for (int l = 1; l < VDEV_LABELS; l++) {
+ for (l = 1; l < VDEV_LABELS; l++) {
zio_nowait(zio_read_phys(pio, vd,
vdev_label_offset(vd->vdev_psize, l,
- offsetof(vdev_label_t, vl_pad)),
- VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE),
+ offsetof(vdev_label_t, vl_pad2)),
+ VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
}
return (NULL);
}
+static void
+vdev_open_child(void *arg)
+{
+ vdev_t *vd = arg;
+
+ vd->vdev_open_thread = curthread;
+ vd->vdev_open_error = vdev_open(vd);
+ vd->vdev_open_thread = NULL;
+}
+
+boolean_t
+vdev_uses_zvols(vdev_t *vd)
+{
+ int c;
+
+ if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
+ strlen(ZVOL_DIR)) == 0)
+ return (B_TRUE);
+ for (c = 0; c < vd->vdev_children; c++)
+ if (vdev_uses_zvols(vd->vdev_child[c]))
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+void
+vdev_open_children(vdev_t *vd)
+{
+ taskq_t *tq;
+ int children = vd->vdev_children;
+ int c;
+
+ /*
+	 * In order to handle pools on top of zvols, do the opens
+	 * in a single thread so that the same thread holds the
+	 * spa_namespace_lock.
+ */
+ if (vdev_uses_zvols(vd)) {
+ for (c = 0; c < children; c++)
+ vd->vdev_child[c]->vdev_open_error =
+ vdev_open(vd->vdev_child[c]);
+ return;
+ }
+ tq = taskq_create("vdev_open", children, minclsyspri,
+ children, children, TASKQ_PREPOPULATE);
+
+ for (c = 0; c < children; c++)
+ VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
+ TQ_SLEEP) != 0);
+
+ taskq_destroy(tq);
+}
+
/*
* Prepare a virtual device for access.
*/
{
spa_t *spa = vd->vdev_spa;
int error;
- int c;
uint64_t osize = 0;
uint64_t asize, psize;
uint64_t ashift = 0;
+ int c;
- ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
-
+ ASSERT(vd->vdev_open_thread == curthread ||
+ spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
vd->vdev_state == VDEV_STATE_CANT_OPEN ||
vd->vdev_state == VDEV_STATE_OFFLINE);
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+ vd->vdev_cant_read = B_FALSE;
+ vd->vdev_cant_write = B_FALSE;
+ vd->vdev_min_asize = vdev_get_min_asize(vd);
+ /*
+ * If this vdev is not removed, check its fault status. If it's
+ * faulted, bail out of the open.
+ */
if (!vd->vdev_removed && vd->vdev_faulted) {
ASSERT(vd->vdev_children == 0);
+ ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
+ vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
- VDEV_AUX_ERR_EXCEEDED);
+ vd->vdev_label_aux);
return (ENXIO);
} else if (vd->vdev_offline) {
ASSERT(vd->vdev_children == 0);
error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
+ /*
+ * Reset the vdev_reopening flag so that we actually close
+ * the vdev on error.
+ */
+ vd->vdev_reopening = B_FALSE;
if (zio_injection_enabled && error == 0)
- error = zio_handle_device_injection(vd, ENXIO);
+ error = zio_handle_device_injection(vd, NULL, ENXIO);
if (error) {
if (vd->vdev_removed &&
vd->vdev_removed = B_FALSE;
+ /*
+ * Recheck the faulted flag now that we have confirmed that
+ * the vdev is accessible. If we're faulted, bail.
+ */
+ if (vd->vdev_faulted) {
+ ASSERT(vd->vdev_children == 0);
+ ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
+ vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ vd->vdev_label_aux);
+ return (ENXIO);
+ }
+
if (vd->vdev_degraded) {
ASSERT(vd->vdev_children == 0);
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
VDEV_AUX_ERR_EXCEEDED);
} else {
- vd->vdev_state = VDEV_STATE_HEALTHY;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
}
- for (c = 0; c < vd->vdev_children; c++)
+ /*
+ * For hole or missing vdevs we just return success.
+ */
+ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
+ return (0);
+
+ for (c = 0; c < vd->vdev_children; c++) {
if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
VDEV_AUX_NONE);
break;
}
+ }
osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
vd->vdev_psize = psize;
+ /*
+ * Make sure the allocatable size hasn't shrunk.
+ */
+ if (asize < vd->vdev_min_asize) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (EINVAL);
+ }
+
if (vd->vdev_asize == 0) {
/*
* This is the first-ever open, so use the computed values.
VDEV_AUX_BAD_LABEL);
return (EINVAL);
}
+ }
- /*
- * Make sure the device hasn't shrunk.
- */
- if (asize < vd->vdev_asize) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_LABEL);
- return (EINVAL);
- }
+ /*
+ * If all children are healthy and the asize has increased,
+ * then we've experienced dynamic LUN growth. If automatic
+ * expansion is enabled then use the additional space.
+ */
+ if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
+ (vd->vdev_expanding || spa->spa_autoexpand))
+ vd->vdev_asize = asize;
- /*
- * If all children are healthy and the asize has increased,
- * then we've experienced dynamic LUN growth.
- */
- if (vd->vdev_state == VDEV_STATE_HEALTHY &&
- asize > vd->vdev_asize) {
- vd->vdev_asize = asize;
- }
- }
+ vdev_set_min_asize(vd);
/*
* Ensure we can issue some IO before declaring the
*/
if (vd->vdev_ops->vdev_op_leaf &&
(error = zio_wait(vdev_probe(vd, NULL))) != 0) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_IO_FAILURE);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ VDEV_AUX_ERR_EXCEEDED);
return (error);
}
- /*
- * If this is a top-level vdev, compute the raidz-deflation
- * ratio. Note, we hard-code in 128k (1<<17) because it is the
- * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE
- * changes, this algorithm must never change, or we will
- * inconsistently account for existing bp's.
- */
- if (vd->vdev_top == vd) {
- vd->vdev_deflate_ratio = (1<<17) /
- (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT);
- }
-
/*
* If a leaf vdev has a DTL, and seems healthy, then kick off a
* resilver. But don't do this if we are doing a reopen for a scrub,
vdev_validate(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
- int c;
nvlist_t *label;
- uint64_t guid, top_guid;
+ uint64_t guid = 0, top_guid;
uint64_t state;
+ int c;
for (c = 0; c < vd->vdev_children; c++)
if (vdev_validate(vd->vdev_child[c]) != 0)
* overwrite the previous state.
*/
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
+ uint64_t aux_guid = 0;
+ nvlist_t *nvl;
if ((label = vdev_label_read_config(vd)) == NULL) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
return (0);
}
+ /*
+ * Determine if this vdev has been split off into another
+ * pool. If so, then refuse to open it.
+ */
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
+ &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_SPLIT_POOL);
+ nvlist_free(label);
+ return (0);
+ }
+
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
&guid) != 0 || guid != spa_guid(spa)) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
return (0);
}
+ if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
+ != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
+ &aux_guid) != 0)
+ aux_guid = 0;
+
/*
* If this vdev just became a top-level vdev because its
* sibling was detached, it will have adopted the parent's
* Fortunately, either version of the label will have the
* same top guid, so if we're a top-level vdev, we can
* safely compare to that instead.
+ *
+ * If we split this vdev off instead, then we also check the
+ * original pool's guid. We don't want to consider the vdev
+ * corrupt if it is partway through a split operation.
*/
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
&guid) != 0 ||
nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
&top_guid) != 0 ||
- (vd->vdev_guid != guid &&
+ ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
(vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
- if (spa->spa_load_state == SPA_LOAD_OPEN &&
+ /*
+ * If this is a verbatim import, no need to check the
+ * state of the pool.
+ */
+ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
+ spa_load_state(spa) == SPA_LOAD_OPEN &&
state != POOL_STATE_ACTIVE)
return (EBADF);
vdev_close(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
+ vdev_t *pvd = vd->vdev_parent;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ /*
+ * If our parent is reopening, then we are as well, unless we are
+ * going offline.
+ */
+ if (pvd != NULL && pvd->vdev_reopening)
+ vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
+
vd->vdev_ops->vdev_op_close(vd);
vdev_cache_purge(vd);
/*
- * We record the previous state before we close it, so that if we are
+ * We record the previous state before we close it, so that if we are
* doing a reopen(), we don't generate FMA ereports if we notice that
* it's still faulted.
*/
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}
+void
+vdev_hold(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ int c;
+
+ ASSERT(spa_is_root(spa));
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ return;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_hold(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_hold(vd);
+}
+
+void
+vdev_rele(vdev_t *vd)
+{
+ int c;
+
+ ASSERT(spa_is_root(vd->vdev_spa));
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_rele(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_rele(vd);
+}
+
+/*
+ * Reopen all interior vdevs and any unopened leaves. We don't actually
+ * reopen leaf vdevs which had previously been opened as they might deadlock
+ * on the spa_config_lock. Instead we only obtain the leaf's physical size.
+ * If the leaf has never been opened then open it, as usual.
+ */
void
vdev_reopen(vdev_t *vd)
{
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ /* set the reopening flag unless we're taking the vdev offline */
+ vd->vdev_reopening = !vd->vdev_offline;
vdev_close(vd);
(void) vdev_open(vd);
if (vd->vdev_aux) {
(void) vdev_validate_aux(vd);
if (vdev_readable(vd) && vdev_writeable(vd) &&
- !l2arc_vdev_present(vd)) {
- uint64_t size = vdev_get_rsize(vd);
- l2arc_add_vdev(spa, vd,
- VDEV_LABEL_START_SIZE,
- size - VDEV_LABEL_START_SIZE);
- }
+ vd->vdev_aux == &spa->spa_l2cache &&
+ !l2arc_vdev_present(vd))
+ l2arc_add_vdev(spa, vd);
} else {
(void) vdev_validate(vd);
}
return (0);
}
-/*
- * The is the latter half of vdev_create(). It is distinct because it
- * involves initiating transactions in order to do metaslab creation.
- * For creation, we want to try to create all vdevs at once and then undo it
- * if anything fails; this is much harder if we have pending transactions.
- */
void
-vdev_init(vdev_t *vd, uint64_t txg)
+vdev_metaslab_set_size(vdev_t *vd)
{
/*
* Aim for roughly 200 metaslabs per vdev.
*/
vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
-
- /*
- * Initialize the vdev's metaslabs. This can't fail because
- * there's nothing to read when creating all new metaslabs.
- */
- VERIFY(vdev_metaslab_init(vd, txg) == 0);
}
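+/*
+ * e.g. (illustrative numbers): for a 1t (2^40 byte) vdev, asize / 200
+ * is about 5.4g; highbit() rounds the metaslab size up to 2^33 (8g),
+ * giving 128 metaslabs. Rounding the shift up to the next power of
+ * two keeps the count between 100 and 200.
+ */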
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
ASSERT(vd == vd->vdev_top);
+ ASSERT(!vd->vdev_ishole);
ASSERT(ISP2(flags));
+ ASSERT(spa_writeable(vd->vdev_spa));
if (flags & VDD_METASLAB)
(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
* DTLs.
*
* A vdev's DTL (dirty time log) is the set of transaction groups for which
- * the vdev has less than perfect replication. There are three kinds of DTL:
+ * the vdev has less than perfect replication. There are four kinds of DTL:
*
* DTL_MISSING: txgs for which the vdev has no valid copies of the data
*
ASSERT(t < DTL_TYPES);
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+ ASSERT(spa_writeable(vd->vdev_spa));
mutex_enter(sm->sm_lock);
if (!space_map_contains(sm, txg, size))
{
spa_t *spa = vd->vdev_spa;
avl_tree_t reftree;
- int minref;
+ int c, t, minref;
ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
- for (int c = 0; c < vd->vdev_children; c++)
+ for (c = 0; c < vd->vdev_children; c++)
vdev_dtl_reassess(vd->vdev_child[c], txg,
scrub_txg, scrub_done);
- if (vd == spa->spa_root_vdev)
+ if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
return;
if (vd->vdev_ops->vdev_op_leaf) {
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
mutex_enter(&vd->vdev_dtl_lock);
if (scrub_txg != 0 &&
- (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
- /* XXX should check scrub_done? */
+ (spa->spa_scrub_started ||
+ (scn && scn->scn_phys.scn_errors == 0))) {
/*
* We completed a scrub up to scrub_txg. If we
* did it without rebooting, then the scrub dtl
}
mutex_enter(&vd->vdev_dtl_lock);
- for (int t = 0; t < DTL_TYPES; t++) {
+ for (t = 0; t < DTL_TYPES; t++) {
+ /* account for child's outage in parent's missing map */
+ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
if (t == DTL_SCRUB)
continue; /* leaf vdevs only */
if (t == DTL_PARTIAL)
else
minref = vd->vdev_children; /* any kind of mirror */
space_map_ref_create(&reftree);
- for (int c = 0; c < vd->vdev_children; c++) {
+ for (c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
mutex_enter(&cvd->vdev_dtl_lock);
- space_map_ref_add_map(&reftree, &cvd->vdev_dtl[t], 1);
+ space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
mutex_exit(&cvd->vdev_dtl_lock);
}
space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
if (smo->smo_object == 0)
return (0);
+ ASSERT(!vd->vdev_ishole);
+
if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
return (error);
dmu_buf_t *db;
dmu_tx_t *tx;
+ ASSERT(!vd->vdev_ishole);
+
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
if (vd->vdev_detached) {
vd->vdev_cant_read = cant_read;
vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
+ if (!required && zio_injection_enabled)
+ required = !!zio_handle_device_injection(vd, NULL, ECHILD);
+
return (required);
}
boolean_t needed = B_FALSE;
uint64_t thismin = UINT64_MAX;
uint64_t thismax = 0;
+ int c;
if (vd->vdev_children == 0) {
mutex_enter(&vd->vdev_dtl_lock);
}
mutex_exit(&vd->vdev_dtl_lock);
} else {
- for (int c = 0; c < vd->vdev_children; c++) {
+ for (c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
uint64_t cmin, cmax;
void
vdev_load(vdev_t *vd)
{
+ int c;
+
/*
* Recursively load all children.
*/
- for (int c = 0; c < vd->vdev_children; c++)
+ for (c = 0; c < vd->vdev_children; c++)
vdev_load(vd->vdev_child[c]);
/*
* If this is a top-level vdev, initialize its metaslabs.
*/
- if (vd == vd->vdev_top &&
+ if (vd == vd->vdev_top && !vd->vdev_ishole &&
(vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
vdev_metaslab_init(vd, 0) != 0))
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
return (0);
}
+void
+vdev_remove(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_tx_t *tx;
+ int m;
+
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ if (vd->vdev_dtl_smo.smo_object) {
+ ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0);
+ (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
+ vd->vdev_dtl_smo.smo_object = 0;
+ }
+
+ if (vd->vdev_ms != NULL) {
+ for (m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ if (msp == NULL || msp->ms_smo.smo_object == 0)
+ continue;
+
+ ASSERT3U(msp->ms_smo.smo_alloc, ==, 0);
+ (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
+ msp->ms_smo.smo_object = 0;
+ }
+ }
+
+ if (vd->vdev_ms_array) {
+ (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
+ vd->vdev_ms_array = 0;
+ vd->vdev_ms_shift = 0;
+ }
+ dmu_tx_commit(tx);
+}
+
void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
metaslab_t *msp;
+ boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
+
+ ASSERT(!vd->vdev_ishole);
while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
metaslab_sync_done(msp, txg);
+
+ if (reassess)
+ metaslab_sync_reassess(vd->vdev_mg);
}
void
metaslab_t *msp;
dmu_tx_t *tx;
+ ASSERT(!vd->vdev_ishole);
+
if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
ASSERT(vd == vd->vdev_top);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
dmu_tx_commit(tx);
}
+ /*
+ * Remove the metadata associated with this vdev once it's empty.
+ */
+ if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
+ vdev_remove(vd, txg);
+
while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
metaslab_sync(msp, txg);
(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
* not be opened, and no I/O is attempted.
*/
int
-vdev_fault(spa_t *spa, uint64_t guid)
+vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
- vdev_t *vd;
+ vdev_t *vd, *tvd;
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+ tvd = vd->vdev_top;
+
+ /*
+ * We don't directly use the aux state here, but if we do a
+ * vdev_reopen(), we need this value to be present to remember why we
+ * were faulted.
+ */
+ vd->vdev_label_aux = aux;
+
/*
* Faulted state takes precedence over degraded.
*/
+ vd->vdev_delayed_close = B_FALSE;
vd->vdev_faulted = 1ULL;
vd->vdev_degraded = 0ULL;
- vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
/*
- * If marking the vdev as faulted cause the top-level vdev to become
- * unavailable, then back off and simply mark the vdev as degraded
- * instead.
+ * If this device has the only valid copy of the data, then
+ * back off and simply mark the vdev as degraded instead.
*/
- if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) {
+ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
vd->vdev_degraded = 1ULL;
vd->vdev_faulted = 0ULL;
* If we reopen the device and it's not dead, only then do we
* mark it degraded.
*/
- vdev_reopen(vd);
+ vdev_reopen(tvd);
- if (vdev_readable(vd)) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
- VDEV_AUX_ERR_EXCEEDED);
- }
+ if (vdev_readable(vd))
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
}
return (spa_vdev_state_exit(spa, vd, 0));
* as I/O is concerned.
*/
int
-vdev_degrade(spa_t *spa, uint64_t guid)
+vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
vdev_t *vd;
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
vd->vdev_degraded = 1ULL;
if (!vdev_is_dead(vd))
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
- VDEV_AUX_ERR_EXCEEDED);
+ aux);
return (spa_vdev_state_exit(spa, vd, 0));
}
int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{
- vdev_t *vd;
+ vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+ tvd = vd->vdev_top;
vd->vdev_offline = B_FALSE;
vd->vdev_tmpoffline = B_FALSE;
vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
- vdev_reopen(vd->vdev_top);
+
+ /* XXX - L2ARC 1.0 does not support expansion */
+ if (!vd->vdev_aux) {
+ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+ pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
+ }
+
+ vdev_reopen(tvd);
vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
+ if (!vd->vdev_aux) {
+ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+ pvd->vdev_expanding = B_FALSE;
+ }
+
if (newstate)
*newstate = vd->vdev_state;
if ((flags & ZFS_ONLINE_UNSPARE) &&
vd->vdev_parent->vdev_child[0] == vd)
vd->vdev_unspare = B_TRUE;
+ if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
+
+ /* XXX - L2ARC 1.0 does not support expansion */
+ if (vd->vdev_aux)
+ return (spa_vdev_state_exit(spa, vd, ENOTSUP));
+ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+ }
return (spa_vdev_state_exit(spa, vd, 0));
}
-int
-vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+static int
+vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
{
- vdev_t *vd;
+ vdev_t *vd, *tvd;
+ int error = 0;
+ uint64_t generation;
+ metaslab_group_t *mg;
- spa_vdev_state_enter(spa);
+top:
+ spa_vdev_state_enter(spa, SCL_ALLOC);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+ tvd = vd->vdev_top;
+ mg = tvd->vdev_mg;
+ generation = spa->spa_config_generation + 1;
+
/*
* If the device isn't already offline, try to offline it.
*/
if (!vd->vdev_offline) {
/*
* If this device has the only valid copy of some data,
- * don't allow it to be offlined.
+ * don't allow it to be offlined. Log devices are always
+ * expendable.
*/
- if (vd->vdev_aux == NULL && vdev_dtl_required(vd))
+ if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
+ vdev_dtl_required(vd))
return (spa_vdev_state_exit(spa, NULL, EBUSY));
+ /*
+ * If the top-level is a slog and it has had allocations
+ * then proceed. We check that the vdev's metaslab group
+ * is not NULL since it's possible that we may have just
+ * added this vdev but not yet initialized its metaslabs.
+ */
+ if (tvd->vdev_islog && mg != NULL) {
+ /*
+ * Prevent any future allocations.
+ */
+ metaslab_group_passivate(mg);
+ (void) spa_vdev_state_exit(spa, vd, 0);
+
+ error = spa_offline_log(spa);
+
+ spa_vdev_state_enter(spa, SCL_ALLOC);
+
+ /*
+ * Check to see if the config has changed.
+ */
+ if (error || generation != spa->spa_config_generation) {
+ metaslab_group_activate(mg);
+ if (error)
+ return (spa_vdev_state_exit(spa,
+ vd, error));
+ (void) spa_vdev_state_exit(spa, vd, 0);
+ goto top;
+ }
+ ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0);
+ }
+
/*
* Offline this device and reopen its top-level vdev.
- * If this action results in the top-level vdev becoming
- * unusable, undo it and fail the request.
+ * If the top-level vdev is a log device then just offline
+ * it. Otherwise, if this action results in the top-level
+ * vdev becoming unusable, undo it and fail the request.
*/
vd->vdev_offline = B_TRUE;
- vdev_reopen(vd->vdev_top);
- if (vd->vdev_aux == NULL && vdev_is_dead(vd->vdev_top)) {
+ vdev_reopen(tvd);
+
+ if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
+ vdev_is_dead(tvd)) {
vd->vdev_offline = B_FALSE;
- vdev_reopen(vd->vdev_top);
+ vdev_reopen(tvd);
return (spa_vdev_state_exit(spa, NULL, EBUSY));
}
+
+ /*
+ * Add the device back into the metaslab rotor so that
+ * once we online the device it's open for business.
+ */
+ if (tvd->vdev_islog && mg != NULL)
+ metaslab_group_activate(mg);
}
vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
return (spa_vdev_state_exit(spa, vd, 0));
}
+int
+vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+{
+ int error;
+
+ mutex_enter(&spa->spa_vdev_top_lock);
+ error = vdev_offline_locked(spa, guid, flags);
+ mutex_exit(&spa->spa_vdev_top_lock);
+
+ return (error);
+}
+
/*
* Clear the error counts associated with this vdev. Unlike vdev_online() and
* vdev_offline(), we assume the spa config is locked. We also clear all
vdev_clear(spa_t *spa, vdev_t *vd)
{
vdev_t *rvd = spa->spa_root_vdev;
+ int c;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
vd->vdev_stat.vs_write_errors = 0;
vd->vdev_stat.vs_checksum_errors = 0;
- for (int c = 0; c < vd->vdev_children; c++)
+ for (c = 0; c < vd->vdev_children; c++)
vdev_clear(spa, vd->vdev_child[c]);
/*
if (vd->vdev_faulted || vd->vdev_degraded ||
!vdev_readable(vd) || !vdev_writeable(vd)) {
- vd->vdev_faulted = vd->vdev_degraded = 0;
+ /*
+		 * When reopening in response to a clear event, it may be due to
+		 * a fmadm repair request. In this case, if the device is
+		 * still broken, we still want to post the ereport again.
+ */
+ vd->vdev_forcefault = B_TRUE;
+
+ vd->vdev_faulted = vd->vdev_degraded = 0ULL;
vd->vdev_cant_read = B_FALSE;
vd->vdev_cant_write = B_FALSE;
- vdev_reopen(vd);
+ vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
- if (vd != rvd)
+ vd->vdev_forcefault = B_FALSE;
+
+ if (vd != rvd && vdev_writeable(vd->vdev_top))
vdev_state_dirty(vd->vdev_top);
if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
}
+
+ /*
+ * When clearing a FMA-diagnosed fault, we always want to
+ * unspare the device, as we assume that the original spare was
+ * done in response to the FMA fault.
+ */
+ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
+ vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ vd->vdev_parent->vdev_child[0] == vd)
+ vd->vdev_unspare = B_TRUE;
}
boolean_t
vdev_is_dead(vdev_t *vd)
{
- return (vd->vdev_state < VDEV_STATE_DEGRADED);
+ /*
+ * Holes and missing devices are always considered "dead".
+ * This simplifies the code since we don't have to check for
+ * these types of devices in the various code paths.
+ * Instead we rely on the fact that we skip over dead devices
+ * before issuing I/O to them.
+ */
+ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
+ vd->vdev_ops == &vdev_missing_ops);
}
boolean_t
* we're asking two separate questions about it.
*/
return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
- !vd->vdev_cant_write);
+ !vd->vdev_cant_write && !vd->vdev_ishole);
}
boolean_t
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+ int c, t;
mutex_enter(&vd->vdev_stat_lock);
bcopy(&vd->vdev_stat, vs, sizeof (*vs));
- vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors;
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state;
- vs->vs_rsize = vdev_get_rsize(vd);
+ vs->vs_rsize = vdev_get_min_asize(vd);
+ if (vd->vdev_ops->vdev_op_leaf)
+ vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
mutex_exit(&vd->vdev_stat_lock);
/*
* over all top-level vdevs (i.e. the direct children of the root).
*/
if (vd == rvd) {
- for (int c = 0; c < rvd->vdev_children; c++) {
+ for (c = 0; c < rvd->vdev_children; c++) {
vdev_t *cvd = rvd->vdev_child[c];
vdev_stat_t *cvs = &cvd->vdev_stat;
mutex_enter(&vd->vdev_stat_lock);
- for (int t = 0; t < ZIO_TYPES; t++) {
+ for (t = 0; t < ZIO_TYPES; t++) {
vs->vs_ops[t] += cvs->vs_ops[t];
vs->vs_bytes[t] += cvs->vs_bytes[t];
}
- vs->vs_scrub_examined += cvs->vs_scrub_examined;
+ cvs->vs_scan_removing = cvd->vdev_removing;
mutex_exit(&vd->vdev_stat_lock);
}
}
mutex_exit(&vd->vdev_stat_lock);
}
+void
+vdev_scan_stat_init(vdev_t *vd)
+{
+ vdev_stat_t *vs = &vd->vdev_stat;
+ int c;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_scan_stat_init(vd->vdev_child[c]);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vs->vs_scan_processed = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
void
vdev_stat_update(zio_t *zio, uint64_t psize)
{
mutex_enter(&vd->vdev_stat_lock);
if (flags & ZIO_FLAG_IO_REPAIR) {
- if (flags & ZIO_FLAG_SCRUB_THREAD)
- vs->vs_scrub_repaired += psize;
+ if (flags & ZIO_FLAG_SCAN_THREAD) {
+ dsl_scan_phys_t *scn_phys =
+ &spa->spa_dsl_pool->dp_scan->scn_phys;
+ uint64_t *processed = &scn_phys->scn_processed;
+
+ /* XXX cleanup? */
+ if (vd->vdev_ops->vdev_op_leaf)
+ atomic_add_64(processed, psize);
+ vs->vs_scan_processed += psize;
+ }
+
if (flags & ZIO_FLAG_SELF_HEAL)
vs->vs_self_healed += psize;
}
if (flags & ZIO_FLAG_SPECULATIVE)
return;
+ /*
+ * If this is an I/O error that is going to be retried, then ignore the
+ * error. Otherwise, the user may interpret B_FAILFAST I/O errors as
+ * hard errors, when in reality they can happen for any number of
+ * innocuous reasons (bus resets, MPxIO link failure, etc).
+ */
+ if (zio->io_error == EIO &&
+ !(zio->io_flags & ZIO_FLAG_IO_RETRY))
+ return;
+
+ /*
+	 * Intent log writes won't propagate their error to the root
+ * I/O so don't mark these types of failures as pool-level
+ * errors.
+ */
+ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+ return;
+
mutex_enter(&vd->vdev_stat_lock);
- if (type == ZIO_TYPE_READ) {
+ if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
if (zio->io_error == ECKSUM)
vs->vs_checksum_errors++;
else
vs->vs_read_errors++;
}
- if (type == ZIO_TYPE_WRITE)
+ if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
vs->vs_write_errors++;
mutex_exit(&vd->vdev_stat_lock);
if (type == ZIO_TYPE_WRITE && txg != 0 &&
(!(flags & ZIO_FLAG_IO_REPAIR) ||
- (flags & ZIO_FLAG_SCRUB_THREAD))) {
+ (flags & ZIO_FLAG_SCAN_THREAD) ||
+ spa->spa_claiming)) {
/*
- * This is either a normal write (not a repair), or it's a
- * repair induced by the scrub thread. In the normal case,
- * we commit the DTL change in the same txg as the block
- * was born. In the scrub-induced repair case, we know that
- * scrubs run in first-pass syncing context, so we commit
- * the DTL change in spa->spa_syncing_txg.
+ * This is either a normal write (not a repair), or it's
+ * a repair induced by the scrub thread, or it's a repair
+ * made by zil_claim() during spa_load() in the first txg.
+ * In the normal case, we commit the DTL change in the same
+ * txg as the block was born. In the scrub-induced repair
+ * case, we know that scrubs run in first-pass syncing context,
+ * so we commit the DTL change in spa_syncing_txg(spa).
+ * In the zil_claim() case, we commit in spa_first_txg(spa).
*
* We currently do not make DTL entries for failed spontaneous
* self-healing writes triggered by normal (non-scrubbing)
*/
if (vd->vdev_ops->vdev_op_leaf) {
uint64_t commit_txg = txg;
- if (flags & ZIO_FLAG_SCRUB_THREAD) {
+ if (flags & ZIO_FLAG_SCAN_THREAD) {
ASSERT(flags & ZIO_FLAG_IO_REPAIR);
ASSERT(spa_sync_pass(spa) == 1);
vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
- commit_txg = spa->spa_syncing_txg;
+ commit_txg = spa_syncing_txg(spa);
+ } else if (spa->spa_claiming) {
+ ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+ commit_txg = spa_first_txg(spa);
}
- ASSERT(commit_txg >= spa->spa_syncing_txg);
+ ASSERT(commit_txg >= spa_syncing_txg(spa));
if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
return;
for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
}
}
-void
-vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
-{
- int c;
- vdev_stat_t *vs = &vd->vdev_stat;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
-
- mutex_enter(&vd->vdev_stat_lock);
-
- if (type == POOL_SCRUB_NONE) {
- /*
- * Update completion and end time. Leave everything else alone
- * so we can report what happened during the previous scrub.
- */
- vs->vs_scrub_complete = complete;
- vs->vs_scrub_end = gethrestime_sec();
- } else {
- vs->vs_scrub_type = type;
- vs->vs_scrub_complete = 0;
- vs->vs_scrub_examined = 0;
- vs->vs_scrub_repaired = 0;
- vs->vs_scrub_start = gethrestime_sec();
- vs->vs_scrub_end = 0;
- }
-
- mutex_exit(&vd->vdev_stat_lock);
-}
-
/*
- * Update the in-core space usage stats for this vdev and the root vdev.
+ * Update the in-core space usage stats for this vdev, its metaslab class,
+ * and the root vdev.
*/
void
-vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
- boolean_t update_root)
+vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
+ int64_t space_delta)
{
int64_t dspace_delta = space_delta;
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
+ metaslab_group_t *mg = vd->vdev_mg;
+ metaslab_class_t *mc = mg ? mg->mg_class : NULL;
ASSERT(vd == vd->vdev_top);
* childrens', thus not accurate enough for us.
*/
ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
+ ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
vd->vdev_deflate_ratio;
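+	/*
+	 * e.g. with the plain-disk deflate ratio of 512 (see
+	 * vdev_metaslab_init()), a 1m delta is unchanged:
+	 * (1048576 >> 9) * 512 = 1048576; raidz ratios below 512
+	 * shrink the delta to its parity-discounted size.
+	 */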
mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_space += space_delta;
vd->vdev_stat.vs_alloc += alloc_delta;
+ vd->vdev_stat.vs_space += space_delta;
vd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&vd->vdev_stat_lock);
- if (update_root) {
- ASSERT(rvd == vd->vdev_parent);
- ASSERT(vd->vdev_ms_count != 0);
-
- /*
- * Don't count non-normal (e.g. intent log) space as part of
- * the pool's capacity.
- */
- if (vd->vdev_mg->mg_class != spa->spa_normal_class)
- return;
-
+ if (mc == spa_normal_class(spa)) {
mutex_enter(&rvd->vdev_stat_lock);
- rvd->vdev_stat.vs_space += space_delta;
rvd->vdev_stat.vs_alloc += alloc_delta;
+ rvd->vdev_stat.vs_space += space_delta;
rvd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&rvd->vdev_stat_lock);
}
+
+ if (mc != NULL) {
+ ASSERT(rvd == vd->vdev_parent);
+ ASSERT(vd->vdev_ms_count != 0);
+
+ metaslab_class_space_update(mc,
+ alloc_delta, defer_delta, space_delta, dspace_delta);
+ }
}
/*
vdev_t *rvd = spa->spa_root_vdev;
int c;
+ ASSERT(spa_writeable(spa));
+
/*
- * If this is an aux vdev (as with l2cache devices), then we update the
- * vdev config manually and set the sync flag.
+ * If this is an aux vdev (as with l2cache and spare devices), then we
+ * update the vdev config manually and set the sync flag.
*/
if (vd->vdev_aux != NULL) {
spa_aux_vdev_t *sav = vd->vdev_aux;
sav->sav_sync = B_TRUE;
- VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
- ZPOOL_CONFIG_L2CACHE, &aux, &naux) == 0);
+ if (nvlist_lookup_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
+ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
+ }
ASSERT(c < naux);
* sketchy, but it will work.
*/
nvlist_free(aux[c]);
- aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE);
+ aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
return;
}
} else {
ASSERT(vd == vd->vdev_top);
- if (!list_link_active(&vd->vdev_config_dirty_node))
+ if (!list_link_active(&vd->vdev_config_dirty_node) &&
+ !vd->vdev_ishole)
list_insert_head(&spa->spa_config_dirty_list, vd);
}
}
{
spa_t *spa = vd->vdev_spa;
+ ASSERT(spa_writeable(spa));
ASSERT(vd == vd->vdev_top);
/*
(dsl_pool_sync_context(spa_get_dsl(spa)) &&
spa_config_held(spa, SCL_STATE, RW_READER)));
- if (!list_link_active(&vd->vdev_state_dirty_node))
+ if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
list_insert_head(&spa->spa_state_dirty_list, vd);
}
vdev_t *rvd = spa->spa_root_vdev;
int degraded = 0, faulted = 0;
int corrupted = 0;
- int c;
vdev_t *child;
+ int c;
if (vd->vdev_children > 0) {
for (c = 0; c < vd->vdev_children; c++) {
child = vd->vdev_child[c];
+ /*
+ * Don't factor holes into the decision.
+ */
+ if (child->vdev_ishole)
+ continue;
+
if (!vdev_readable(child) ||
(!vdev_writeable(child) && spa_writeable(spa))) {
/*
/*
* If we are setting the vdev state to anything but an open state, then
- * always close the underlying device. Otherwise, we keep accessible
- * but invalid devices open forever. We don't call vdev_close() itself,
- * because that implies some extra checks (offline, etc) that we don't
- * want here. This is limited to leaf devices, because otherwise
- * closing the device will affect other children.
+ * always close the underlying device unless the device has requested
+ * a delayed close (i.e. we're about to remove or fault the device).
+ * Otherwise, we keep accessible but invalid devices open forever.
+ * We don't call vdev_close() itself, because that implies some extra
+ * checks (offline, etc) that we don't want here. This is limited to
+ * leaf devices, because otherwise closing the device will affect other
+ * children.
*/
- if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf)
+ if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
+ vd->vdev_ops->vdev_op_leaf)
vd->vdev_ops->vdev_op_close(vd);
+ /*
+ * If we have brought this vdev back into service, we need
+ * to notify fmd so that it can gracefully repair any outstanding
+ * cases due to a missing device. We do this in all cases, even those
+ * that probably don't correlate to a repaired fault. This is sure to
+ * catch all cases, and we let the zfs-retire agent sort it out. If
+ * this is a transient state it's OK, as the retire agent will
+ * double-check the state of the vdev before repairing it.
+ */
+ if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
+ vd->vdev_prevstate != state)
+ zfs_post_state_change(spa, vd);
+
if (vd->vdev_removed &&
state == VDEV_STATE_CANT_OPEN &&
(aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
vd->vdev_state = VDEV_STATE_REMOVED;
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
} else if (state == VDEV_STATE_REMOVED) {
- /*
- * Indicate to the ZFS DE that this device has been removed, and
- * any recent errors should be ignored.
- */
- zfs_post_remove(spa, vd);
vd->vdev_removed = B_TRUE;
} else if (state == VDEV_STATE_CANT_OPEN) {
/*
- * If we fail to open a vdev during an import, we mark it as
- * "not available", which signifies that it was never there to
- * begin with. Failure to open such a device is not considered
- * an error.
+ * If we fail to open a vdev during an import or recovery, we
+ * mark it as "not available", which signifies that it was
+ * never there to begin with. Failure to open such a device
+ * is not considered an error.
*/
- if (spa->spa_load_state == SPA_LOAD_IMPORT &&
- !spa->spa_import_faulted &&
+ if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
+ spa_load_state(spa) == SPA_LOAD_RECOVER) &&
vd->vdev_ops->vdev_op_leaf)
vd->vdev_not_present = 1;
case VDEV_AUX_BAD_LABEL:
class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
break;
- case VDEV_AUX_IO_FAILURE:
- class = FM_EREPORT_ZFS_IO_FAILURE;
- break;
default:
class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
}
vd->vdev_removed = B_FALSE;
}
- if (!isopen)
- vdev_propagate_state(vd);
+ if (!isopen && vd->vdev_parent)
+ vdev_propagate_state(vd->vdev_parent);
}
/*
}
return (B_TRUE);
}
+
+/*
+ * Load the state from the original vdev tree (ovd) which
+ * we've retrieved from the MOS config object. If the original
+ * vdev was offline or faulted then we transfer that state to the
+ * device in the current vdev tree (nvd).
+ */
+void
+vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
+{
+	spa_t *spa = nvd->vdev_spa;
+	int c;
+
+ ASSERT(nvd->vdev_top->vdev_islog);
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
+
+ for (c = 0; c < nvd->vdev_children; c++)
+ vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
+
+ if (nvd->vdev_ops->vdev_op_leaf) {
+ /*
+ * Restore the persistent vdev state
+ */
+ nvd->vdev_offline = ovd->vdev_offline;
+ nvd->vdev_faulted = ovd->vdev_faulted;
+ nvd->vdev_degraded = ovd->vdev_degraded;
+ nvd->vdev_removed = ovd->vdev_removed;
+ }
+}
+
+/*
+ * Determine if a log device has valid content. If the vdev was
+ * removed or faulted in the MOS config then we know that
+ * the content on the log device has already been written to the pool.
+ */
+boolean_t
+vdev_log_state_valid(vdev_t *vd)
+{
+ int c;
+
+ if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
+ !vd->vdev_removed)
+ return (B_TRUE);
+
+ for (c = 0; c < vd->vdev_children; c++)
+ if (vdev_log_state_valid(vd->vdev_child[c]))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+/*
+ * Expand a vdev if possible.
+ */
+void
+vdev_expand(vdev_t *vd, uint64_t txg)
+{
+ ASSERT(vd->vdev_top == vd);
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
+ VERIFY(vdev_metaslab_init(vd, txg) == 0);
+ vdev_config_dirty(vd);
+ }
+}
+
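+/*
+ * e.g. (illustrative numbers): with 8g (2^33) metaslabs, growing a
+ * vdev's asize from 1t to 1.5t raises asize >> ms_shift from 128 to
+ * 192, so vdev_metaslab_init() creates the 64 new metaslabs and the
+ * config is dirtied.
+ */
+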
+/*
+ * Split a vdev.
+ */
+void
+vdev_split(vdev_t *vd)
+{
+ vdev_t *cvd, *pvd = vd->vdev_parent;
+
+ vdev_remove_child(pvd, vd);
+ vdev_compact_children(pvd);
+
+ cvd = pvd->vdev_child[0];
+ if (pvd->vdev_children == 1) {
+ vdev_remove_parent(cvd);
+ cvd->vdev_splitting = B_TRUE;
+ }
+ vdev_propagate_state(cvd);
+}