*
* CDDL HEADER END
*/
+
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
/*
* txg Transaction group in which this label was written
* pool_guid Unique identifier for this pool
* vdev_tree An nvlist describing vdev tree.
+ * features_for_read
+ * An nvlist of the features necessary for reading the MOS.
*
* Each leaf device label also contains the following:
*
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/zio.h>
+#include <sys/dsl_scan.h>
#include <sys/fs/zfs.h>
/*
*/
nvlist_t *
vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
- boolean_t isspare, boolean_t isl2cache)
+ vdev_config_flag_t flags)
{
nvlist_t *nv = NULL;
- VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0);
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
- vd->vdev_ops->vdev_op_type) == 0);
- if (!isspare && !isl2cache)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
- == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
+ fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type);
+ if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid);
if (vd->vdev_path != NULL)
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH,
- vd->vdev_path) == 0);
+ fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path);
if (vd->vdev_devid != NULL)
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID,
- vd->vdev_devid) == 0);
+ fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid);
if (vd->vdev_physpath != NULL)
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
- vd->vdev_physpath) == 0);
+ fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
+ vd->vdev_physpath);
+
+ if (vd->vdev_fru != NULL)
+ fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
if (vd->vdev_nparity != 0) {
ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
* into a crufty old storage pool.
*/
ASSERT(vd->vdev_nparity == 1 ||
- (vd->vdev_nparity == 2 &&
- spa_version(spa) >= SPA_VERSION_RAID6));
+ (vd->vdev_nparity <= 2 &&
+ spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
+ (vd->vdev_nparity <= 3 &&
+ spa_version(spa) >= SPA_VERSION_RAIDZ3));
/*
* Note that we'll add the nparity tag even on storage pools
* that only support a single parity device -- older software
* will just ignore it.
*/
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY,
- vd->vdev_nparity) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
}
if (vd->vdev_wholedisk != -1ULL)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
- vd->vdev_wholedisk) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ vd->vdev_wholedisk);
if (vd->vdev_not_present)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
if (vd->vdev_isspare)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
-
- if (!isspare && !isl2cache && vd == vd->vdev_top) {
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
- vd->vdev_ms_array) == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
- vd->vdev_ms_shift) == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT,
- vd->vdev_ashift) == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
- vd->vdev_asize) == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG,
- vd->vdev_islog) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
+
+ if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
+ vd == vd->vdev_top) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+ vd->vdev_ms_array);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+ vd->vdev_ms_shift);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
+ vd->vdev_asize);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
+ if (vd->vdev_removing)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
+ vd->vdev_removing);
}
if (vd->vdev_dtl_smo.smo_object != 0)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
- vd->vdev_dtl_smo.smo_object) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
+ vd->vdev_dtl_smo.smo_object);
+
+ if (vd->vdev_crtxg)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
if (getstats) {
vdev_stat_t vs;
+ pool_scan_stat_t ps;
+
vdev_get_stats(vd, &vs);
- VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS,
- (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
+ fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t));
+
+ /* provide either current or previous scan information */
+ if (spa_scan_get_stats(spa, &ps) == 0) {
+ fnvlist_add_uint64_array(nv,
+ ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
+ sizeof (pool_scan_stat_t) / sizeof (uint64_t));
+ }
}
if (!vd->vdev_ops->vdev_op_leaf) {
nvlist_t **child;
- int c;
+ int c, idx;
- child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
- KM_SLEEP);
+ ASSERT(!vd->vdev_ishole);
- for (c = 0; c < vd->vdev_children; c++)
- child[c] = vdev_config_generate(spa, vd->vdev_child[c],
- getstats, isspare, isl2cache);
+ child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
+ KM_PUSHPAGE);
+
+ for (c = 0, idx = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ /*
+ * If we're generating an nvlist of removing
+ * vdevs then skip over any device which is
+ * not being removed.
+ */
+ if ((flags & VDEV_CONFIG_REMOVING) &&
+ !cvd->vdev_removing)
+ continue;
+
+ child[idx++] = vdev_config_generate(spa, cvd,
+ getstats, flags);
+ }
- VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- child, vd->vdev_children) == 0);
+ if (idx) {
+ fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ child, idx);
+ }
- for (c = 0; c < vd->vdev_children; c++)
+ for (c = 0; c < idx; c++)
nvlist_free(child[c]);
kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
} else {
+ const char *aux = NULL;
+
if (vd->vdev_offline && !vd->vdev_tmpoffline)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
- B_TRUE) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE);
+ if (vd->vdev_resilver_txg != 0)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
+ vd->vdev_resilver_txg);
if (vd->vdev_faulted)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED,
- B_TRUE) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE);
if (vd->vdev_degraded)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED,
- B_TRUE) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE);
if (vd->vdev_removed)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED,
- B_TRUE) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE);
if (vd->vdev_unspare)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE,
- B_TRUE) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE);
+ if (vd->vdev_ishole)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE);
+
+ switch (vd->vdev_stat.vs_aux) {
+ case VDEV_AUX_ERR_EXCEEDED:
+ aux = "err_exceeded";
+ break;
+
+ case VDEV_AUX_EXTERNAL:
+ aux = "external";
+ break;
+ }
+
+ if (aux != NULL)
+ fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux);
+
+ if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
+ vd->vdev_orig_guid);
+ }
}
return (nv);
}
+/*
+ * Generate a view of the top-level vdevs. If we currently have holes
+ * in the namespace, then generate an array which contains a list of holey
+ * vdevs. Additionally, add the number of top-level children that currently
+ * exist.
+ */
+void
+vdev_top_config_generate(spa_t *spa, nvlist_t *config)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t *array;
+ uint_t c, idx;
+
+ array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_PUSHPAGE);
+
+ for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (tvd->vdev_ishole)
+ array[idx++] = c;
+ }
+
+ if (idx) {
+ VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
+ array, idx) == 0);
+ }
+
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
+ rvd->vdev_children) == 0);
+
+ kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
+}
+
+/*
+ * Returns the configuration from the label of the given vdev. For vdevs
+ * which don't have a txg value stored on their label (i.e. spares/cache)
+ * or have not been completely initialized (txg = 0) just return
+ * the configuration from the first valid label we find. Otherwise,
+ * find the most up-to-date label that does not exceed the specified
+ * 'txg' value.
+ */
nvlist_t *
-vdev_label_read_config(vdev_t *vd)
+vdev_label_read_config(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
nvlist_t *config = NULL;
vdev_phys_t *vp;
zio_t *zio;
- int flags =
- ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
+ uint64_t best_txg = 0;
+ int error = 0;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE;
+ int l;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
vp = zio_buf_alloc(sizeof (vdev_phys_t));
- for (int l = 0; l < VDEV_LABELS; l++) {
+retry:
+ for (l = 0; l < VDEV_LABELS; l++) {
+ nvlist_t *label = NULL;
zio = zio_root(spa, NULL, NULL, flags);
if (zio_wait(zio) == 0 &&
nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
- &config, 0) == 0)
- break;
+ &label, 0) == 0) {
+ uint64_t label_txg = 0;
+
+ /*
+ * Auxiliary vdevs won't have txg values in their
+ * labels and newly added vdevs may not have been
+ * completely initialized so just return the
+ * configuration from the first valid label we
+ * encounter.
+ */
+ error = nvlist_lookup_uint64(label,
+ ZPOOL_CONFIG_POOL_TXG, &label_txg);
+ if ((error || label_txg == 0) && !config) {
+ config = label;
+ break;
+ } else if (label_txg <= txg && label_txg > best_txg) {
+ best_txg = label_txg;
+ nvlist_free(config);
+ config = fnvlist_dup(label);
+ }
+ }
- if (config != NULL) {
- nvlist_free(config);
- config = NULL;
+ if (label != NULL) {
+ nvlist_free(label);
+ label = NULL;
}
}
+ if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
zio_buf_free(vp, sizeof (vdev_phys_t));
return (config);
/*
* Read the label, if any, and perform some basic sanity checks.
*/
- if ((label = vdev_label_read_config(vd)) == NULL)
+ if ((label = vdev_label_read_config(vd, -1ULL)) == NULL)
return (B_FALSE);
(void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
case VDEV_LABEL_SPARE:
return (spa_has_spare(spa, device_guid));
+ default:
+ break;
}
}
if (spa_l2cache_exists(device_guid, NULL))
return (B_TRUE);
+ /*
+ * We can't rely on a pool's state if it's been imported
+ * read-only. Instead we look to see if the pools is marked
+ * read-only in the namespace and set the state to active.
+ */
+ if ((spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
+ spa_mode(spa) == FREAD)
+ state = POOL_STATE_ACTIVE;
+
/*
* If the device is marked ACTIVE, then this device is in use by another
* pool on the system.
spa_t *spa = vd->vdev_spa;
nvlist_t *label;
vdev_phys_t *vp;
- vdev_boot_header_t *vb;
+ char *pad2;
uberblock_t *ub;
zio_t *zio;
char *buf;
size_t buflen;
int error;
- uint64_t spare_guid, l2cache_guid;
+ uint64_t spare_guid = 0, l2cache_guid = 0;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ int c, l;
+ vdev_t *pvd;
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
- for (int c = 0; c < vd->vdev_children; c++)
+ for (c = 0; c < vd->vdev_children; c++)
if ((error = vdev_label_init(vd->vdev_child[c],
crtxg, reason)) != 0)
return (error);
- if (!vd->vdev_ops->vdev_op_leaf)
+ /* Track the creation time for this vdev */
+ vd->vdev_crtxg = crtxg;
+
+ if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa))
return (0);
/*
* Dead vdevs cannot be initialized.
*/
if (vdev_is_dead(vd))
- return (EIO);
+ return (SET_ERROR(EIO));
/*
* Determine if the vdev is in use.
*/
- if (reason != VDEV_LABEL_REMOVE &&
+ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT &&
vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
- return (EBUSY);
+ return (SET_ERROR(EBUSY));
/*
* If this is a request to add or replace a spare or l2cache device
vd->vdev_guid += guid_delta;
- for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum += guid_delta;
/*
*/
if (reason == VDEV_LABEL_SPARE)
return (0);
- ASSERT(reason == VDEV_LABEL_REPLACE);
+ ASSERT(reason == VDEV_LABEL_REPLACE ||
+ reason == VDEV_LABEL_SPLIT);
}
if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE &&
vd->vdev_guid += guid_delta;
- for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum += guid_delta;
/*
* active hot spare (in which case we want to revert the
* labels).
*/
- VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0);
VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
spa_version(spa)) == 0);
/*
* For level 2 ARC devices, add a special label.
*/
- VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0);
VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
spa_version(spa)) == 0);
VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
vd->vdev_guid) == 0);
} else {
- label = spa_config_generate(spa, vd, 0ULL, B_FALSE);
+ uint64_t txg = 0ULL;
+
+ if (reason == VDEV_LABEL_SPLIT)
+ txg = spa->spa_uberblock.ub_txg;
+ label = spa_config_generate(spa, vd, txg, B_FALSE);
/*
* Add our creation time. This allows us to detect multiple
buf = vp->vp_nvlist;
buflen = sizeof (vp->vp_nvlist);
- error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
+ error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_PUSHPAGE);
if (error != 0) {
nvlist_free(label);
zio_buf_free(vp, sizeof (vdev_phys_t));
return (error == EFAULT ? ENAMETOOLONG : EINVAL);
}
- /*
- * Initialize boot block header.
- */
- vb = zio_buf_alloc(sizeof (vdev_boot_header_t));
- bzero(vb, sizeof (vdev_boot_header_t));
- vb->vb_magic = VDEV_BOOT_MAGIC;
- vb->vb_version = VDEV_BOOT_VERSION;
- vb->vb_offset = VDEV_BOOT_OFFSET;
- vb->vb_size = VDEV_BOOT_SIZE;
-
/*
* Initialize uberblock template.
*/
- ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
- bzero(ub, VDEV_UBERBLOCK_SIZE(vd));
+ ub = zio_buf_alloc(VDEV_UBERBLOCK_RING);
+ bzero(ub, VDEV_UBERBLOCK_RING);
*ub = spa->spa_uberblock;
ub->ub_txg = 0;
+ /* Initialize the 2nd padding area. */
+ pad2 = zio_buf_alloc(VDEV_PAD_SIZE);
+ bzero(pad2, VDEV_PAD_SIZE);
+
/*
* Write everything in parallel.
*/
+retry:
zio = zio_root(spa, NULL, NULL, flags);
- for (int l = 0; l < VDEV_LABELS; l++) {
+ for (l = 0; l < VDEV_LABELS; l++) {
vdev_label_write(zio, vd, l, vp,
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t), NULL, NULL, flags);
- vdev_label_write(zio, vd, l, vb,
- offsetof(vdev_label_t, vl_boot_header),
- sizeof (vdev_boot_header_t), NULL, NULL, flags);
+ /*
+ * Skip the 1st padding area.
+ * Zero out the 2nd padding area where it might have
+ * left over data from previous filesystem format.
+ */
+ vdev_label_write(zio, vd, l, pad2,
+ offsetof(vdev_label_t, vl_pad2),
+ VDEV_PAD_SIZE, NULL, NULL, flags);
- for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
- vdev_label_write(zio, vd, l, ub,
- VDEV_UBERBLOCK_OFFSET(vd, n),
- VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags);
- }
+ vdev_label_write(zio, vd, l, ub,
+ offsetof(vdev_label_t, vl_uberblock),
+ VDEV_UBERBLOCK_RING, NULL, NULL, flags);
}
error = zio_wait(zio);
+ if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
nvlist_free(label);
- zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd));
- zio_buf_free(vb, sizeof (vdev_boot_header_t));
+ zio_buf_free(pad2, VDEV_PAD_SIZE);
+ zio_buf_free(ub, VDEV_UBERBLOCK_RING);
zio_buf_free(vp, sizeof (vdev_phys_t));
/*
* ==========================================================================
*/
-/*
- * For use by zdb and debugging purposes only
- */
-uint64_t ub_max_txg = UINT64_MAX;
-
/*
* Consider the following situation: txg is safely synced to disk. We've
* written the first uberblock for txg + 1, and then we lose power. When we
* come back up, we fail to see the uberblock for txg + 1 because, say,
* it was on a mirrored device and the replica to which we wrote txg + 1
* is now offline. If we then make some changes and sync txg + 1, and then
- * the missing replica comes back, then for a new seconds we'll have two
+ * the missing replica comes back, then for a few seconds we'll have two
* conflicting uberblocks on disk with the same txg. The solution is simple:
* among uberblocks with equal txg, choose the one with the latest timestamp.
*/
return (0);
}
+struct ubl_cbdata {
+ uberblock_t *ubl_ubbest; /* Best uberblock */
+ vdev_t *ubl_vd; /* vdev associated with the above */
+};
+
static void
vdev_uberblock_load_done(zio_t *zio)
{
+ vdev_t *vd = zio->io_vd;
+ spa_t *spa = zio->io_spa;
zio_t *rio = zio->io_private;
uberblock_t *ub = zio->io_data;
- uberblock_t *ubbest = rio->io_private;
+ struct ubl_cbdata *cbp = rio->io_private;
- ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd));
+ ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));
if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
mutex_enter(&rio->io_lock);
- if (ub->ub_txg <= ub_max_txg &&
- vdev_uberblock_compare(ub, ubbest) > 0)
- *ubbest = *ub;
+ if (ub->ub_txg <= spa->spa_load_max_txg &&
+ vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
+ /*
+ * Keep track of the vdev in which this uberblock
+ * was found. We will use this information later
+ * to obtain the config nvlist associated with
+ * this uberblock.
+ */
+ *cbp->ubl_ubbest = *ub;
+ cbp->ubl_vd = vd;
+ }
mutex_exit(&rio->io_lock);
}
zio_buf_free(zio->io_data, zio->io_size);
}
-void
-vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
+static void
+vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
+ struct ubl_cbdata *cbp)
{
- spa_t *spa = vd->vdev_spa;
- vdev_t *rvd = spa->spa_root_vdev;
- int flags =
- ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
-
- if (vd == rvd) {
- ASSERT(zio == NULL);
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- zio = zio_root(spa, NULL, ubbest, flags);
- bzero(ubbest, sizeof (uberblock_t));
- }
-
- ASSERT(zio != NULL);
+ int c, l, n;
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_uberblock_load(zio, vd->vdev_child[c], ubbest);
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
- for (int l = 0; l < VDEV_LABELS; l++) {
- for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+ for (l = 0; l < VDEV_LABELS; l++) {
+ for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
vdev_label_read(zio, vd, l,
zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)),
VDEV_UBERBLOCK_OFFSET(vd, n),
}
}
}
+}
- if (vd == rvd) {
- (void) zio_wait(zio);
- spa_config_exit(spa, SCL_ALL, FTAG);
- }
+/*
+ * Reads the 'best' uberblock from disk along with its associated
+ * configuration. First, we read the uberblock array of each label of each
+ * vdev, keeping track of the uberblock with the highest txg in each array.
+ * Then, we read the configuration from the same vdev as the best uberblock.
+ */
+void
+vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
+{
+ zio_t *zio;
+ spa_t *spa = rvd->vdev_spa;
+ struct ubl_cbdata cb;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
+
+ ASSERT(ub);
+ ASSERT(config);
+
+ bzero(ub, sizeof (uberblock_t));
+ *config = NULL;
+
+ cb.ubl_ubbest = ub;
+ cb.ubl_vd = NULL;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ zio = zio_root(spa, NULL, &cb, flags);
+ vdev_uberblock_load_impl(zio, rvd, flags, &cb);
+ (void) zio_wait(zio);
+
+ /*
+ * It's possible that the best uberblock was discovered on a label
+ * that has a configuration which was written in a future txg.
+ * Search all labels on this vdev to find the configuration that
+ * matches the txg for our uberblock.
+ */
+ if (cb.ubl_vd != NULL)
+ *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
+ spa_config_exit(spa, SCL_ALL, FTAG);
}
/*
vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
{
uberblock_t *ubbuf;
- int n;
+ int c, l, n;
- for (int c = 0; c < vd->vdev_children; c++)
+ for (c = 0; c < vd->vdev_children; c++)
vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags);
if (!vd->vdev_ops->vdev_op_leaf)
bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
*ubbuf = *ub;
- for (int l = 0; l < VDEV_LABELS; l++)
+ for (l = 0; l < VDEV_LABELS; l++)
vdev_label_write(zio, vd, l, ubbuf,
VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
vdev_uberblock_sync_done, zio->io_private,
zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
}
+/* Sync the uberblocks to all vdevs in svd[] */
int
vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
{
spa_t *spa = svd[0]->vdev_spa;
zio_t *zio;
uint64_t good_writes = 0;
+ int v;
zio = zio_root(spa, NULL, &good_writes, flags);
- for (int v = 0; v < svdcount; v++)
+ for (v = 0; v < svdcount; v++)
vdev_uberblock_sync(zio, ub, svd[v], flags);
(void) zio_wait(zio);
*/
zio = zio_root(spa, NULL, NULL, flags);
- for (int v = 0; v < svdcount; v++)
+ for (v = 0; v < svdcount; v++)
zio_flush(zio, svd[v]);
(void) zio_wait(zio);
uint64_t *good_writes = zio->io_private;
if (*good_writes == 0)
- zio->io_error = EIO;
+ zio->io_error = SET_ERROR(EIO);
kmem_free(good_writes, sizeof (uint64_t));
}
vdev_phys_t *vp;
char *buf;
size_t buflen;
+ int c;
- for (int c = 0; c < vd->vdev_children; c++)
+ for (c = 0; c < vd->vdev_children; c++)
vdev_label_sync(zio, vd->vdev_child[c], l, txg, flags);
if (!vd->vdev_ops->vdev_op_leaf)
buf = vp->vp_nvlist;
buflen = sizeof (vp->vp_nvlist);
- if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) {
+ if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_PUSHPAGE)) {
for (; l < VDEV_LABELS; l += 2) {
vdev_label_write(zio, vd, l, vp,
offsetof(vdev_label_t, vl_vdev_phys),
zio = zio_root(spa, NULL, NULL, flags);
for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
- uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t),
- KM_SLEEP);
- zio_t *vio = zio_null(zio, spa,
+ uint64_t *good_writes;
+ zio_t *vio;
+
+ ASSERT(!vd->vdev_ishole);
+
+ good_writes = kmem_zalloc(sizeof (uint64_t), KM_PUSHPAGE);
+ vio = zio_null(zio, spa, NULL,
(vd->vdev_islog || vd->vdev_aux != NULL) ?
vdev_label_sync_ignore_done : vdev_label_sync_top_done,
good_writes, flags);
* at any time, you can just call it again, and it will resume its work.
*/
int
-vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
+vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard)
{
spa_t *spa = svd[0]->vdev_spa;
uberblock_t *ub = &spa->spa_uberblock;
int error;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ /*
+ * Normally, we don't want to try too hard to write every label and
+ * uberblock. If there is a flaky disk, we don't want the rest of the
+ * sync process to block while we retry. But if we can't write a
+ * single label out, we should retry with ZIO_FLAG_TRYHARD before
+ * bailing out and declaring the pool faulted.
+ */
+ if (tryhard)
+ flags |= ZIO_FLAG_TRYHARD;
+
ASSERT(ub->ub_txg <= txg);
/*