]> git.proxmox.com Git - mirror_zfs.git/blobdiff - module/zfs/vdev.c
Update vdev devid and physpath if changed between imports
[mirror_zfs.git] / module / zfs / vdev.c
index 95a2f5947db196a22bcf2ac606ac8831d95b526b..d6286dc5920bd8294022f25ab5d29a16c000a5b5 100644 (file)
@@ -6,7 +6,7 @@
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2016 Toomas Soome <tsoome@me.com>
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, Datto Inc. All rights reserved.
+ * Copyright (c) 2021, Klara Inc.
+ * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
  */
 
 #include <sys/zfs_context.h>
@@ -40,6 +42,7 @@
 #include <sys/dsl_dir.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_rebuild.h>
+#include <sys/vdev_draid.h>
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/arc.h>
 #include <sys/zil.h>
 #include <sys/dsl_scan.h>
+#include <sys/vdev_raidz.h>
 #include <sys/abd.h>
 #include <sys/vdev_initialize.h>
 #include <sys/vdev_trim.h>
+#include <sys/vdev_raidz.h>
 #include <sys/zvol.h>
 #include <sys/zfs_ratelimit.h>
+#include "zfs_prop.h"
+
+/*
+ * One metaslab from each (normal-class) vdev is used by the ZIL.  These are
+ * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are
+ * part of the spa_embedded_log_class.  The metaslab with the most free space
+ * in each vdev is selected for this purpose when the pool is opened (or a
+ * vdev is added).  See vdev_metaslab_init().
+ *
+ * Log blocks can be allocated from the following locations.  Each one is tried
+ * in order until the allocation succeeds:
+ * 1. dedicated log vdevs, aka "slog" (spa_log_class)
+ * 2. embedded slog metaslabs (spa_embedded_log_class)
+ * 3. other metaslabs in normal vdevs (spa_normal_class)
+ *
+ * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer
+ * than this number of metaslabs in the vdev.  This ensures that we don't set
+ * aside an unreasonable amount of space for the ZIL.  If set to less than
+ * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced
+ * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab.
+ */
+static uint_t zfs_embedded_slog_min_ms = 64;
 
 /* default target for number of metaslabs per top-level vdev */
-int zfs_vdev_default_ms_count = 200;
+static uint_t zfs_vdev_default_ms_count = 200;
 
 /* minimum number of metaslabs per top-level vdev */
-int zfs_vdev_min_ms_count = 16;
+static uint_t zfs_vdev_min_ms_count = 16;
 
 /* practical upper limit of total metaslabs per top-level vdev */
-int zfs_vdev_ms_count_limit = 1ULL << 17;
+static uint_t zfs_vdev_ms_count_limit = 1ULL << 17;
 
 /* lower limit for metaslab size (512M) */
-int zfs_vdev_default_ms_shift = 29;
+static uint_t zfs_vdev_default_ms_shift = 29;
 
 /* upper limit for metaslab size (16G) */
-int zfs_vdev_max_ms_shift = 34;
+static uint_t zfs_vdev_max_ms_shift = 34;
 
 int vdev_validate_skip = B_FALSE;
 
@@ -83,18 +110,18 @@ int zfs_vdev_dtl_sm_blksz = (1 << 12);
 /*
  * Rate limit slow IO (delay) events to this many per second.
  */
-unsigned int zfs_slow_io_events_per_second = 20;
+static unsigned int zfs_slow_io_events_per_second = 20;
 
 /*
  * Rate limit checksum events after this many checksum errors per second.
  */
-unsigned int zfs_checksum_events_per_second = 20;
+static unsigned int zfs_checksum_events_per_second = 20;
 
 /*
  * Ignore errors during scrub/resilver.  Allows to work around resilver
  * upon import when there are pool errors.
  */
-int zfs_scan_ignore_errors = 0;
+static int zfs_scan_ignore_errors = 0;
 
 /*
  * vdev-wide space maps that have lots of entries written to them at
@@ -110,10 +137,17 @@ int zfs_vdev_standard_sm_blksz = (1 << 17);
  */
 int zfs_nocacheflush = 0;
 
-uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX;
-uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
+/*
+ * Maximum and minimum ashift values that can be automatically set based on
+ * vdev's physical ashift (disk's physical sector size).  While ASHIFT_MAX
+ * is higher than the maximum value, it is intentionally limited here to not
+ * excessively impact pool space efficiency.  Higher ashift values may still
+ * be forced by vdev logical ashift or by user via ashift property, but won't
+ * be set automatically as a performance optimization.
+ */
+uint_t zfs_vdev_max_auto_ashift = 14;
+uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
 
-/*PRINTFLIKE2*/
 void
 vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
 {
@@ -141,7 +175,8 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
        char state[20];
 
        if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
-               zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id,
+               zfs_dbgmsg("%*svdev %llu: %s", indent, "",
+                   (u_longlong_t)vd->vdev_id,
                    vd->vdev_ops->vdev_op_type);
                return;
        }
@@ -190,9 +225,11 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
  * Virtual device management.
  */
 
-static vdev_ops_t *vdev_ops_table[] = {
+static vdev_ops_t *const vdev_ops_table[] = {
        &vdev_root_ops,
        &vdev_raidz_ops,
+       &vdev_draid_ops,
+       &vdev_draid_spare_ops,
        &vdev_mirror_ops,
        &vdev_replacing_ops,
        &vdev_spare_ops,
@@ -210,7 +247,7 @@ static vdev_ops_t *vdev_ops_table[] = {
 static vdev_ops_t *
 vdev_getops(const char *type)
 {
-       vdev_ops_t *ops, **opspp;
+       vdev_ops_t *ops, *const *opspp;
 
        for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
                if (strcmp(ops->vdev_op_type, type) == 0)
@@ -219,17 +256,35 @@ vdev_getops(const char *type)
        return (ops);
 }
 
-/* ARGSUSED */
+/*
+ * Given a vdev and a metaslab class, find which metaslab group we're
+ * interested in. All vdevs may belong to two different metaslab classes.
+ * Dedicated slog devices use only the primary metaslab group, rather than a
+ * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL.
+ */
+metaslab_group_t *
+vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
+{
+       if (mc == spa_embedded_log_class(vd->vdev_spa) &&
+           vd->vdev_log_mg != NULL)
+               return (vd->vdev_log_mg);
+       else
+               return (vd->vdev_mg);
+}
+
 void
-vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res)
+vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
+    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
 {
-       res->rs_start = in->rs_start;
-       res->rs_end = in->rs_end;
+       (void) vd, (void) remain_rs;
+
+       physical_rs->rs_start = logical_rs->rs_start;
+       physical_rs->rs_end = logical_rs->rs_end;
 }
 
 /*
  * Derive the enumerated allocation bias from string input.
- * String origin is either the per-vdev zap or zpool(1M).
+ * String origin is either the per-vdev zap or zpool(8).
  */
 static vdev_alloc_bias_t
 vdev_derive_alloc_bias(const char *bias)
@@ -251,19 +306,25 @@ vdev_derive_alloc_bias(const char *bias)
  * all children.  This is what's used by anything other than RAID-Z.
  */
 uint64_t
-vdev_default_asize(vdev_t *vd, uint64_t psize)
+vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
 {
        uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
        uint64_t csize;
 
        for (int c = 0; c < vd->vdev_children; c++) {
-               csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
+               csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg);
                asize = MAX(asize, csize);
        }
 
        return (asize);
 }
 
+uint64_t
+vdev_default_min_asize(vdev_t *vd)
+{
+       return (vd->vdev_min_asize);
+}
+
 /*
  * Get the minimum allocatable size. We define the allocatable size as
  * the vdev's asize rounded to the nearest metaslab. This allows us to
@@ -289,15 +350,7 @@ vdev_get_min_asize(vdev_t *vd)
        if (vd == vd->vdev_top)
                return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
 
-       /*
-        * The allocatable space for a raidz vdev is N * sizeof(smallest child),
-        * so each child must provide at least 1/Nth of its asize.
-        */
-       if (pvd->vdev_ops == &vdev_raidz_ops)
-               return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
-                   pvd->vdev_children);
-
-       return (pvd->vdev_min_asize);
+       return (pvd->vdev_ops->vdev_op_min_asize(pvd));
 }
 
 void
@@ -309,6 +362,75 @@ vdev_set_min_asize(vdev_t *vd)
                vdev_set_min_asize(vd->vdev_child[c]);
 }
 
+/*
+ * Get the minimal allocation size for the top-level vdev.
+ */
+uint64_t
+vdev_get_min_alloc(vdev_t *vd)
+{
+       uint64_t min_alloc = 1ULL << vd->vdev_ashift;
+
+       if (vd->vdev_ops->vdev_op_min_alloc != NULL)
+               min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd);
+
+       return (min_alloc);
+}
+
+/*
+ * Get the parity level for a top-level vdev.
+ */
+uint64_t
+vdev_get_nparity(vdev_t *vd)
+{
+       uint64_t nparity = 0;
+
+       if (vd->vdev_ops->vdev_op_nparity != NULL)
+               nparity = vd->vdev_ops->vdev_op_nparity(vd);
+
+       return (nparity);
+}
+
+static int
+vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value)
+{
+       spa_t *spa = vd->vdev_spa;
+       objset_t *mos = spa->spa_meta_objset;
+       uint64_t objid;
+       int err;
+
+       if (vd->vdev_root_zap != 0) {
+               objid = vd->vdev_root_zap;
+       } else if (vd->vdev_top_zap != 0) {
+               objid = vd->vdev_top_zap;
+       } else if (vd->vdev_leaf_zap != 0) {
+               objid = vd->vdev_leaf_zap;
+       } else {
+               return (EINVAL);
+       }
+
+       err = zap_lookup(mos, objid, vdev_prop_to_name(prop),
+           sizeof (uint64_t), 1, value);
+
+       if (err == ENOENT)
+               *value = vdev_prop_default_numeric(prop);
+
+       return (err);
+}
+
+/*
+ * Get the number of data disks for a top-level vdev.
+ */
+uint64_t
+vdev_get_ndisks(vdev_t *vd)
+{
+       uint64_t ndisks = 1;
+
+       if (vd->vdev_ops->vdev_op_ndisks != NULL)
+               ndisks = vd->vdev_ops->vdev_op_ndisks(vd);
+
+       return (ndisks);
+}
+
 vdev_t *
 vdev_lookup_top(spa_t *spa, uint64_t vdev)
 {
@@ -389,7 +511,7 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd)
 
        newchild = kmem_alloc(newsize, KM_SLEEP);
        if (pvd->vdev_child != NULL) {
-               bcopy(pvd->vdev_child, newchild, oldsize);
+               memcpy(newchild, pvd->vdev_child, oldsize);
                kmem_free(pvd->vdev_child, oldsize);
        }
 
@@ -543,14 +665,25 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
         */
        zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
            1);
+       zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_slow_io_events_per_second,
+           1);
        zfs_ratelimit_init(&vd->vdev_checksum_rl,
            &zfs_checksum_events_per_second, 1);
 
+       /*
+        * Default Thresholds for tuning ZED
+        */
+       vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N);
+       vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T);
+       vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N);
+       vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T);
+
        list_link_init(&vd->vdev_config_dirty_node);
        list_link_init(&vd->vdev_state_dirty_node);
        list_link_init(&vd->vdev_initialize_node);
        list_link_init(&vd->vdev_leaf_node);
        list_link_init(&vd->vdev_trim_node);
+
        mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
        mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -566,12 +699,11 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
        mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
+       cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
 
        mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
-       cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL);
 
        for (int t = 0; t < DTL_TYPES; t++) {
                vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
@@ -584,7 +716,6 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
            offsetof(struct vdev, vdev_dtl_node));
        vd->vdev_stat.vs_timestamp = gethrtime();
        vdev_queue_init(vd);
-       vdev_cache_init(vd);
 
        return (vd);
 }
@@ -599,11 +730,11 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
     int alloctype)
 {
        vdev_ops_t *ops;
-       char *type;
-       uint64_t guid = 0, islog, nparity;
+       const char *type;
+       uint64_t guid = 0, islog;
        vdev_t *vd;
        vdev_indirect_config_t *vic;
-       char *tmp = NULL;
+       const char *tmp = NULL;
        int rc;
        vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
        boolean_t top_level = (parent && !parent->vdev_parent);
@@ -657,48 +788,13 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
        if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
                return (SET_ERROR(ENOTSUP));
 
-       /*
-        * Set the nparity property for RAID-Z vdevs.
-        */
-       nparity = -1ULL;
-       if (ops == &vdev_raidz_ops) {
-               if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
-                   &nparity) == 0) {
-                       if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
-                               return (SET_ERROR(EINVAL));
-                       /*
-                        * Previous versions could only support 1 or 2 parity
-                        * device.
-                        */
-                       if (nparity > 1 &&
-                           spa_version(spa) < SPA_VERSION_RAIDZ2)
-                               return (SET_ERROR(ENOTSUP));
-                       if (nparity > 2 &&
-                           spa_version(spa) < SPA_VERSION_RAIDZ3)
-                               return (SET_ERROR(ENOTSUP));
-               } else {
-                       /*
-                        * We require the parity to be specified for SPAs that
-                        * support multiple parity levels.
-                        */
-                       if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
-                               return (SET_ERROR(EINVAL));
-                       /*
-                        * Otherwise, we default to 1 parity device for RAID-Z.
-                        */
-                       nparity = 1;
-               }
-       } else {
-               nparity = 0;
-       }
-       ASSERT(nparity != -1ULL);
-
-       /*
-        * If creating a top-level vdev, check for allocation classes input
-        */
        if (top_level && alloctype == VDEV_ALLOC_ADD) {
-               char *bias;
+               const char *bias;
 
+               /*
+                * If creating a top-level vdev, check for allocation
+                * classes input.
+                */
                if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
                    &bias) == 0) {
                        alloc_bias = vdev_derive_alloc_bias(bias);
@@ -710,18 +806,37 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
                                return (SET_ERROR(ENOTSUP));
                        }
                }
+
+               /* spa_vdev_add() expects feature to be enabled */
+               if (ops == &vdev_draid_ops &&
+                   spa->spa_load_state != SPA_LOAD_CREATE &&
+                   !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
+                       return (SET_ERROR(ENOTSUP));
+               }
        }
 
-       vd = vdev_alloc_common(spa, id, guid, ops);
-       vic = &vd->vdev_indirect_config;
+       /*
+        * Initialize the vdev specific data.  This is done before calling
+        * vdev_alloc_common() since it may fail and this simplifies the
+        * error reporting and cleanup code paths.
+        */
+       void *tsd = NULL;
+       if (ops->vdev_op_init != NULL) {
+               rc = ops->vdev_op_init(spa, nv, &tsd);
+               if (rc != 0) {
+                       return (rc);
+               }
+       }
 
+       vd = vdev_alloc_common(spa, id, guid, ops);
+       vd->vdev_tsd = tsd;
        vd->vdev_islog = islog;
-       vd->vdev_nparity = nparity;
+
        if (top_level && alloc_bias != VDEV_BIAS_NONE)
                vd->vdev_alloc_bias = alloc_bias;
 
-       if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
-               vd->vdev_path = spa_strdup(vd->vdev_path);
+       if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0)
+               vd->vdev_path = spa_strdup(tmp);
 
        /*
         * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a
@@ -735,18 +850,17 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
                vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
        }
 
-       if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
-               vd->vdev_devid = spa_strdup(vd->vdev_devid);
-       if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
-           &vd->vdev_physpath) == 0)
-               vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
+       if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0)
+               vd->vdev_devid = spa_strdup(tmp);
+       if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0)
+               vd->vdev_physpath = spa_strdup(tmp);
 
        if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
-           &vd->vdev_enc_sysfs_path) == 0)
-               vd->vdev_enc_sysfs_path = spa_strdup(vd->vdev_enc_sysfs_path);
+           &tmp) == 0)
+               vd->vdev_enc_sysfs_path = spa_strdup(tmp);
 
-       if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
-               vd->vdev_fru = spa_strdup(vd->vdev_fru);
+       if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0)
+               vd->vdev_fru = spa_strdup(tmp);
 
        /*
         * Set the whole_disk property.  If it's not specified, leave the value
@@ -756,6 +870,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
            &vd->vdev_wholedisk) != 0)
                vd->vdev_wholedisk = -1ULL;
 
+       vic = &vd->vdev_indirect_config;
+
        ASSERT0(vic->vic_mapping_object);
        (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
            &vic->vic_mapping_object);
@@ -774,9 +890,15 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
            &vd->vdev_not_present);
 
        /*
-        * Get the alignment requirement.
+        * Get the alignment requirement. Ignore pool ashift for vdev
+        * attach case.
         */
-       (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
+       if (alloctype != VDEV_ALLOC_ATTACH) {
+               (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
+                   &vd->vdev_ashift);
+       } else {
+               vd->vdev_attaching = B_TRUE;
+       }
 
        /*
         * Retrieve the vdev creation time.
@@ -784,6 +906,14 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
        (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
            &vd->vdev_crtxg);
 
+       if (vd->vdev_ops == &vdev_root_ops &&
+           (alloctype == VDEV_ALLOC_LOAD ||
+           alloctype == VDEV_ALLOC_SPLIT ||
+           alloctype == VDEV_ALLOC_ROOTPOOL)) {
+               (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP,
+                   &vd->vdev_root_zap);
+       }
+
        /*
         * If we're a top-level vdev, try to load the allocation parameters.
         */
@@ -795,10 +925,14 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
                    &vd->vdev_ms_shift);
                (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
                    &vd->vdev_asize);
+               (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING,
+                   &vd->vdev_noalloc);
                (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
                    &vd->vdev_removing);
                (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
                    &vd->vdev_top_zap);
+               vd->vdev_rz_expanding = nvlist_exists(nv,
+                   ZPOOL_CONFIG_RAIDZ_EXPANDING);
        } else {
                ASSERT0(vd->vdev_top_zap);
        }
@@ -873,7 +1007,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
                            &vd->vdev_removed);
 
                        if (vd->vdev_faulted || vd->vdev_degraded) {
-                               char *aux;
+                               const char *aux;
 
                                vd->vdev_label_aux =
                                    VDEV_AUX_ERR_EXCEEDED;
@@ -937,6 +1071,9 @@ vdev_free(vdev_t *vd)
        ASSERT(vd->vdev_child == NULL);
        ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
 
+       if (vd->vdev_ops->vdev_op_fini != NULL)
+               vd->vdev_ops->vdev_op_fini(vd);
+
        /*
         * Discard allocation state.
         */
@@ -945,6 +1082,11 @@ vdev_free(vdev_t *vd)
                metaslab_group_destroy(vd->vdev_mg);
                vd->vdev_mg = NULL;
        }
+       if (vd->vdev_log_mg != NULL) {
+               ASSERT0(vd->vdev_ms_count);
+               metaslab_group_destroy(vd->vdev_log_mg);
+               vd->vdev_log_mg = NULL;
+       }
 
        ASSERT0(vd->vdev_stat.vs_space);
        ASSERT0(vd->vdev_stat.vs_dspace);
@@ -962,7 +1104,6 @@ vdev_free(vdev_t *vd)
         * Clean up vdev structure.
         */
        vdev_queue_fini(vd);
-       vdev_cache_fini(vd);
 
        if (vd->vdev_path)
                spa_strfree(vd->vdev_path);
@@ -1025,14 +1166,14 @@ vdev_free(vdev_t *vd)
        mutex_destroy(&vd->vdev_trim_io_lock);
        cv_destroy(&vd->vdev_trim_cv);
        cv_destroy(&vd->vdev_autotrim_cv);
+       cv_destroy(&vd->vdev_autotrim_kick_cv);
        cv_destroy(&vd->vdev_trim_io_cv);
 
        mutex_destroy(&vd->vdev_rebuild_lock);
-       mutex_destroy(&vd->vdev_rebuild_io_lock);
        cv_destroy(&vd->vdev_rebuild_cv);
-       cv_destroy(&vd->vdev_rebuild_io_cv);
 
        zfs_ratelimit_fini(&vd->vdev_delay_rl);
+       zfs_ratelimit_fini(&vd->vdev_deadman_rl);
        zfs_ratelimit_fini(&vd->vdev_checksum_rl);
 
        if (vd == spa->spa_root_vdev)
@@ -1054,7 +1195,6 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 
        ASSERT(tvd == tvd->vdev_top);
 
-       tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite;
        tvd->vdev_ms_array = svd->vdev_ms_array;
        tvd->vdev_ms_shift = svd->vdev_ms_shift;
        tvd->vdev_ms_count = svd->vdev_ms_count;
@@ -1067,14 +1207,20 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 
        if (tvd->vdev_mg)
                ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
+       if (tvd->vdev_log_mg)
+               ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg);
        tvd->vdev_mg = svd->vdev_mg;
+       tvd->vdev_log_mg = svd->vdev_log_mg;
        tvd->vdev_ms = svd->vdev_ms;
 
        svd->vdev_mg = NULL;
+       svd->vdev_log_mg = NULL;
        svd->vdev_ms = NULL;
 
        if (tvd->vdev_mg != NULL)
                tvd->vdev_mg->mg_vd = tvd;
+       if (tvd->vdev_log_mg != NULL)
+               tvd->vdev_log_mg->mg_vd = tvd;
 
        tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
        svd->vdev_checkpoint_sm = NULL;
@@ -1100,8 +1246,10 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
        ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
        ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
        ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
+       ASSERT0(tvd->vdev_noalloc);
        ASSERT0(tvd->vdev_removing);
        ASSERT0(tvd->vdev_rebuilding);
+       tvd->vdev_noalloc = svd->vdev_noalloc;
        tvd->vdev_removing = svd->vdev_removing;
        tvd->vdev_rebuilding = svd->vdev_rebuilding;
        tvd->vdev_rebuild_config = svd->vdev_rebuild_config;
@@ -1117,6 +1265,7 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
        svd->vdev_indirect_mapping = NULL;
        svd->vdev_indirect_births = NULL;
        svd->vdev_obsolete_sm = NULL;
+       svd->vdev_noalloc = 0;
        svd->vdev_removing = 0;
        svd->vdev_rebuilding = 0;
 
@@ -1161,7 +1310,8 @@ vdev_top_update(vdev_t *tvd, vdev_t *vd)
 }
 
 /*
- * Add a mirror/replacing vdev above an existing vdev.
+ * Add a mirror/replacing vdev above an existing vdev.  There is no need to
+ * call .vdev_op_init() since mirror/replacing vdevs do not have private state.
  */
 vdev_t *
 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
@@ -1251,7 +1401,37 @@ vdev_remove_parent(vdev_t *cvd)
        vdev_free(mvd);
 }
 
+/*
+ * Choose GCD for spa_gcd_alloc.
+ */
+static uint64_t
+vdev_gcd(uint64_t a, uint64_t b)
+{
+       while (b != 0) {
+               uint64_t t = b;
+               b = a % b;
+               a = t;
+       }
+       return (a);
+}
+
+/*
+ * Set spa_min_alloc and spa_gcd_alloc.
+ */
 static void
+vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc)
+{
+       if (min_alloc < spa->spa_min_alloc)
+               spa->spa_min_alloc = min_alloc;
+       if (spa->spa_gcd_alloc == INT_MAX) {
+               spa->spa_gcd_alloc = min_alloc;
+       } else {
+               spa->spa_gcd_alloc = vdev_gcd(min_alloc,
+                   spa->spa_gcd_alloc);
+       }
+}
+
+void
 vdev_metaslab_group_create(vdev_t *vd)
 {
        spa_t *spa = vd->vdev_spa;
@@ -1285,10 +1465,15 @@ vdev_metaslab_group_create(vdev_t *vd)
                vd->vdev_mg = metaslab_group_create(mc, vd,
                    spa->spa_alloc_count);
 
+               if (!vd->vdev_islog) {
+                       vd->vdev_log_mg = metaslab_group_create(
+                           spa_embedded_log_class(spa), vd, 1);
+               }
+
                /*
-                * The spa ashift values currently only reflect the
-                * general vdev classes. Class destination is late
-                * binding so ashift checking had to wait until now
+                * The spa ashift min/max only apply for the normal metaslab
+                * class. Class destination is late binding so ashift boundary
+                * setting had to wait until now.
                 */
                if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
                    mc == spa_normal_class(spa) && vd->vdev_aux == NULL) {
@@ -1296,6 +1481,9 @@ vdev_metaslab_group_create(vdev_t *vd)
                                spa->spa_max_ashift = vd->vdev_ashift;
                        if (vd->vdev_ashift < spa->spa_min_ashift)
                                spa->spa_min_ashift = vd->vdev_ashift;
+
+                       uint64_t min_alloc = vdev_get_min_alloc(vd);
+                       vdev_spa_set_alloc(spa, min_alloc);
                }
        }
 }
@@ -1304,8 +1492,6 @@ int
 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 {
        spa_t *spa = vd->vdev_spa;
-       objset_t *mos = spa->spa_meta_objset;
-       uint64_t m;
        uint64_t oldc = vd->vdev_ms_count;
        uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
        metaslab_t **mspp;
@@ -1327,22 +1513,23 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
        mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
 
        if (expanding) {
-               bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
+               memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp));
                vmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
        }
 
        vd->vdev_ms = mspp;
        vd->vdev_ms_count = newc;
-       for (m = oldc; m < newc; m++) {
-               uint64_t object = 0;
 
+       for (uint64_t m = oldc; m < newc; m++) {
+               uint64_t object = 0;
                /*
                 * vdev_ms_array may be 0 if we are creating the "fake"
                 * metaslabs for an indirect vdev for zdb's leak detection.
                 * See zdb_leak_init().
                 */
                if (txg == 0 && vd->vdev_ms_array != 0) {
-                       error = dmu_read(mos, vd->vdev_ms_array,
+                       error = dmu_read(spa->spa_meta_objset,
+                           vd->vdev_ms_array,
                            m * sizeof (uint64_t), sizeof (uint64_t), &object,
                            DMU_READ_PREFETCH);
                        if (error != 0) {
@@ -1352,17 +1539,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
                        }
                }
 
-#ifndef _KERNEL
-               /*
-                * To accommodate zdb_leak_init() fake indirect
-                * metaslabs, we allocate a metaslab group for
-                * indirect vdevs which normally don't have one.
-                */
-               if (vd->vdev_mg == NULL) {
-                       ASSERT0(vdev_is_concrete(vd));
-                       vdev_metaslab_group_create(vd);
-               }
-#endif
                error = metaslab_init(vd->vdev_mg, m, object, txg,
                    &(vd->vdev_ms[m]));
                if (error != 0) {
@@ -1372,28 +1548,68 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
                }
        }
 
+       /*
+        * Find the emptiest metaslab on the vdev and mark it for use for
+        * embedded slog by moving it from the regular to the log metaslab
+        * group.
+        */
+       if (vd->vdev_mg->mg_class == spa_normal_class(spa) &&
+           vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
+           avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
+               uint64_t slog_msid = 0;
+               uint64_t smallest = UINT64_MAX;
+
+               /*
+                * Note, we only search the new metaslabs, because the old
+                * (pre-existing) ones may be active (e.g. have non-empty
+                * range_tree's), and we don't move them to the new
+                * metaslab_t.
+                */
+               for (uint64_t m = oldc; m < newc; m++) {
+                       uint64_t alloc =
+                           space_map_allocated(vd->vdev_ms[m]->ms_sm);
+                       if (alloc < smallest) {
+                               slog_msid = m;
+                               smallest = alloc;
+                       }
+               }
+               metaslab_t *slog_ms = vd->vdev_ms[slog_msid];
+               /*
+                * The metaslab was marked as dirty at the end of
+                * metaslab_init(). Remove it from the dirty list so that we
+                * can uninitialize and reinitialize it to the new class.
+                */
+               if (txg != 0) {
+                       (void) txg_list_remove_this(&vd->vdev_ms_list,
+                           slog_ms, txg);
+               }
+               uint64_t sm_obj = space_map_object(slog_ms->ms_sm);
+               metaslab_fini(slog_ms);
+               VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg,
+                   &vd->vdev_ms[slog_msid]));
+       }
+
        if (txg == 0)
                spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
 
        /*
-        * If the vdev is being removed we don't activate
-        * the metaslabs since we want to ensure that no new
-        * allocations are performed on this device.
+        * If the vdev is marked as non-allocating then don't
+        * activate the metaslabs since we want to ensure that
+        * no allocations are performed on this device.
         */
-       if (!expanding && !vd->vdev_removing) {
+       if (vd->vdev_noalloc) {
+               /* track non-allocating vdev space */
+               spa->spa_nonallocating_dspace += spa_deflate(spa) ?
+                   vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
+       } else if (!expanding) {
                metaslab_group_activate(vd->vdev_mg);
+               if (vd->vdev_log_mg != NULL)
+                       metaslab_group_activate(vd->vdev_log_mg);
        }
 
        if (txg == 0)
                spa_config_exit(spa, SCL_ALLOC, FTAG);
 
-       /*
-        * Regardless whether this vdev was just added or it is being
-        * expanded, the metaslab count has changed. Recalculate the
-        * block limit.
-        */
-       spa_log_sm_set_blocklimit(spa);
-
        return (0);
 }
 
@@ -1417,7 +1633,12 @@ vdev_metaslab_fini(vdev_t *vd)
 
        if (vd->vdev_ms != NULL) {
                metaslab_group_t *mg = vd->vdev_mg;
+
                metaslab_group_passivate(mg);
+               if (vd->vdev_log_mg != NULL) {
+                       ASSERT(!vd->vdev_islog);
+                       metaslab_group_passivate(vd->vdev_log_mg);
+               }
 
                uint64_t count = vd->vdev_ms_count;
                for (uint64_t m = 0; m < count; m++) {
@@ -1427,14 +1648,15 @@ vdev_metaslab_fini(vdev_t *vd)
                }
                vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
                vd->vdev_ms = NULL;
-
                vd->vdev_ms_count = 0;
 
-               for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+               for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
                        ASSERT0(mg->mg_histogram[i]);
+                       if (vd->vdev_log_mg != NULL)
+                               ASSERT0(vd->vdev_log_mg->mg_histogram[i]);
+               }
        }
        ASSERT0(vd->vdev_ms_count);
-       ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
 }
 
 typedef struct vdev_probe_stats {
@@ -1473,6 +1695,8 @@ vdev_probe_done(zio_t *zio)
 
                vd->vdev_cant_read |= !vps->vps_readable;
                vd->vdev_cant_write |= !vps->vps_writeable;
+               vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u",
+                   vd->vdev_cant_read, vd->vdev_cant_write);
 
                if (vdev_readable(vd) &&
                    (vdev_writeable(vd) || !spa_writeable(spa))) {
@@ -1481,7 +1705,7 @@ vdev_probe_done(zio_t *zio)
                        ASSERT(zio->io_error != 0);
                        vdev_dbgmsg(vd, "failed probe");
                        (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
-                           spa, vd, NULL, NULL, 0, 0);
+                           spa, vd, NULL, NULL, 0);
                        zio->io_error = SET_ERROR(ENXIO);
                }
 
@@ -1532,8 +1756,7 @@ vdev_probe(vdev_t *vd, zio_t *zio)
                vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
 
                vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
-                   ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
-                   ZIO_FLAG_TRYHARD;
+                   ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
 
                if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
                        /*
@@ -1597,6 +1820,14 @@ vdev_probe(vdev_t *vd, zio_t *zio)
        return (NULL);
 }
 
+static void
+vdev_load_child(void *arg)
+{
+       vdev_t *vd = arg;
+
+       vd->vdev_load_error = vdev_load(vd);
+}
+
 static void
 vdev_open_child(void *arg)
 {
@@ -1622,53 +1853,136 @@ vdev_uses_zvols(vdev_t *vd)
        return (B_FALSE);
 }
 
-void
-vdev_open_children(vdev_t *vd)
+/*
+ * Returns B_TRUE if the passed child should be opened.
+ */
+static boolean_t
+vdev_default_open_children_func(vdev_t *vd)
+{
+       (void) vd;
+       return (B_TRUE);
+}
+
+/*
+ * Open the requested child vdevs.  If any of the leaf vdevs are using
+ * a ZFS volume then do the opens in a single thread.  This avoids a
+ * deadlock when the current thread is holding the spa_namespace_lock.
+ */
+static void
+vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
 {
-       taskq_t *tq;
        int children = vd->vdev_children;
 
-       /*
-        * in order to handle pools on top of zvols, do the opens
-        * in a single thread so that the same thread holds the
-        * spa_namespace_lock
-        */
-       if (vdev_uses_zvols(vd)) {
-retry_sync:
-               for (int c = 0; c < children; c++)
-                       vd->vdev_child[c]->vdev_open_error =
-                           vdev_open(vd->vdev_child[c]);
-       } else {
-               tq = taskq_create("vdev_open", children, minclsyspri,
-                   children, children, TASKQ_PREPOPULATE);
-               if (tq == NULL)
-                       goto retry_sync;
+       taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
+           children, children, TASKQ_PREPOPULATE);
+       vd->vdev_nonrot = B_TRUE;
 
-               for (int c = 0; c < children; c++)
+       for (int c = 0; c < children; c++) {
+               vdev_t *cvd = vd->vdev_child[c];
+
+               if (open_func(cvd) == B_FALSE)
+                       continue;
+
+               if (tq == NULL || vdev_uses_zvols(vd)) {
+                       cvd->vdev_open_error = vdev_open(cvd);
+               } else {
                        VERIFY(taskq_dispatch(tq, vdev_open_child,
-                           vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID);
+                           cvd, TQ_SLEEP) != TASKQID_INVALID);
+               }
+
+               vd->vdev_nonrot &= cvd->vdev_nonrot;
+       }
 
+       if (tq != NULL) {
+               taskq_wait(tq);
                taskq_destroy(tq);
        }
+}
 
-       vd->vdev_nonrot = B_TRUE;
+/*
+ * Open all child vdevs.
+ */
+void
+vdev_open_children(vdev_t *vd)
+{
+       vdev_open_children_impl(vd, vdev_default_open_children_func);
+}
 
-       for (int c = 0; c < children; c++)
-               vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
+/*
+ * Conditionally open a subset of child vdevs.
+ */
+void
+vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
+{
+       vdev_open_children_impl(vd, open_func);
 }
 
 /*
- * Compute the raidz-deflation ratio.  Note, we hard-code
- * in 128k (1 << 17) because it is the "typical" blocksize.
- * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
- * otherwise it would inconsistently account for existing bp's.
+ * Compute the raidz-deflation ratio.  Note, we hard-code 128k (1 << 17)
+ * because it is the "typical" blocksize.  Even though SPA_MAXBLOCKSIZE
+ * changed, this algorithm can not change, otherwise it would inconsistently
+ * account for existing bp's.  We also hard-code txg 0 for the same reason
+ * since expanded RAIDZ vdevs can use a different asize for different birth
+ * txg's.
  */
 static void
 vdev_set_deflate_ratio(vdev_t *vd)
 {
        if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
                vd->vdev_deflate_ratio = (1 << 17) /
-                   (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
+                   (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >>
+                   SPA_MINBLOCKSHIFT);
+       }
+}
+
+/*
+ * Choose the best of two ashifts, preferring one between logical ashift
+ * (absolute minimum) and administrator defined maximum, otherwise take
+ * the biggest of the two.
+ */
+uint64_t
+vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b)
+{
+       if (a > logical && a <= zfs_vdev_max_auto_ashift) {
+               if (b <= logical || b > zfs_vdev_max_auto_ashift)
+                       return (a);
+               else
+                       return (MAX(a, b));
+       } else if (b <= logical || b > zfs_vdev_max_auto_ashift)
+               return (MAX(a, b));
+       return (b);
+}
+
+/*
+ * Maximize performance by inflating the configured ashift for top level
+ * vdevs to be as close to the physical ashift as possible while maintaining
+ * administrator defined limits and ensuring it doesn't go below the
+ * logical ashift.
+ */
+static void
+vdev_ashift_optimize(vdev_t *vd)
+{
+       ASSERT(vd == vd->vdev_top);
+
+       if (vd->vdev_ashift < vd->vdev_physical_ashift &&
+           vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) {
+               vd->vdev_ashift = MIN(
+                   MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
+                   MAX(zfs_vdev_min_auto_ashift,
+                   vd->vdev_physical_ashift));
+       } else {
+               /*
+                * If the logical and physical ashifts are the same, then
+                * we ensure that the top-level vdev's ashift is not smaller
+                * than our minimum ashift value. For the unusual case
+                * where logical ashift > physical ashift, we can't cap
+                * the calculated ashift based on max ashift as that
+                * would cause failures.
+                * We still check if we need to increase it to match
+                * the min ashift.
+                */
+               vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
+                   vd->vdev_ashift);
        }
 }
 
@@ -1716,6 +2030,14 @@ vdev_open(vdev_t *vd)
 
        error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
            &logical_ashift, &physical_ashift);
+
+       /* Keep the device in removed state if unplugged */
+       if (error == ENOENT && vd->vdev_removed) {
+               vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED,
+                   VDEV_AUX_NONE);
+               return (error);
+       }
+
        /*
         * Physical volume size should never be larger than its max size, unless
         * the disk has shrunk while we were reading it or the device is buggy
@@ -1830,16 +2152,17 @@ vdev_open(vdev_t *vd)
                return (SET_ERROR(EINVAL));
        }
 
+       /*
+        * We can always set the logical/physical ashift members since
+        * their values are only used to calculate the vdev_ashift when
+        * the device is first added to the config. These values should
+        * not be used for anything else since they may change whenever
+        * the device is reopened and we don't store them in the label.
+        */
        vd->vdev_physical_ashift =
            MAX(physical_ashift, vd->vdev_physical_ashift);
-       vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift);
-       vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift);
-
-       if (vd->vdev_logical_ashift > ASHIFT_MAX) {
-               vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_ASHIFT_TOO_BIG);
-               return (SET_ERROR(EDOM));
-       }
+       vd->vdev_logical_ashift = MAX(logical_ashift,
+           vd->vdev_logical_ashift);
 
        if (vd->vdev_asize == 0) {
                /*
@@ -1848,6 +2171,24 @@ vdev_open(vdev_t *vd)
                 */
                vd->vdev_asize = asize;
                vd->vdev_max_asize = max_asize;
+
+               /*
+                * If the vdev_ashift was not overridden at creation time,
+                * then set it the logical ashift and optimize the ashift.
+                */
+               if (vd->vdev_ashift == 0) {
+                       vd->vdev_ashift = vd->vdev_logical_ashift;
+
+                       if (vd->vdev_logical_ashift > ASHIFT_MAX) {
+                               vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+                                   VDEV_AUX_ASHIFT_TOO_BIG);
+                               return (SET_ERROR(EDOM));
+                       }
+
+                       if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE)
+                               vdev_ashift_optimize(vd);
+                       vd->vdev_attaching = B_FALSE;
+               }
                if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
                    vd->vdev_ashift > ASHIFT_MAX)) {
                        vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
@@ -1862,11 +2203,10 @@ vdev_open(vdev_t *vd)
                    vd->vdev_ops->vdev_op_leaf) {
                        (void) zfs_ereport_post(
                            FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
-                           spa, vd, NULL, NULL, 0, 0);
+                           spa, vd, NULL, NULL, 0);
                        vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
                            VDEV_AUX_BAD_LABEL);
                        return (SET_ERROR(EDOM));
-
                }
                vd->vdev_max_asize = max_asize;
        }
@@ -1903,15 +2243,12 @@ vdev_open(vdev_t *vd)
        }
 
        /*
-        * Track the min and max ashift values for normal data devices.
+        * Track the minimum allocation size.
         */
        if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
-           vd->vdev_alloc_bias == VDEV_BIAS_NONE &&
            vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
-               if (vd->vdev_ashift > spa->spa_max_ashift)
-                       spa->spa_max_ashift = vd->vdev_ashift;
-               if (vd->vdev_ashift < spa->spa_min_ashift)
-                       spa->spa_min_ashift = vd->vdev_ashift;
+               uint64_t min_alloc = vdev_get_min_alloc(vd);
+               vdev_spa_set_alloc(spa, min_alloc);
        }
 
        /*
@@ -1925,6 +2262,16 @@ vdev_open(vdev_t *vd)
        return (0);
 }
 
+static void
+vdev_validate_child(void *arg)
+{
+       vdev_t *vd = arg;
+
+       vd->vdev_validate_thread = curthread;
+       vd->vdev_validate_error = vdev_validate(vd);
+       vd->vdev_validate_thread = NULL;
+}
+
 /*
  * Called once the vdevs are all opened, this routine validates the label
  * contents. This needs to be done before vdev_load() so that we don't
@@ -1939,18 +2286,43 @@ int
 vdev_validate(vdev_t *vd)
 {
        spa_t *spa = vd->vdev_spa;
+       taskq_t *tq = NULL;
        nvlist_t *label;
        uint64_t guid = 0, aux_guid = 0, top_guid;
        uint64_t state;
        nvlist_t *nvl;
        uint64_t txg;
+       int children = vd->vdev_children;
 
        if (vdev_validate_skip)
                return (0);
 
-       for (uint64_t c = 0; c < vd->vdev_children; c++)
-               if (vdev_validate(vd->vdev_child[c]) != 0)
+       if (children > 0) {
+               tq = taskq_create("vdev_validate", children, minclsyspri,
+                   children, children, TASKQ_PREPOPULATE);
+       }
+
+       for (uint64_t c = 0; c < children; c++) {
+               vdev_t *cvd = vd->vdev_child[c];
+
+               if (tq == NULL || vdev_uses_zvols(cvd)) {
+                       vdev_validate_child(cvd);
+               } else {
+                       VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd,
+                           TQ_SLEEP) != TASKQID_INVALID);
+               }
+       }
+       if (tq != NULL) {
+               taskq_wait(tq);
+               taskq_destroy(tq);
+       }
+       for (int c = 0; c < children; c++) {
+               int error = vd->vdev_child[c]->vdev_validate_error;
+
+               if (error != 0)
                        return (SET_ERROR(EBADF));
+       }
+
 
        /*
         * If the device has already failed, or was marked offline, don't do
@@ -1973,7 +2345,7 @@ vdev_validate(vdev_t *vd)
                txg = spa_last_synced_txg(spa);
 
        if ((label = vdev_label_read_config(vd, txg)) == NULL) {
-               vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+               vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_BAD_LABEL);
                vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
                    "txg %llu", (u_longlong_t)txg);
@@ -2121,26 +2493,64 @@ vdev_validate(vdev_t *vd)
 }
 
 static void
-vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
+vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid)
 {
-       if (svd->vdev_path != NULL && dvd->vdev_path != NULL) {
-               if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) {
-                       zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed "
-                           "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
-                           dvd->vdev_path, svd->vdev_path);
-                       spa_strfree(dvd->vdev_path);
-                       dvd->vdev_path = spa_strdup(svd->vdev_path);
+       if (svd != NULL && *dvd != NULL) {
+               if (strcmp(svd, *dvd) != 0) {
+                       zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed "
+                           "from '%s' to '%s'", (u_longlong_t)guid, prefix,
+                           *dvd, svd);
+                       spa_strfree(*dvd);
+                       *dvd = spa_strdup(svd);
                }
-       } else if (svd->vdev_path != NULL) {
-               dvd->vdev_path = spa_strdup(svd->vdev_path);
+       } else if (svd != NULL) {
+               *dvd = spa_strdup(svd);
                zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
-                   (u_longlong_t)dvd->vdev_guid, dvd->vdev_path);
+                   (u_longlong_t)guid, *dvd);
        }
 }
 
-/*
- * Recursively copy vdev paths from one vdev to another. Source and destination
- * vdev trees must have same geometry otherwise return error. Intended to copy
+static void
+vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
+{
+       char *old, *new;
+
+       vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path,
+           dvd->vdev_guid);
+
+       vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid,
+           dvd->vdev_guid);
+
+       vdev_update_path("vdev_physpath", svd->vdev_physpath,
+           &dvd->vdev_physpath, dvd->vdev_guid);
+
+       /*
+        * Our enclosure sysfs path may have changed between imports
+        */
+       old = dvd->vdev_enc_sysfs_path;
+       new = svd->vdev_enc_sysfs_path;
+       if ((old != NULL && new == NULL) ||
+           (old == NULL && new != NULL) ||
+           ((old != NULL && new != NULL) && strcmp(new, old) != 0)) {
+               zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path "
+                   "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
+                   old, new);
+
+               if (dvd->vdev_enc_sysfs_path)
+                       spa_strfree(dvd->vdev_enc_sysfs_path);
+
+               if (svd->vdev_enc_sysfs_path) {
+                       dvd->vdev_enc_sysfs_path = spa_strdup(
+                           svd->vdev_enc_sysfs_path);
+               } else {
+                       dvd->vdev_enc_sysfs_path = NULL;
+               }
+       }
+}
+
+/*
+ * Recursively copy vdev paths from one vdev to another. Source and destination
+ * vdev trees must have same geometry otherwise return error. Intended to copy
  * paths from userland config into MOS config.
  */
 int
@@ -2240,7 +2650,9 @@ vdev_close(vdev_t *vd)
        vdev_t *pvd = vd->vdev_parent;
        spa_t *spa __maybe_unused = vd->vdev_spa;
 
-       ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+       ASSERT(vd != NULL);
+       ASSERT(vd->vdev_open_thread == curthread ||
+           spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
        /*
         * If our parent is reopening, then we are as well, unless we are
@@ -2251,8 +2663,6 @@ vdev_close(vdev_t *vd)
 
        vd->vdev_ops->vdev_op_close(vd);
 
-       vdev_cache_purge(vd);
-
        /*
         * We record the previous state before we close it, so that if we are
         * doing a reopen(), we don't generate FMA ereports if we notice that
@@ -2279,7 +2689,7 @@ vdev_hold(vdev_t *vd)
        for (int c = 0; c < vd->vdev_children; c++)
                vdev_hold(vd->vdev_child[c]);
 
-       if (vd->vdev_ops->vdev_op_leaf)
+       if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL)
                vd->vdev_ops->vdev_op_hold(vd);
 }
 
@@ -2290,7 +2700,7 @@ vdev_rele(vdev_t *vd)
        for (int c = 0; c < vd->vdev_children; c++)
                vdev_rele(vd->vdev_child[c]);
 
-       if (vd->vdev_ops->vdev_op_leaf)
+       if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL)
                vd->vdev_ops->vdev_op_rele(vd);
 }
 
@@ -2338,6 +2748,17 @@ vdev_reopen(vdev_t *vd)
                (void) vdev_validate(vd);
        }
 
+       /*
+        * Recheck if resilver is still needed and cancel any
+        * scheduled resilver if resilver is unneeded.
+        */
+       if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) &&
+           spa->spa_async_tasks & SPA_ASYNC_RESILVER) {
+               mutex_enter(&spa->spa_async_lock);
+               spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER;
+               mutex_exit(&spa->spa_async_lock);
+       }
+
        /*
         * Reassess parent vdev's health.
         */
@@ -2445,35 +2866,6 @@ vdev_metaslab_set_size(vdev_t *vd)
        ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
 }
 
-/*
- * Maximize performance by inflating the configured ashift for top level
- * vdevs to be as close to the physical ashift as possible while maintaining
- * administrator defined limits and ensuring it doesn't go below the
- * logical ashift.
- */
-void
-vdev_ashift_optimize(vdev_t *vd)
-{
-       if (vd == vd->vdev_top) {
-               if (vd->vdev_ashift < vd->vdev_physical_ashift) {
-                       vd->vdev_ashift = MIN(
-                           MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
-                           MAX(zfs_vdev_min_auto_ashift,
-                           vd->vdev_physical_ashift));
-               } else {
-                       /*
-                        * Unusual case where logical ashift > physical ashift
-                        * so we can't cap the calculated ashift based on max
-                        * ashift as that would cause failures.
-                        * We still check if we need to increase it to match
-                        * the min ashift.
-                        */
-                       vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
-                           vd->vdev_ashift);
-               }
-       }
-}
-
 void
 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
 {
@@ -2566,15 +2958,12 @@ vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 
        /*
         * While we are loading the pool, the DTLs have not been loaded yet.
-        * Ignore the DTLs and try all devices.  This avoids a recursive
-        * mutex enter on the vdev_dtl_lock, and also makes us try hard
-        * when loading the pool (relying on the checksum to ensure that
-        * we get the right data -- note that we while loading, we are
-        * only reading the MOS, which is always checksummed).
+        * This isn't a problem but it can result in devices being tried
+        * which are known to not have the data.  In which case, the import
+        * is relying on the checksum to ensure that we get the right data.
+        * Note that while importing we are only reading the MOS, which is
+        * always checksummed.
         */
-       if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE)
-               return (B_FALSE);
-
        mutex_enter(&vd->vdev_dtl_lock);
        if (!range_tree_is_empty(rt))
                dirty = range_tree_contains(rt, txg, size);
@@ -2597,10 +2986,28 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
 }
 
 /*
- * Returns B_TRUE if vdev determines offset needs to be resilvered.
+ * Check if the txg falls within the range which must be
+ * resilvered.  DVAs outside this range can always be skipped.
  */
 boolean_t
-vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
+vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+    uint64_t phys_birth)
+{
+       (void) dva, (void) psize;
+
+       /* Set by sequential resilver. */
+       if (phys_birth == TXG_UNKNOWN)
+               return (B_TRUE);
+
+       return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
+}
+
+/*
+ * Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
+ */
+boolean_t
+vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+    uint64_t phys_birth)
 {
        ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
@@ -2608,7 +3015,8 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
            vd->vdev_ops->vdev_op_leaf)
                return (B_TRUE);
 
-       return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
+       return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
+           phys_birth));
 }
 
 /*
@@ -2842,32 +3250,71 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
 
                if (txg != 0)
                        vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
-               return;
+       } else {
+               mutex_enter(&vd->vdev_dtl_lock);
+               for (int t = 0; t < DTL_TYPES; t++) {
+                       /* account for child's outage in parent's missing map */
+                       int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
+                       if (t == DTL_SCRUB) {
+                               /* leaf vdevs only */
+                               continue;
+                       }
+                       if (t == DTL_PARTIAL) {
+                               /* i.e. non-zero */
+                               minref = 1;
+                       } else if (vdev_get_nparity(vd) != 0) {
+                               /* RAIDZ, DRAID */
+                               minref = vdev_get_nparity(vd) + 1;
+                       } else {
+                               /* any kind of mirror */
+                               minref = vd->vdev_children;
+                       }
+                       space_reftree_create(&reftree);
+                       for (int c = 0; c < vd->vdev_children; c++) {
+                               vdev_t *cvd = vd->vdev_child[c];
+                               mutex_enter(&cvd->vdev_dtl_lock);
+                               space_reftree_add_map(&reftree,
+                                   cvd->vdev_dtl[s], 1);
+                               mutex_exit(&cvd->vdev_dtl_lock);
+                       }
+                       space_reftree_generate_map(&reftree,
+                           vd->vdev_dtl[t], minref);
+                       space_reftree_destroy(&reftree);
+               }
+               mutex_exit(&vd->vdev_dtl_lock);
        }
 
-       mutex_enter(&vd->vdev_dtl_lock);
-       for (int t = 0; t < DTL_TYPES; t++) {
-               /* account for child's outage in parent's missing map */
-               int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
-               if (t == DTL_SCRUB)
-                       continue;                       /* leaf vdevs only */
-               if (t == DTL_PARTIAL)
-                       minref = 1;                     /* i.e. non-zero */
-               else if (vd->vdev_nparity != 0)
-                       minref = vd->vdev_nparity + 1;  /* RAID-Z */
-               else
-                       minref = vd->vdev_children;     /* any kind of mirror */
-               space_reftree_create(&reftree);
-               for (int c = 0; c < vd->vdev_children; c++) {
-                       vdev_t *cvd = vd->vdev_child[c];
-                       mutex_enter(&cvd->vdev_dtl_lock);
-                       space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
-                       mutex_exit(&cvd->vdev_dtl_lock);
-               }
-               space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
-               space_reftree_destroy(&reftree);
+       if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) {
+               raidz_dtl_reassessed(vd);
        }
-       mutex_exit(&vd->vdev_dtl_lock);
+}
+
+/*
+ * Iterate over all the vdevs except spare, and post kobj events
+ */
+void
+vdev_post_kobj_evt(vdev_t *vd)
+{
+       if (vd->vdev_ops->vdev_op_kobj_evt_post &&
+           vd->vdev_kobj_flag == B_FALSE) {
+               vd->vdev_kobj_flag = B_TRUE;
+               vd->vdev_ops->vdev_op_kobj_evt_post(vd);
+       }
+
+       for (int c = 0; c < vd->vdev_children; c++)
+               vdev_post_kobj_evt(vd->vdev_child[c]);
+}
+
+/*
+ * Iterate over all the vdevs except spare, and clear kobj events
+ */
+void
+vdev_clear_kobj_evt(vdev_t *vd)
+{
+       vd->vdev_kobj_flag = B_FALSE;
+
+       for (int c = 0; c < vd->vdev_children; c++)
+               vdev_clear_kobj_evt(vd->vdev_child[c]);
 }
 
 int
@@ -2875,21 +3322,35 @@ vdev_dtl_load(vdev_t *vd)
 {
        spa_t *spa = vd->vdev_spa;
        objset_t *mos = spa->spa_meta_objset;
+       range_tree_t *rt;
        int error = 0;
 
        if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
                ASSERT(vdev_is_concrete(vd));
 
+               /*
+                * If the dtl cannot be sync'd there is no need to open it.
+                */
+               if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps)
+                       return (0);
+
                error = space_map_open(&vd->vdev_dtl_sm, mos,
                    vd->vdev_dtl_object, 0, -1ULL, 0);
                if (error)
                        return (error);
                ASSERT(vd->vdev_dtl_sm != NULL);
 
-               mutex_enter(&vd->vdev_dtl_lock);
-               error = space_map_load(vd->vdev_dtl_sm,
-                   vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
-               mutex_exit(&vd->vdev_dtl_lock);
+               rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+               error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC);
+               if (error == 0) {
+                       mutex_enter(&vd->vdev_dtl_lock);
+                       range_tree_walk(rt, range_tree_add,
+                           vd->vdev_dtl[DTL_MISSING]);
+                       mutex_exit(&vd->vdev_dtl_lock);
+               }
+
+               range_tree_vacate(rt, NULL, NULL);
+               range_tree_destroy(rt);
 
                return (error);
        }
@@ -2967,6 +3428,12 @@ vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
                                vdev_zap_allocation_data(vd, tx);
                }
        }
+       if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap == 0 &&
+           spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) {
+               if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2))
+                       spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx);
+               vd->vdev_root_zap = vdev_create_link_zap(vd, tx);
+       }
 
        for (uint64_t i = 0; i < vd->vdev_children; i++) {
                vdev_construct_zaps(vd->vdev_child[i], tx);
@@ -3152,20 +3619,54 @@ vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj)
 int
 vdev_load(vdev_t *vd)
 {
+       int children = vd->vdev_children;
        int error = 0;
+       taskq_t *tq = NULL;
+
+       /*
+        * It's only worthwhile to use the taskq for the root vdev, because the
+        * slow part is metaslab_init, and that only happens for top-level
+        * vdevs.
+        */
+       if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) {
+               tq = taskq_create("vdev_load", children, minclsyspri,
+                   children, children, TASKQ_PREPOPULATE);
+       }
 
        /*
         * Recursively load all children.
         */
        for (int c = 0; c < vd->vdev_children; c++) {
-               error = vdev_load(vd->vdev_child[c]);
-               if (error != 0) {
-                       return (error);
+               vdev_t *cvd = vd->vdev_child[c];
+
+               if (tq == NULL || vdev_uses_zvols(cvd)) {
+                       cvd->vdev_load_error = vdev_load(cvd);
+               } else {
+                       VERIFY(taskq_dispatch(tq, vdev_load_child,
+                           cvd, TQ_SLEEP) != TASKQID_INVALID);
                }
        }
 
+       if (tq != NULL) {
+               taskq_wait(tq);
+               taskq_destroy(tq);
+       }
+
+       for (int c = 0; c < vd->vdev_children; c++) {
+               int error = vd->vdev_child[c]->vdev_load_error;
+
+               if (error != 0)
+                       return (error);
+       }
+
        vdev_set_deflate_ratio(vd);
 
+       if (vd->vdev_ops == &vdev_raidz_ops) {
+               error = vdev_raidz_load(vd);
+               if (error != 0)
+                       return (error);
+       }
+
        /*
         * On spa_load path, grab the allocation bias from our zap
         */
@@ -3183,11 +3684,32 @@ vdev_load(vdev_t *vd)
                        vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
                            VDEV_AUX_CORRUPT_DATA);
                        vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) "
-                           "failed [error=%d]", vd->vdev_top_zap, error);
+                           "failed [error=%d]",
+                           (u_longlong_t)vd->vdev_top_zap, error);
                        return (error);
                }
        }
 
+       if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
+               spa_t *spa = vd->vdev_spa;
+               uint64_t failfast;
+
+               error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
+                   vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast),
+                   1, &failfast);
+               if (error == 0) {
+                       vd->vdev_failfast = failfast & 1;
+               } else if (error == ENOENT) {
+                       vd->vdev_failfast = vdev_prop_default_numeric(
+                           VDEV_PROP_FAILFAST);
+               } else {
+                       vdev_dbgmsg(vd,
+                           "vdev_load: zap_lookup(top_zap=%llu) "
+                           "failed [error=%d]",
+                           (u_longlong_t)vd->vdev_top_zap, error);
+               }
+       }
+
        /*
         * Load any rebuild state from the top-level vdev zap.
         */
@@ -3202,6 +3724,39 @@ vdev_load(vdev_t *vd)
                }
        }
 
+       if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) {
+               uint64_t zapobj;
+
+               if (vd->vdev_top_zap != 0)
+                       zapobj = vd->vdev_top_zap;
+               else
+                       zapobj = vd->vdev_leaf_zap;
+
+               error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N,
+                   &vd->vdev_checksum_n);
+               if (error && error != ENOENT)
+                       vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+                           "failed [error=%d]", (u_longlong_t)zapobj, error);
+
+               error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T,
+                   &vd->vdev_checksum_t);
+               if (error && error != ENOENT)
+                       vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+                           "failed [error=%d]", (u_longlong_t)zapobj, error);
+
+               error = vdev_prop_get_int(vd, VDEV_PROP_IO_N,
+                   &vd->vdev_io_n);
+               if (error && error != ENOENT)
+                       vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+                           "failed [error=%d]", (u_longlong_t)zapobj, error);
+
+               error = vdev_prop_get_int(vd, VDEV_PROP_IO_T,
+                   &vd->vdev_io_t);
+               if (error && error != ENOENT)
+                       vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+                           "failed [error=%d]", (u_longlong_t)zapobj, error);
+       }
+
        /*
         * If this is a top-level vdev, initialize its metaslabs.
         */
@@ -3424,8 +3979,11 @@ vdev_sync_done(vdev_t *vd, uint64_t txg)
            != NULL)
                metaslab_sync_done(msp, txg);
 
-       if (reassess)
+       if (reassess) {
                metaslab_sync_reassess(vd->vdev_mg);
+               if (vd->vdev_log_mg != NULL)
+                       metaslab_sync_reassess(vd->vdev_log_mg);
+       }
 }
 
 void
@@ -3486,10 +4044,22 @@ vdev_sync(vdev_t *vd, uint64_t txg)
        dmu_tx_commit(tx);
 }
 
+/*
+ * Return the amount of space that should be (or was) allocated for the given
+ * psize (compressed block size) in the given TXG. Note that for expanded
+ * RAIDZ vdevs, the size allocated for older BP's may be larger. See
+ * vdev_raidz_asize().
+ */
+uint64_t
+vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg)
+{
+       return (vd->vdev_ops->vdev_op_asize(vd, psize, txg));
+}
+
 uint64_t
 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
 {
-       return (vd->vdev_ops->vdev_op_asize(vd, psize));
+       return (vdev_psize_to_asize_txg(vd, psize, 0));
 }
 
 /*
@@ -3605,6 +4175,36 @@ vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
        return (spa_vdev_state_exit(spa, vd, 0));
 }
 
+int
+vdev_remove_wanted(spa_t *spa, uint64_t guid)
+{
+       vdev_t *vd;
+
+       spa_vdev_state_enter(spa, SCL_NONE);
+
+       if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+               return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
+
+       /*
+        * If the vdev is already removed, or expanding which can trigger
+        * repartition add/remove events, then don't do anything.
+        */
+       if (vd->vdev_removed || vd->vdev_expanding)
+               return (spa_vdev_state_exit(spa, NULL, 0));
+
+       /*
+        * Confirm the vdev has been removed, otherwise don't do anything.
+        */
+       if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL)))
+               return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST)));
+
+       vd->vdev_remove_wanted = B_TRUE;
+       spa_async_request(spa, SPA_ASYNC_REMOVE);
+
+       return (spa_vdev_state_exit(spa, vd, 0));
+}
+
+
 /*
  * Online the given vdev.
  *
@@ -3625,9 +4225,6 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
        if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
                return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
-       if (!vd->vdev_ops->vdev_op_leaf)
-               return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
-
        wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
        oldstate = vd->vdev_state;
 
@@ -3666,6 +4263,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
                /* XXX - L2ARC 1.0 does not support expansion */
                if (vd->vdev_aux)
                        return (spa_vdev_state_exit(spa, vd, ENOTSUP));
+               spa->spa_ccw_fail_time = 0;
                spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
        }
 
@@ -3695,9 +4293,19 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 
        if (wasoffline ||
            (oldstate < VDEV_STATE_DEGRADED &&
-           vd->vdev_state >= VDEV_STATE_DEGRADED))
+           vd->vdev_state >= VDEV_STATE_DEGRADED)) {
                spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
 
+               /*
+                * Asynchronously detach spare vdev if resilver or
+                * rebuild is not required
+                */
+               if (vd->vdev_unspare &&
+                   !dsl_scan_resilvering(spa->spa_dsl_pool) &&
+                   !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) &&
+                   !vdev_rebuild_active(tvd))
+                       spa_async_request(spa, SPA_ASYNC_DETACH_SPARE);
+       }
        return (spa_vdev_state_exit(spa, vd, 0));
 }
 
@@ -3718,6 +4326,9 @@ top:
        if (!vd->vdev_ops->vdev_op_leaf)
                return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
+       if (vd->vdev_ops == &vdev_draid_spare_ops)
+               return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+
        tvd = vd->vdev_top;
        mg = tvd->vdev_mg;
        generation = spa->spa_config_generation + 1;
@@ -3746,6 +4357,7 @@ top:
                        /*
                         * Prevent any future allocations.
                         */
+                       ASSERT3P(tvd->vdev_log_mg, ==, NULL);
                        metaslab_group_passivate(mg);
                        (void) spa_vdev_state_exit(spa, vd, 0);
 
@@ -3844,9 +4456,9 @@ vdev_clear(spa_t *spa, vdev_t *vd)
                vdev_clear(spa, vd->vdev_child[c]);
 
        /*
-        * It makes no sense to "clear" an indirect vdev.
+        * It makes no sense to "clear" an indirect  or removed vdev.
         */
-       if (!vdev_is_concrete(vd))
+       if (!vdev_is_concrete(vd) || vd->vdev_removed)
                return;
 
        /*
@@ -3894,6 +4506,9 @@ vdev_clear(spa_t *spa, vdev_t *vd)
            vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
            vd->vdev_parent->vdev_child[0] == vd)
                vd->vdev_unspare = B_TRUE;
+
+       /* Clear recent error events cache (i.e. duplicate events tracking) */
+       zfs_ereport_clear(spa, vd);
 }
 
 boolean_t
@@ -3962,6 +4577,13 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
 static void
 vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
 {
+       /*
+        * Exclude the dRAID spare when aggregating to avoid double counting
+        * the ops and bytes.  These IOs are counted by the physical leaves.
+        */
+       if (cvd->vdev_ops == &vdev_draid_spare_ops)
+               return;
+
        for (int t = 0; t < VS_ZIO_TYPES; t++) {
                vs->vs_ops[t] += cvs->vs_ops[t];
                vs->vs_bytes[t] += cvs->vs_bytes[t];
@@ -3976,6 +4598,8 @@ vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
 static void
 vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx)
 {
+       (void) cvd;
+
        int t, b;
        for (t = 0; t < ZIO_TYPES; t++) {
                for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++)
@@ -4054,7 +4678,6 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
                                vdev_get_child_stat(cvd, vs, cvs);
                        if (vsx)
                                vdev_get_child_stat_ex(cvd, vsx, cvsx);
-
                }
        } else {
                /*
@@ -4066,11 +4689,9 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 
                memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
 
-               for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
-                       vsx->vsx_active_queue[t] =
-                           vd->vdev_queue.vq_class[t].vqc_active;
-                       vsx->vsx_pend_queue[t] = avl_numnodes(
-                           &vd->vdev_queue.vq_class[t].vqc_queued_tree);
+               for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
+                       vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
+                       vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
                }
        }
 }
@@ -4081,12 +4702,13 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
        vdev_t *tvd = vd->vdev_top;
        mutex_enter(&vd->vdev_stat_lock);
        if (vs) {
-               bcopy(&vd->vdev_stat, vs, sizeof (*vs));
+               memcpy(vs, &vd->vdev_stat, sizeof (*vs));
                vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
                vs->vs_state = vd->vdev_state;
                vs->vs_rsize = vdev_get_min_asize(vd);
 
                if (vd->vdev_ops->vdev_op_leaf) {
+                       vs->vs_pspace = vd->vdev_psize;
                        vs->vs_rsize += VDEV_LABEL_START_SIZE +
                            VDEV_LABEL_END_SIZE;
                        /*
@@ -4132,7 +4754,10 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
                vs->vs_configured_ashift = vd->vdev_top != NULL
                    ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
                vs->vs_logical_ashift = vd->vdev_logical_ashift;
-               vs->vs_physical_ashift = vd->vdev_physical_ashift;
+               if (vd->vdev_physical_ashift <= ASHIFT_MAX)
+                       vs->vs_physical_ashift = vd->vdev_physical_ashift;
+               else
+                       vs->vs_physical_ashift = 0;
 
                /*
                 * Report fragmentation and rebuild progress for top-level,
@@ -4140,9 +4765,17 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
                 */
                if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
                    vdev_is_concrete(vd)) {
+                       /*
+                        * The vdev fragmentation rating doesn't take into
+                        * account the embedded slog metaslab (vdev_log_mg).
+                        * Since it's only one metaslab, it would have a tiny
+                        * impact on the overall fragmentation.
+                        */
                        vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
                            vd->vdev_mg->mg_fragmentation : 0;
                }
+               vs->vs_noalloc = MAX(vd->vdev_noalloc,
+                   tvd ? tvd->vdev_noalloc : 0);
        }
 
        vdev_get_stats_ex_impl(vd, vs, vsx);
@@ -4186,8 +4819,14 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
        vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
        vdev_t *pvd;
        uint64_t txg = zio->io_txg;
+/* Suppress ASAN false positive */
+#ifdef __SANITIZE_ADDRESS__
+       vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL;
+       vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL;
+#else
        vdev_stat_t *vs = &vd->vdev_stat;
        vdev_stat_ex_t *vsx = &vd->vdev_stat_ex;
+#endif
        zio_type_t type = zio->io_type;
        int flags = zio->io_flags;
 
@@ -4239,7 +4878,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
 
                        /*
                         * Repair is the result of a rebuild issued by the
-                        * rebuild thread (vdev_rebuild_thread).
+                        * rebuild thread (vdev_rebuild_thread).  To avoid
+                        * double counting repaired bytes the virtual dRAID
+                        * spare vdev is excluded from the processed bytes.
                         */
                        if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
                                vdev_t *tvd = vd->vdev_top;
@@ -4247,8 +4888,10 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
                                vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
                                uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
 
-                               if (vd->vdev_ops->vdev_op_leaf)
+                               if (vd->vdev_ops->vdev_op_leaf &&
+                                   vd->vdev_ops != &vdev_draid_spare_ops) {
                                        atomic_add_64(rebuilt, psize);
+                               }
                                vs->vs_rebuild_processed += psize;
                        }
 
@@ -4275,7 +4918,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
 
                        /*
                         * Solely for the purposes of 'zpool iostat -lqrw'
-                        * reporting use the priority to catagorize the IO.
+                        * reporting use the priority to categorize the IO.
                         * Only the following are reported to user space:
                         *
                         *   ZIO_PRIORITY_SYNC_READ,
@@ -4283,13 +4926,10 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
                         *   ZIO_PRIORITY_ASYNC_READ,
                         *   ZIO_PRIORITY_ASYNC_WRITE,
                         *   ZIO_PRIORITY_SCRUB,
-                        *   ZIO_PRIORITY_TRIM.
+                        *   ZIO_PRIORITY_TRIM,
+                        *   ZIO_PRIORITY_REBUILD.
                         */
-                       if (priority == ZIO_PRIORITY_REBUILD) {
-                               priority = ((type == ZIO_TYPE_WRITE) ?
-                                   ZIO_PRIORITY_ASYNC_WRITE :
-                                   ZIO_PRIORITY_SCRUB);
-                       } else if (priority == ZIO_PRIORITY_INITIALIZING) {
+                       if (priority == ZIO_PRIORITY_INITIALIZING) {
                                ASSERT3U(type, ==, ZIO_TYPE_WRITE);
                                priority = ZIO_PRIORITY_ASYNC_WRITE;
                        } else if (priority == ZIO_PRIORITY_REMOVAL) {
@@ -4344,8 +4984,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
        if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
                return;
 
-       if (spa->spa_load_state == SPA_LOAD_NONE &&
-           type == ZIO_TYPE_WRITE && txg != 0 &&
+       if (type == ZIO_TYPE_WRITE && txg != 0 &&
            (!(flags & ZIO_FLAG_IO_REPAIR) ||
            (flags & ZIO_FLAG_SCAN_THREAD) ||
            spa->spa_claiming)) {
@@ -4404,6 +5043,7 @@ void
 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
     int64_t space_delta)
 {
+       (void) defer_delta;
        int64_t dspace_delta;
        spa_t *spa = vd->vdev_spa;
        vdev_t *rvd = spa->spa_root_vdev;
@@ -4759,7 +5399,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
                        }
 
                        (void) zfs_ereport_post(class, spa, vd, NULL, NULL,
-                           save_state, 0);
+                           save_state);
                }
 
                /* Erase any notion of persistent removed state */
@@ -4811,10 +5451,8 @@ vdev_is_bootable(vdev_t *vd)
        if (!vd->vdev_ops->vdev_op_leaf) {
                const char *vdev_type = vd->vdev_ops->vdev_op_type;
 
-               if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 ||
-                   strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) {
+               if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0)
                        return (B_FALSE);
-               }
        }
 
        for (int c = 0; c < vd->vdev_children; c++) {
@@ -4867,7 +5505,9 @@ vdev_expand(vdev_t *vd, uint64_t txg)
 
        vdev_set_deflate_ratio(vd);
 
-       if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
+       if ((vd->vdev_spa->spa_raidz_expand == NULL ||
+           vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) &&
+           (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
            vdev_is_concrete(vd)) {
                vdev_metaslab_group_create(vd);
                VERIFY(vdev_metaslab_init(vd, txg) == 0);
@@ -4883,9 +5523,13 @@ vdev_split(vdev_t *vd)
 {
        vdev_t *cvd, *pvd = vd->vdev_parent;
 
+       VERIFY3U(pvd->vdev_children, >, 1);
+
        vdev_remove_child(pvd, vd);
        vdev_compact_children(pvd);
 
+       ASSERT3P(pvd->vdev_child, !=, NULL);
+
        cvd = pvd->vdev_child[0];
        if (pvd->vdev_children == 1) {
                vdev_remove_parent(cvd);
@@ -4895,7 +5539,7 @@ vdev_split(vdev_t *vd)
 }
 
 void
-vdev_deadman(vdev_t *vd, char *tag)
+vdev_deadman(vdev_t *vd, const char *tag)
 {
        for (int c = 0; c < vd->vdev_children; c++) {
                vdev_t *cvd = vd->vdev_child[c];
@@ -4907,20 +5551,20 @@ vdev_deadman(vdev_t *vd, char *tag)
                vdev_queue_t *vq = &vd->vdev_queue;
 
                mutex_enter(&vq->vq_lock);
-               if (avl_numnodes(&vq->vq_active_tree) > 0) {
+               if (vq->vq_active > 0) {
                        spa_t *spa = vd->vdev_spa;
                        zio_t *fio;
                        uint64_t delta;
 
-                       zfs_dbgmsg("slow vdev: %s has %d active IOs",
-                           vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
+                       zfs_dbgmsg("slow vdev: %s has %u active IOs",
+                           vd->vdev_path, vq->vq_active);
 
                        /*
                         * Look at the head of all the pending queues,
                         * if any I/O has been outstanding for longer than
                         * the spa_deadman_synctime invoke the deadman logic.
                         */
-                       fio = avl_first(&vq->vq_active_tree);
+                       fio = list_head(&vq->vq_active_list);
                        delta = gethrtime() - fio->io_timestamp;
                        if (delta > spa_deadman_synctime(spa))
                                zio_deadman(fio, tag);
@@ -4972,31 +5616,42 @@ vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
            vdev_resilver_needed(vd, NULL, NULL));
 }
 
+boolean_t
+vdev_xlate_is_empty(range_seg64_t *rs)
+{
+       return (rs->rs_start == rs->rs_end);
+}
+
 /*
- * Translate a logical range to the physical range for the specified vdev_t.
- * This function is initially called with a leaf vdev and will walk each
- * parent vdev until it reaches a top-level vdev. Once the top-level is
- * reached the physical range is initialized and the recursive function
- * begins to unwind. As it unwinds it calls the parent's vdev specific
- * translation function to do the real conversion.
+ * Translate a logical range to the first contiguous physical range for the
+ * specified vdev_t.  This function is initially called with a leaf vdev and
+ * will walk each parent vdev until it reaches a top-level vdev. Once the
+ * top-level is reached the physical range is initialized and the recursive
+ * function begins to unwind. As it unwinds it calls the parent's vdev
+ * specific translation function to do the real conversion.
  */
 void
 vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
-    range_seg64_t *physical_rs)
+    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
 {
        /*
         * Walk up the vdev tree
         */
        if (vd != vd->vdev_top) {
-               vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
+               vdev_xlate(vd->vdev_parent, logical_rs, physical_rs,
+                   remain_rs);
        } else {
                /*
-                * We've reached the top-level vdev, initialize the
-                * physical range to the logical range and start to
-                * unwind.
+                * We've reached the top-level vdev, initialize the physical
+                * range to the logical range and set an empty remaining
+                * range then start to unwind.
                 */
                physical_rs->rs_start = logical_rs->rs_start;
                physical_rs->rs_end = logical_rs->rs_end;
+
+               remain_rs->rs_start = logical_rs->rs_start;
+               remain_rs->rs_end = logical_rs->rs_start;
+
                return;
        }
 
@@ -5006,16 +5661,57 @@ vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
 
        /*
         * As this recursive function unwinds, translate the logical
-        * range into its physical components by calling the
-        * vdev specific translate function.
+        * range into its physical and any remaining components by calling
+        * the vdev specific translate function.
         */
        range_seg64_t intermediate = { 0 };
-       pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
+       pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs);
 
        physical_rs->rs_start = intermediate.rs_start;
        physical_rs->rs_end = intermediate.rs_end;
 }
 
+void
+vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
+    vdev_xlate_func_t *func, void *arg)
+{
+       range_seg64_t iter_rs = *logical_rs;
+       range_seg64_t physical_rs;
+       range_seg64_t remain_rs;
+
+       while (!vdev_xlate_is_empty(&iter_rs)) {
+
+               vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs);
+
+               /*
+                * With raidz and dRAID, it's possible that the logical range
+                * does not live on this leaf vdev. Only when there is a non-
+                * zero physical size call the provided function.
+                */
+               if (!vdev_xlate_is_empty(&physical_rs))
+                       func(arg, &physical_rs);
+
+               iter_rs = remain_rs;
+       }
+}
+
+static char *
+vdev_name(vdev_t *vd, char *buf, int buflen)
+{
+       if (vd->vdev_path == NULL) {
+               if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) {
+                       strlcpy(buf, vd->vdev_spa->spa_name, buflen);
+               } else if (!vd->vdev_ops->vdev_op_leaf) {
+                       snprintf(buf, buflen, "%s-%llu",
+                           vd->vdev_ops->vdev_op_type,
+                           (u_longlong_t)vd->vdev_id);
+               }
+       } else {
+               strlcpy(buf, vd->vdev_path, buflen);
+       }
+       return (buf);
+}
+
 /*
  * Look at the vdev tree and determine whether any devices are currently being
  * replaced.
@@ -5045,31 +5741,711 @@ vdev_replace_in_progress(vdev_t *vdev)
        return (B_FALSE);
 }
 
+/*
+ * Add a (source=src, propname=propval) list to an nvlist.
+ */
+static void
+vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval,
+    uint64_t intval, zprop_source_t src)
+{
+       nvlist_t *propval;
+
+       propval = fnvlist_alloc();
+       fnvlist_add_uint64(propval, ZPROP_SOURCE, src);
+
+       if (strval != NULL)
+               fnvlist_add_string(propval, ZPROP_VALUE, strval);
+       else
+               fnvlist_add_uint64(propval, ZPROP_VALUE, intval);
+
+       fnvlist_add_nvlist(nvl, propname, propval);
+       nvlist_free(propval);
+}
+
+static void
+vdev_props_set_sync(void *arg, dmu_tx_t *tx)
+{
+       vdev_t *vd;
+       nvlist_t *nvp = arg;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+       objset_t *mos = spa->spa_meta_objset;
+       nvpair_t *elem = NULL;
+       uint64_t vdev_guid;
+       uint64_t objid;
+       nvlist_t *nvprops;
+
+       vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV);
+       nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS);
+       vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
+
+       /* this vdev could get removed while waiting for this sync task */
+       if (vd == NULL)
+               return;
+
+       /*
+        * Set vdev property values in the vdev props mos object.
+        */
+       if (vd->vdev_root_zap != 0) {
+               objid = vd->vdev_root_zap;
+       } else if (vd->vdev_top_zap != 0) {
+               objid = vd->vdev_top_zap;
+       } else if (vd->vdev_leaf_zap != 0) {
+               objid = vd->vdev_leaf_zap;
+       } else {
+               panic("unexpected vdev type");
+       }
+
+       mutex_enter(&spa->spa_props_lock);
+
+       while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
+               uint64_t intval;
+               const char *strval;
+               vdev_prop_t prop;
+               const char *propname = nvpair_name(elem);
+               zprop_type_t proptype;
+
+               switch (prop = vdev_name_to_prop(propname)) {
+               case VDEV_PROP_USERPROP:
+                       if (vdev_prop_user(propname)) {
+                               strval = fnvpair_value_string(elem);
+                               if (strlen(strval) == 0) {
+                                       /* remove the property if value == "" */
+                                       (void) zap_remove(mos, objid, propname,
+                                           tx);
+                               } else {
+                                       VERIFY0(zap_update(mos, objid, propname,
+                                           1, strlen(strval) + 1, strval, tx));
+                               }
+                               spa_history_log_internal(spa, "vdev set", tx,
+                                   "vdev_guid=%llu: %s=%s",
+                                   (u_longlong_t)vdev_guid, nvpair_name(elem),
+                                   strval);
+                       }
+                       break;
+               default:
+                       /* normalize the property name */
+                       propname = vdev_prop_to_name(prop);
+                       proptype = vdev_prop_get_type(prop);
+
+                       if (nvpair_type(elem) == DATA_TYPE_STRING) {
+                               ASSERT(proptype == PROP_TYPE_STRING);
+                               strval = fnvpair_value_string(elem);
+                               VERIFY0(zap_update(mos, objid, propname,
+                                   1, strlen(strval) + 1, strval, tx));
+                               spa_history_log_internal(spa, "vdev set", tx,
+                                   "vdev_guid=%llu: %s=%s",
+                                   (u_longlong_t)vdev_guid, nvpair_name(elem),
+                                   strval);
+                       } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
+                               intval = fnvpair_value_uint64(elem);
+
+                               if (proptype == PROP_TYPE_INDEX) {
+                                       const char *unused;
+                                       VERIFY0(vdev_prop_index_to_string(
+                                           prop, intval, &unused));
+                               }
+                               VERIFY0(zap_update(mos, objid, propname,
+                                   sizeof (uint64_t), 1, &intval, tx));
+                               spa_history_log_internal(spa, "vdev set", tx,
+                                   "vdev_guid=%llu: %s=%lld",
+                                   (u_longlong_t)vdev_guid,
+                                   nvpair_name(elem), (longlong_t)intval);
+                       } else {
+                               panic("invalid vdev property type %u",
+                                   nvpair_type(elem));
+                       }
+               }
+
+       }
+
+       mutex_exit(&spa->spa_props_lock);
+}
+
+int
+vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
+{
+       spa_t *spa = vd->vdev_spa;
+       nvpair_t *elem = NULL;
+       uint64_t vdev_guid;
+       nvlist_t *nvprops;
+       int error = 0;
+
+       ASSERT(vd != NULL);
+
+       /* Check that vdev has a zap we can use */
+       if (vd->vdev_root_zap == 0 &&
+           vd->vdev_top_zap == 0 &&
+           vd->vdev_leaf_zap == 0)
+               return (SET_ERROR(EINVAL));
+
+       if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV,
+           &vdev_guid) != 0)
+               return (SET_ERROR(EINVAL));
+
+       if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS,
+           &nvprops) != 0)
+               return (SET_ERROR(EINVAL));
+
+       if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL)
+               return (SET_ERROR(EINVAL));
+
+       while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
+               const char *propname = nvpair_name(elem);
+               vdev_prop_t prop = vdev_name_to_prop(propname);
+               uint64_t intval = 0;
+               const char *strval = NULL;
+
+               if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) {
+                       error = EINVAL;
+                       goto end;
+               }
+
+               if (vdev_prop_readonly(prop)) {
+                       error = EROFS;
+                       goto end;
+               }
+
+               /* Special Processing */
+               switch (prop) {
+               case VDEV_PROP_PATH:
+                       if (vd->vdev_path == NULL) {
+                               error = EROFS;
+                               break;
+                       }
+                       if (nvpair_value_string(elem, &strval) != 0) {
+                               error = EINVAL;
+                               break;
+                       }
+                       /* New path must start with /dev/ */
+                       if (strncmp(strval, "/dev/", 5)) {
+                               error = EINVAL;
+                               break;
+                       }
+                       error = spa_vdev_setpath(spa, vdev_guid, strval);
+                       break;
+               case VDEV_PROP_ALLOCATING:
+                       if (nvpair_value_uint64(elem, &intval) != 0) {
+                               error = EINVAL;
+                               break;
+                       }
+                       if (intval != vd->vdev_noalloc)
+                               break;
+                       if (intval == 0)
+                               error = spa_vdev_noalloc(spa, vdev_guid);
+                       else
+                               error = spa_vdev_alloc(spa, vdev_guid);
+                       break;
+               case VDEV_PROP_FAILFAST:
+                       if (nvpair_value_uint64(elem, &intval) != 0) {
+                               error = EINVAL;
+                               break;
+                       }
+                       vd->vdev_failfast = intval & 1;
+                       break;
+               case VDEV_PROP_CHECKSUM_N:
+                       if (nvpair_value_uint64(elem, &intval) != 0) {
+                               error = EINVAL;
+                               break;
+                       }
+                       vd->vdev_checksum_n = intval;
+                       break;
+               case VDEV_PROP_CHECKSUM_T:
+                       if (nvpair_value_uint64(elem, &intval) != 0) {
+                               error = EINVAL;
+                               break;
+                       }
+                       vd->vdev_checksum_t = intval;
+                       break;
+               case VDEV_PROP_IO_N:
+                       if (nvpair_value_uint64(elem, &intval) != 0) {
+                               error = EINVAL;
+                               break;
+                       }
+                       vd->vdev_io_n = intval;
+                       break;
+               case VDEV_PROP_IO_T:
+                       if (nvpair_value_uint64(elem, &intval) != 0) {
+                               error = EINVAL;
+                               break;
+                       }
+                       vd->vdev_io_t = intval;
+                       break;
+               default:
+                       /* Most processing is done in vdev_props_set_sync */
+                       break;
+               }
+end:
+               if (error != 0) {
+                       intval = error;
+                       vdev_prop_add_list(outnvl, propname, strval, intval, 0);
+                       return (error);
+               }
+       }
+
+       return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync,
+           innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+int
+vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
+{
+       spa_t *spa = vd->vdev_spa;
+       objset_t *mos = spa->spa_meta_objset;
+       int err = 0;
+       uint64_t objid;
+       uint64_t vdev_guid;
+       nvpair_t *elem = NULL;
+       nvlist_t *nvprops = NULL;
+       uint64_t intval = 0;
+       char *strval = NULL;
+       const char *propname = NULL;
+       vdev_prop_t prop;
+
+       ASSERT(vd != NULL);
+       ASSERT(mos != NULL);
+
+       if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV,
+           &vdev_guid) != 0)
+               return (SET_ERROR(EINVAL));
+
+       nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops);
+
+       if (vd->vdev_root_zap != 0) {
+               objid = vd->vdev_root_zap;
+       } else if (vd->vdev_top_zap != 0) {
+               objid = vd->vdev_top_zap;
+       } else if (vd->vdev_leaf_zap != 0) {
+               objid = vd->vdev_leaf_zap;
+       } else {
+               return (SET_ERROR(EINVAL));
+       }
+       ASSERT(objid != 0);
+
+       mutex_enter(&spa->spa_props_lock);
+
+       if (nvprops != NULL) {
+               char namebuf[64] = { 0 };
+
+               while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
+                       intval = 0;
+                       strval = NULL;
+                       propname = nvpair_name(elem);
+                       prop = vdev_name_to_prop(propname);
+                       zprop_source_t src = ZPROP_SRC_DEFAULT;
+                       uint64_t integer_size, num_integers;
+
+                       switch (prop) {
+                       /* Special Read-only Properties */
+                       case VDEV_PROP_NAME:
+                               strval = vdev_name(vd, namebuf,
+                                   sizeof (namebuf));
+                               if (strval == NULL)
+                                       continue;
+                               vdev_prop_add_list(outnvl, propname, strval, 0,
+                                   ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_CAPACITY:
+                               /* percent used */
+                               intval = (vd->vdev_stat.vs_dspace == 0) ? 0 :
+                                   (vd->vdev_stat.vs_alloc * 100 /
+                                   vd->vdev_stat.vs_dspace);
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   intval, ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_STATE:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_state, ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_GUID:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_guid, ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_ASIZE:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_asize, ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_PSIZE:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_psize, ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_ASHIFT:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_ashift, ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_SIZE:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_FREE:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_dspace -
+                                   vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_ALLOCATED:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_EXPANDSZ:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_esize, ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_FRAGMENTATION:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_fragmentation,
+                                   ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_PARITY:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vdev_get_nparity(vd), ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_PATH:
+                               if (vd->vdev_path == NULL)
+                                       continue;
+                               vdev_prop_add_list(outnvl, propname,
+                                   vd->vdev_path, 0, ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_DEVID:
+                               if (vd->vdev_devid == NULL)
+                                       continue;
+                               vdev_prop_add_list(outnvl, propname,
+                                   vd->vdev_devid, 0, ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_PHYS_PATH:
+                               if (vd->vdev_physpath == NULL)
+                                       continue;
+                               vdev_prop_add_list(outnvl, propname,
+                                   vd->vdev_physpath, 0, ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_ENC_PATH:
+                               if (vd->vdev_enc_sysfs_path == NULL)
+                                       continue;
+                               vdev_prop_add_list(outnvl, propname,
+                                   vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_FRU:
+                               if (vd->vdev_fru == NULL)
+                                       continue;
+                               vdev_prop_add_list(outnvl, propname,
+                                   vd->vdev_fru, 0, ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_PARENT:
+                               if (vd->vdev_parent != NULL) {
+                                       strval = vdev_name(vd->vdev_parent,
+                                           namebuf, sizeof (namebuf));
+                                       vdev_prop_add_list(outnvl, propname,
+                                           strval, 0, ZPROP_SRC_NONE);
+                               }
+                               continue;
+                       case VDEV_PROP_CHILDREN:
+                               if (vd->vdev_children > 0)
+                                       strval = kmem_zalloc(ZAP_MAXVALUELEN,
+                                           KM_SLEEP);
+                               for (uint64_t i = 0; i < vd->vdev_children;
+                                   i++) {
+                                       const char *vname;
+
+                                       vname = vdev_name(vd->vdev_child[i],
+                                           namebuf, sizeof (namebuf));
+                                       if (vname == NULL)
+                                               vname = "(unknown)";
+                                       if (strlen(strval) > 0)
+                                               strlcat(strval, ",",
+                                                   ZAP_MAXVALUELEN);
+                                       strlcat(strval, vname, ZAP_MAXVALUELEN);
+                               }
+                               if (strval != NULL) {
+                                       vdev_prop_add_list(outnvl, propname,
+                                           strval, 0, ZPROP_SRC_NONE);
+                                       kmem_free(strval, ZAP_MAXVALUELEN);
+                               }
+                               continue;
+                       case VDEV_PROP_NUMCHILDREN:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_children, ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_READ_ERRORS:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_read_errors,
+                                   ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_WRITE_ERRORS:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_write_errors,
+                                   ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_CHECKSUM_ERRORS:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_checksum_errors,
+                                   ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_INITIALIZE_ERRORS:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_initialize_errors,
+                                   ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_OPS_NULL:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_ops[ZIO_TYPE_NULL],
+                                   ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_OPS_READ:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_ops[ZIO_TYPE_READ],
+                                   ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_OPS_WRITE:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE],
+                                   ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_OPS_FREE:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_ops[ZIO_TYPE_FREE],
+                                   ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_OPS_CLAIM:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM],
+                                   ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_OPS_TRIM:
+                               /*
+                                * TRIM ops and bytes are reported to user
+                                * space as ZIO_TYPE_IOCTL.  This is done to
+                                * preserve the vdev_stat_t structure layout
+                                * for user space.
+                                */
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_ops[ZIO_TYPE_IOCTL],
+                                   ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_BYTES_NULL:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL],
+                                   ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_BYTES_READ:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_bytes[ZIO_TYPE_READ],
+                                   ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_BYTES_WRITE:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE],
+                                   ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_BYTES_FREE:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE],
+                                   ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_BYTES_CLAIM:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM],
+                                   ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_BYTES_TRIM:
+                               /*
+                                * TRIM ops and bytes are reported to user
+                                * space as ZIO_TYPE_IOCTL.  This is done to
+                                * preserve the vdev_stat_t structure layout
+                                * for user space.
+                                */
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_stat.vs_bytes[ZIO_TYPE_IOCTL],
+                                   ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_REMOVING:
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   vd->vdev_removing, ZPROP_SRC_NONE);
+                               continue;
+                       case VDEV_PROP_RAIDZ_EXPANDING:
+                               /* Only expose this for raidz */
+                               if (vd->vdev_ops == &vdev_raidz_ops) {
+                                       vdev_prop_add_list(outnvl, propname,
+                                           NULL, vd->vdev_rz_expanding,
+                                           ZPROP_SRC_NONE);
+                               }
+                               continue;
+                       /* Numeric Properites */
+                       case VDEV_PROP_ALLOCATING:
+                               /* Leaf vdevs cannot have this property */
+                               if (vd->vdev_mg == NULL &&
+                                   vd->vdev_top != NULL) {
+                                       src = ZPROP_SRC_NONE;
+                                       intval = ZPROP_BOOLEAN_NA;
+                               } else {
+                                       err = vdev_prop_get_int(vd, prop,
+                                           &intval);
+                                       if (err && err != ENOENT)
+                                               break;
+
+                                       if (intval ==
+                                           vdev_prop_default_numeric(prop))
+                                               src = ZPROP_SRC_DEFAULT;
+                                       else
+                                               src = ZPROP_SRC_LOCAL;
+                               }
+
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   intval, src);
+                               break;
+                       case VDEV_PROP_FAILFAST:
+                               src = ZPROP_SRC_LOCAL;
+                               strval = NULL;
+
+                               err = zap_lookup(mos, objid, nvpair_name(elem),
+                                   sizeof (uint64_t), 1, &intval);
+                               if (err == ENOENT) {
+                                       intval = vdev_prop_default_numeric(
+                                           prop);
+                                       err = 0;
+                               } else if (err) {
+                                       break;
+                               }
+                               if (intval == vdev_prop_default_numeric(prop))
+                                       src = ZPROP_SRC_DEFAULT;
+
+                               vdev_prop_add_list(outnvl, propname, strval,
+                                   intval, src);
+                               break;
+                       case VDEV_PROP_CHECKSUM_N:
+                       case VDEV_PROP_CHECKSUM_T:
+                       case VDEV_PROP_IO_N:
+                       case VDEV_PROP_IO_T:
+                               err = vdev_prop_get_int(vd, prop, &intval);
+                               if (err && err != ENOENT)
+                                       break;
+
+                               if (intval == vdev_prop_default_numeric(prop))
+                                       src = ZPROP_SRC_DEFAULT;
+                               else
+                                       src = ZPROP_SRC_LOCAL;
+
+                               vdev_prop_add_list(outnvl, propname, NULL,
+                                   intval, src);
+                               break;
+                       /* Text Properties */
+                       case VDEV_PROP_COMMENT:
+                               /* Exists in the ZAP below */
+                               /* FALLTHRU */
+                       case VDEV_PROP_USERPROP:
+                               /* User Properites */
+                               src = ZPROP_SRC_LOCAL;
+
+                               err = zap_length(mos, objid, nvpair_name(elem),
+                                   &integer_size, &num_integers);
+                               if (err)
+                                       break;
+
+                               switch (integer_size) {
+                               case 8:
+                                       /* User properties cannot be integers */
+                                       err = EINVAL;
+                                       break;
+                               case 1:
+                                       /* string property */
+                                       strval = kmem_alloc(num_integers,
+                                           KM_SLEEP);
+                                       err = zap_lookup(mos, objid,
+                                           nvpair_name(elem), 1,
+                                           num_integers, strval);
+                                       if (err) {
+                                               kmem_free(strval,
+                                                   num_integers);
+                                               break;
+                                       }
+                                       vdev_prop_add_list(outnvl, propname,
+                                           strval, 0, src);
+                                       kmem_free(strval, num_integers);
+                                       break;
+                               }
+                               break;
+                       default:
+                               err = ENOENT;
+                               break;
+                       }
+                       if (err)
+                               break;
+               }
+       } else {
+               /*
+                * Get all properties from the MOS vdev property object.
+                */
+               zap_cursor_t zc;
+               zap_attribute_t za;
+               for (zap_cursor_init(&zc, mos, objid);
+                   (err = zap_cursor_retrieve(&zc, &za)) == 0;
+                   zap_cursor_advance(&zc)) {
+                       intval = 0;
+                       strval = NULL;
+                       zprop_source_t src = ZPROP_SRC_DEFAULT;
+                       propname = za.za_name;
+
+                       switch (za.za_integer_length) {
+                       case 8:
+                               /* We do not allow integer user properties */
+                               /* This is likely an internal value */
+                               break;
+                       case 1:
+                               /* string property */
+                               strval = kmem_alloc(za.za_num_integers,
+                                   KM_SLEEP);
+                               err = zap_lookup(mos, objid, za.za_name, 1,
+                                   za.za_num_integers, strval);
+                               if (err) {
+                                       kmem_free(strval, za.za_num_integers);
+                                       break;
+                               }
+                               vdev_prop_add_list(outnvl, propname, strval, 0,
+                                   src);
+                               kmem_free(strval, za.za_num_integers);
+                               break;
+
+                       default:
+                               break;
+                       }
+               }
+               zap_cursor_fini(&zc);
+       }
+
+       mutex_exit(&spa->spa_props_lock);
+       if (err && err != ENOENT) {
+               return (err);
+       }
+
+       return (0);
+}
+
 EXPORT_SYMBOL(vdev_fault);
 EXPORT_SYMBOL(vdev_degrade);
 EXPORT_SYMBOL(vdev_online);
 EXPORT_SYMBOL(vdev_offline);
 EXPORT_SYMBOL(vdev_clear);
 
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW,
        "Target number of metaslabs per top-level vdev");
 
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, INT, ZMOD_RW,
-       "Default limit for metaslab size");
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW,
+       "Default lower limit for metaslab size");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW,
+       "Default upper limit for metaslab size");
 
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW,
        "Minimum number of metaslabs per top-level vdev");
 
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW,
        "Practical upper limit of total metaslabs per top-level vdev");
 
 ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
        "Rate limit slow IO (delay) events to this many per second");
 
+/* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
        "Rate limit checksum events to this many checksum errors per second "
-       "(do not set below zed threshold).");
+       "(do not set below ZED threshold).");
+/* END CSTYLED */
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW,
        "Ignore errors during resilver/scrub");
@@ -5080,12 +6456,16 @@ ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
        "Disable cache flushes");
 
+ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW,
+       "Minimum number of metaslabs required to dedicate one for log blocks");
+
+/* BEGIN CSTYLED */
 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
-       param_set_min_auto_ashift, param_get_ulong, ZMOD_RW,
+       param_set_min_auto_ashift, param_get_uint, ZMOD_RW,
        "Minimum ashift used when creating new top-level vdevs");
 
 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift,
-       param_set_max_auto_ashift, param_get_ulong, ZMOD_RW,
+       param_set_max_auto_ashift, param_get_uint, ZMOD_RW,
        "Maximum ashift used when optimizing for logical -> physical sector "
        "size on new top-level vdevs");
 /* END CSTYLED */