Extend import_progress kstat with a notes field

diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index c2a67fbc7c55476c518efa35cf0f1f53421e6023..2ca5e7bac1a4ea5478590368a56fbe02c9aa7b27 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -33,6 +33,7 @@
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
+ * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
  */
 
 /*
@@ -62,6 +63,7 @@
 #include <sys/vdev_rebuild.h>
 #include <sys/vdev_trim.h>
 #include <sys/vdev_disk.h>
+#include <sys/vdev_raidz.h>
 #include <sys/vdev_draid.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
+#include <cityhash.h>
+
+/*
+ * spa_thread() existed on Illumos as a parent thread for the various worker
+ * threads that actually run the pool, as a way both to reference the pool's
+ * work as a single object and to share properties like scheduling options.
+ * It has not yet been adapted to Linux or FreeBSD. This define is
+ * used to mark related parts of the code to make things easier for the reader,
+ * and to compile this code out. It can be removed when someone implements it,
+ * moves it to some Illumos-specific place, or removes it entirely.
+ */
+#undef HAVE_SPA_THREAD
+
+/*
+ * The "System Duty Cycle" scheduling class is an Illumos feature to help
+ * prevent CPU-intensive kernel threads from affecting latency on interactive
+ * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is
+ * gated behind a define. On Illumos SDC depends on spa_thread(), but
+ * spa_thread() also has other uses, so this is a separate define.
+ */
+#undef HAVE_SYSDC
 
 /*
  * The interval, in seconds, at which failed configuration cache file writes
@@ -107,16 +130,16 @@ int zfs_ccw_retry_interval = 300;
 
 typedef enum zti_modes {
        ZTI_MODE_FIXED,                 /* value is # of threads (min 1) */
-       ZTI_MODE_BATCH,                 /* cpu-intensive; value is ignored */
        ZTI_MODE_SCALE,                 /* Taskqs scale with CPUs. */
+       ZTI_MODE_SYNC,                  /* sync thread assigned */
        ZTI_MODE_NULL,                  /* don't create a taskq */
        ZTI_NMODES
 } zti_modes_t;
 
 #define        ZTI_P(n, q)     { ZTI_MODE_FIXED, (n), (q) }
 #define        ZTI_PCT(n)      { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
-#define        ZTI_BATCH       { ZTI_MODE_BATCH, 0, 1 }
 #define        ZTI_SCALE       { ZTI_MODE_SCALE, 0, 1 }
+#define        ZTI_SYNC        { ZTI_MODE_SYNC, 0, 1 }
 #define        ZTI_NULL        { ZTI_MODE_NULL, 0, 0 }
 
 #define        ZTI_N(n)        ZTI_P(n, 1)
@@ -137,14 +160,14 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
  * initializing a pool, we use this table to create an appropriately sized
  * taskq. Some operations are low volume and therefore have a small, static
  * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
- * macros. Other operations process a large amount of data; the ZTI_BATCH
+ * macros. Other operations process a large amount of data; the ZTI_SCALE
  * macro causes us to create a taskq oriented for throughput. Some operations
  * are so high frequency and short-lived that the taskq itself can become a
  * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
  * additional degree of parallelism specified by the number of threads per-
  * taskq and the number of taskqs; when dispatching an event in this case, the
- * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH,
- * but with number of taskqs also scaling with number of CPUs.
+ * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs
+ * that scales with the number of CPUs.
  *
  * The different taskq priorities are to handle the different contexts (issue
  * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
@@ -154,7 +177,7 @@ static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
        /* ISSUE        ISSUE_HIGH      INTR            INTR_HIGH */
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* NULL */
        { ZTI_N(8),     ZTI_NULL,       ZTI_SCALE,      ZTI_NULL }, /* READ */
-       { ZTI_BATCH,    ZTI_N(5),       ZTI_SCALE,      ZTI_N(5) }, /* WRITE */
+       { ZTI_SYNC,     ZTI_N(5),       ZTI_SCALE,      ZTI_N(5) }, /* WRITE */
        { ZTI_SCALE,    ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* FREE */
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* CLAIM */
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* IOCTL */
@@ -168,12 +191,24 @@ static int spa_load_impl(spa_t *spa, spa_import_type_t type,
     const char **ereport);
 static void spa_vdev_resilver_done(spa_t *spa);
 
+/*
+ * Percentage of all CPUs that can be used by the metaslab preload taskq.
+ */
+static uint_t metaslab_preload_pct = 50;
+
 static uint_t  zio_taskq_batch_pct = 80;         /* 1 thread per cpu in pset */
 static uint_t  zio_taskq_batch_tpq;              /* threads per taskq */
+
+#ifdef HAVE_SYSDC
 static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
 static const uint_t    zio_taskq_basedc = 80;    /* base duty cycle */
+#endif
 
+#ifdef HAVE_SPA_THREAD
 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
+#endif
+
+static uint_t  zio_taskq_wr_iss_ncpus = 0;
 
 /*
  * Report any spa_load_verify errors found, but do not fail spa_load.
@@ -1023,17 +1058,34 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
        uint_t count = ztip->zti_count;
        spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
        uint_t cpus, flags = TASKQ_DYNAMIC;
-       boolean_t batch = B_FALSE;
 
        switch (mode) {
        case ZTI_MODE_FIXED:
                ASSERT3U(value, >, 0);
                break;
 
-       case ZTI_MODE_BATCH:
-               batch = B_TRUE;
+       case ZTI_MODE_SYNC:
+
+               /*
+                * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus',
+                * not to exceed the number of spa allocators.
+                */
+               if (zio_taskq_wr_iss_ncpus == 0) {
+                       count = MAX(boot_ncpus / spa->spa_alloc_count, 1);
+               } else {
+                       count = MAX(1,
+                           boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus));
+               }
+               count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
+               count = MIN(count, spa->spa_alloc_count);
+
+               /*
+                * zio_taskq_batch_pct is unbounded and may exceed 100%, but no
+                * single taskq may have more threads than 100% of online cpus.
+                */
+               value = (zio_taskq_batch_pct + count / 2) / count;
+               value = MIN(value, 100);
                flags |= TASKQ_THREADS_CPU_PCT;
-               value = MIN(zio_taskq_batch_pct, 100);
                break;
 
        case ZTI_MODE_SCALE:
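
As a quick sanity check of the ZTI_MODE_SYNC sizing above, here is a minimal
standalone sketch of the same arithmetic; the 32 CPUs, 4 allocators, and the
80% batch percentage are illustrative assumptions, not values taken from this
change:

#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	/* Assumed inputs for illustration only. */
	unsigned boot_ncpus = 32, spa_alloc_count = 4;
	unsigned zio_taskq_wr_iss_ncpus = 0, zio_taskq_batch_pct = 80;

	unsigned count = (zio_taskq_wr_iss_ncpus == 0) ?
	    MAX(boot_ncpus / spa_alloc_count, 1) :
	    MAX(1, boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus));
	count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
	count = MIN(count, spa_alloc_count);

	unsigned value = MIN((zio_taskq_batch_pct + count / 2) / count, 100);

	/* With the inputs above this prints "4 taskqs at 20% of CPUs each". */
	printf("%u taskqs at %u%% of CPUs each\n", count, value);
	return (0);
}

Four taskqs at roughly 20% of the CPUs each adds back up to about
zio_taskq_batch_pct (80%) of the machine for write issue.
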
@@ -1080,7 +1132,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 
        default:
                panic("unrecognized mode for %s_%s taskq (%u:%u) in "
-                   "spa_activate()",
+                   "spa_taskqs_init()",
                    zio_type_name[t], zio_taskq_types[q], mode, value);
                break;
        }
@@ -1100,14 +1152,13 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
                        (void) snprintf(name, sizeof (name), "%s_%s",
                            zio_type_name[t], zio_taskq_types[q]);
 
+#ifdef HAVE_SYSDC
                if (zio_taskq_sysdc && spa->spa_proc != &p0) {
-                       if (batch)
-                               flags |= TASKQ_DC_BATCH;
-
                        (void) zio_taskq_basedc;
                        tq = taskq_create_sysdc(name, value, 50, INT_MAX,
                            spa->spa_proc, zio_taskq_basedc, flags);
                } else {
+#endif
                        pri_t pri = maxclsyspri;
                        /*
                         * The write issue taskq can be extremely CPU
@@ -1133,7 +1184,9 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
                        }
                        tq = taskq_create_proc(name, value, pri, 50,
                            INT_MAX, spa->spa_proc, flags);
+#ifdef HAVE_SYSDC
                }
+#endif
 
                tqs->stqs_taskq[i] = tq;
        }
@@ -1161,12 +1214,11 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 /*
  * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
  * Note that a type may have multiple discrete taskqs to avoid lock contention
- * on the taskq itself. In that case we choose which taskq at random by using
- * the low bits of gethrtime().
+ * on the taskq itself.
  */
-void
-spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
-    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
+static taskq_t *
+spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+    zio_t *zio)
 {
        spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
        taskq_t *tq;
@@ -1174,12 +1226,27 @@ spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
        ASSERT3P(tqs->stqs_taskq, !=, NULL);
        ASSERT3U(tqs->stqs_count, !=, 0);
 
+       if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
+           (zio != NULL) && (zio->io_wr_iss_tq != NULL)) {
+               /* dispatch to assigned write issue taskq */
+               tq = zio->io_wr_iss_tq;
+               return (tq);
+       }
+
        if (tqs->stqs_count == 1) {
                tq = tqs->stqs_taskq[0];
        } else {
                tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
        }
+       return (tq);
+}
 
+void
+spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent,
+    zio_t *zio)
+{
+       taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, zio);
        taskq_dispatch_ent(tq, func, arg, flags, ent);
 }
 
@@ -1190,20 +1257,8 @@ void
 spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
     task_func_t *func, void *arg, uint_t flags)
 {
-       spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
-       taskq_t *tq;
-       taskqid_t id;
-
-       ASSERT3P(tqs->stqs_taskq, !=, NULL);
-       ASSERT3U(tqs->stqs_count, !=, 0);
-
-       if (tqs->stqs_count == 1) {
-               tq = tqs->stqs_taskq[0];
-       } else {
-               tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
-       }
-
-       id = taskq_dispatch(tq, func, arg, flags);
+       taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, NULL);
+       taskqid_t id = taskq_dispatch(tq, func, arg, flags);
        if (id)
                taskq_wait_id(tq, id);
 }
@@ -1218,11 +1273,6 @@ spa_create_zio_taskqs(spa_t *spa)
        }
 }
 
-/*
- * Disabled until spa_thread() can be adapted for Linux.
- */
-#undef HAVE_SPA_THREAD
-
 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
 static void
 spa_thread(void *arg)
@@ -1263,9 +1313,11 @@ spa_thread(void *arg)
                pool_unlock();
        }
 
+#ifdef HAVE_SYSDC
        if (zio_taskq_sysdc) {
                sysdc_thread_enter(curthread, 100, 0);
        }
+#endif
 
        spa->spa_proc = curproc;
        spa->spa_did = curthread->t_did;
@@ -1294,24 +1346,26 @@ spa_thread(void *arg)
 }
 #endif
 
+extern metaslab_ops_t *metaslab_allocator(spa_t *spa);
+
 /*
  * Activate an uninitialized pool.
  */
 static void
 spa_activate(spa_t *spa, spa_mode_t mode)
 {
+       metaslab_ops_t *msp = metaslab_allocator(spa);
        ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 
        spa->spa_state = POOL_STATE_ACTIVE;
        spa->spa_mode = mode;
        spa->spa_read_spacemaps = spa_mode_readable_spacemaps;
 
-       spa->spa_normal_class = metaslab_class_create(spa, &zfs_metaslab_ops);
-       spa->spa_log_class = metaslab_class_create(spa, &zfs_metaslab_ops);
-       spa->spa_embedded_log_class =
-           metaslab_class_create(spa, &zfs_metaslab_ops);
-       spa->spa_special_class = metaslab_class_create(spa, &zfs_metaslab_ops);
-       spa->spa_dedup_class = metaslab_class_create(spa, &zfs_metaslab_ops);
+       spa->spa_normal_class = metaslab_class_create(spa, msp);
+       spa->spa_log_class = metaslab_class_create(spa, msp);
+       spa->spa_embedded_log_class = metaslab_class_create(spa, msp);
+       spa->spa_special_class = metaslab_class_create(spa, msp);
+       spa->spa_dedup_class = metaslab_class_create(spa, msp);
 
        /* Try to create a covering process */
        mutex_enter(&spa->spa_proc_lock);
@@ -1319,7 +1373,6 @@ spa_activate(spa_t *spa, spa_mode_t mode)
        ASSERT(spa->spa_proc == &p0);
        spa->spa_did = 0;
 
-       (void) spa_create_process;
 #ifdef HAVE_SPA_THREAD
        /* Only create a process if we're going to be around a while. */
        if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
@@ -1396,6 +1449,13 @@ spa_activate(spa_t *spa, spa_mode_t mode)
        spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
            1, INT_MAX, 0);
 
+       /*
+        * The taskq to preload metaslabs.
+        */
+       spa->spa_metaslab_taskq = taskq_create("z_metaslab",
+           metaslab_preload_pct, maxclsyspri, 1, INT_MAX,
+           TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
+
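
A brief note on the sizing here: with TASKQ_THREADS_CPU_PCT the thread-count
argument is treated as a percentage of online CPUs (the usual SPL taskq
behavior; stated here as an assumption rather than something this diff spells
out), so the default metaslab_preload_pct of 50 gives the z_metaslab taskq
roughly half the CPUs:

/*
 * Sketch of the effective z_metaslab thread count under the percentage
 * interpretation assumed above, e.g. 16 online CPUs at 50% -> 8 threads.
 */
static inline unsigned
z_metaslab_threads(unsigned online_cpus, unsigned preload_pct)
{
	unsigned n = (online_cpus * preload_pct) / 100;
	return (n > 0 ? n : 1);
}
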
        /*
         * Taskq dedicated to prefetcher threads: this is used to prevent the
         * pool traverse code from monopolizing the global (and limited)
@@ -1431,6 +1491,11 @@ spa_deactivate(spa_t *spa)
                spa->spa_zvol_taskq = NULL;
        }
 
+       if (spa->spa_metaslab_taskq) {
+               taskq_destroy(spa->spa_metaslab_taskq);
+               spa->spa_metaslab_taskq = NULL;
+       }
+
        if (spa->spa_prefetch_taskq) {
                taskq_destroy(spa->spa_prefetch_taskq);
                spa->spa_prefetch_taskq = NULL;
@@ -1608,16 +1673,16 @@ spa_unload_log_sm_metadata(spa_t *spa)
 {
        void *cookie = NULL;
        spa_log_sm_t *sls;
+       log_summary_entry_t *e;
+
        while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
            &cookie)) != NULL) {
                VERIFY0(sls->sls_mscount);
                kmem_free(sls, sizeof (spa_log_sm_t));
        }
 
-       for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
-           e != NULL; e = list_head(&spa->spa_log_summary)) {
+       while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) {
                VERIFY0(e->lse_mscount);
-               list_remove(&spa->spa_log_summary, e);
                kmem_free(e, sizeof (log_summary_entry_t));
        }
 
@@ -1645,6 +1710,10 @@ spa_destroy_aux_threads(spa_t *spa)
                zthr_destroy(spa->spa_livelist_condense_zthr);
                spa->spa_livelist_condense_zthr = NULL;
        }
+       if (spa->spa_raidz_expand_zthr != NULL) {
+               zthr_destroy(spa->spa_raidz_expand_zthr);
+               spa->spa_raidz_expand_zthr = NULL;
+       }
 }
 
 /*
@@ -1703,13 +1772,7 @@ spa_unload(spa_t *spa)
         * This ensures that there is no async metaslab prefetching
         * while we attempt to unload the spa.
         */
-       if (spa->spa_root_vdev != NULL) {
-               for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
-                       vdev_t *vc = spa->spa_root_vdev->vdev_child[c];
-                       if (vc->vdev_mg != NULL)
-                               taskq_wait(vc->vdev_mg->mg_taskq);
-               }
-       }
+       taskq_wait(spa->spa_metaslab_taskq);
 
        if (spa->spa_mmp.mmp_thread)
                mmp_thread_stop(spa);
@@ -1803,6 +1866,8 @@ spa_unload(spa_t *spa)
                spa->spa_compatibility = NULL;
        }
 
+       spa->spa_raidz_expand = NULL;
+
        spa_config_exit(spa, SCL_ALL, spa);
 }
 
@@ -2387,7 +2452,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
         * When damaged consider it to be a metadata error since we cannot
         * trust the BP_GET_TYPE and BP_GET_LEVEL values.
         */
-       if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) {
+       if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
                atomic_inc_64(&sle->sle_meta_count);
                return (0);
        }
@@ -2941,6 +3006,7 @@ spa_spawn_aux_threads(spa_t *spa)
 
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
+       spa_start_raidz_expansion_thread(spa);
        spa_start_indirect_condensing_thread(spa);
        spa_start_livelist_destroy_thread(spa);
        spa_start_livelist_condensing_thread(spa);
@@ -3043,6 +3109,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
        spa->spa_load_state = state;
        (void) spa_import_progress_set_state(spa_guid(spa),
            spa_load_state(spa));
+       spa_import_progress_set_notes(spa, "spa_load()");
 
        gethrestime(&spa->spa_loaded_ts);
        error = spa_load_impl(spa, type, &ereport);
@@ -3271,7 +3338,7 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
        uint64_t mmp_config = ub->ub_mmp_config;
        uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
        uint64_t import_delay;
-       hrtime_t import_expire;
+       hrtime_t import_expire, now;
        nvlist_t *mmp_label = NULL;
        vdev_t *rvd = spa->spa_root_vdev;
        kcondvar_t cv;
@@ -3309,7 +3376,17 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
 
        import_expire = gethrtime() + import_delay;
 
-       while (gethrtime() < import_expire) {
+       spa_import_progress_set_notes(spa, "Checking MMP activity, waiting "
+           "%llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
+
+       int iterations = 0;
+       while ((now = gethrtime()) < import_expire) {
+               if (iterations++ % 30 == 0) {
+                       spa_import_progress_set_notes(spa, "Checking MMP "
+                           "activity, %llu ms remaining",
+                           (u_longlong_t)NSEC2MSEC(import_expire - now));
+               }
+
                (void) spa_import_progress_set_mmp_check(spa_guid(spa),
                    NSEC2SEC(import_expire - gethrtime()));
 
@@ -3695,6 +3772,12 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
        }
        spa_load_note(spa, "using uberblock with txg=%llu",
            (u_longlong_t)ub->ub_txg);
+       if (ub->ub_raidz_reflow_info != 0) {
+               spa_load_note(spa, "uberblock raidz_reflow_info: "
+                   "state=%u offset=%llu",
+                   (int)RRSS_GET_STATE(ub),
+                   (u_longlong_t)RRSS_GET_OFFSET(ub));
+       }
 
 
        /*
@@ -3919,6 +4002,24 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
        rvd = mrvd;
        spa_config_exit(spa, SCL_ALL, FTAG);
 
+       /*
+        * If 'zpool import' used a cached config, then the on-disk hostid and
+        * hostname may differ from the cached config in ways that should
+        * prevent import.  Userspace can't discover this without a scan, but
+        * the kernel knows, so we add these values to LOAD_INFO so the caller
+        * can tell the difference.
+        *
+        * Note that we have to do this before the config is regenerated,
+        * because the new config will have the hostid and hostname for this
+        * host, in readiness for import.
+        */
+       if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID))
+               fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID,
+                   fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID));
+       if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME))
+               fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME,
+                   fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME));
+
        /*
         * We will use spa_config if we decide to reload the spa or if spa_load
         * fails and we rewind. We must thus regenerate the config using the
@@ -4905,6 +5006,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
        /*
         * Retrieve the checkpoint txg if the pool has a checkpoint.
         */
+       spa_import_progress_set_notes(spa, "Loading checkpoint txg");
        error = spa_ld_read_checkpoint_txg(spa);
        if (error != 0)
                return (error);
@@ -4917,6 +5019,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
         * initiated. Otherwise we could be reading from indirect vdevs before
         * we have loaded their mappings.
         */
+       spa_import_progress_set_notes(spa, "Loading indirect vdev metadata");
        error = spa_ld_open_indirect_vdev_metadata(spa);
        if (error != 0)
                return (error);
@@ -4925,6 +5028,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
         * Retrieve the full list of active features from the MOS and check if
         * they are all supported.
         */
+       spa_import_progress_set_notes(spa, "Checking feature flags");
        error = spa_ld_check_features(spa, &missing_feat_write);
        if (error != 0)
                return (error);
@@ -4933,6 +5037,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
         * Load several special directories from the MOS needed by the dsl_pool
         * layer.
         */
+       spa_import_progress_set_notes(spa, "Loading special MOS directories");
        error = spa_ld_load_special_directories(spa);
        if (error != 0)
                return (error);
@@ -4940,6 +5045,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
        /*
         * Retrieve pool properties from the MOS.
         */
+       spa_import_progress_set_notes(spa, "Loading properties");
        error = spa_ld_get_props(spa);
        if (error != 0)
                return (error);
@@ -4948,6 +5054,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
         * Retrieve the list of auxiliary devices - cache devices and spares -
         * and open them.
         */
+       spa_import_progress_set_notes(spa, "Loading AUX vdevs");
        error = spa_ld_open_aux_vdevs(spa, type);
        if (error != 0)
                return (error);
@@ -4956,14 +5063,17 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
         * Load the metadata for all vdevs. Also check if unopenable devices
         * should be autoreplaced.
         */
+       spa_import_progress_set_notes(spa, "Loading vdev metadata");
        error = spa_ld_load_vdev_metadata(spa);
        if (error != 0)
                return (error);
 
+       spa_import_progress_set_notes(spa, "Loading dedup tables");
        error = spa_ld_load_dedup_tables(spa);
        if (error != 0)
                return (error);
 
+       spa_import_progress_set_notes(spa, "Loading BRT");
        error = spa_ld_load_brt(spa);
        if (error != 0)
                return (error);
@@ -4972,6 +5082,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
         * Verify the logs now to make sure we don't have any unexpected errors
         * when we claim log blocks later.
         */
+       spa_import_progress_set_notes(spa, "Verifying log devices");
        error = spa_ld_verify_logs(spa, type, ereport);
        if (error != 0)
                return (error);
@@ -4993,6 +5104,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
         * state. When performing an extreme rewind, we verify the whole pool,
         * which can take a very long time.
         */
+       spa_import_progress_set_notes(spa, "Verifying pool data");
        error = spa_ld_verify_pool_data(spa);
        if (error != 0)
                return (error);
@@ -5002,6 +5114,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
         * we write anything to the pool because we'd need to update the space
         * accounting using the deflated sizes.
         */
+       spa_import_progress_set_notes(spa, "Calculating deflated space");
        spa_update_dspace(spa);
 
        /*
@@ -5009,12 +5122,20 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
         * pool. If we are importing the pool in read-write mode, a few
         * additional steps must be performed to finish the import.
         */
+       spa_import_progress_set_notes(spa, "Starting import");
        if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
            spa->spa_load_max_txg == UINT64_MAX)) {
                uint64_t config_cache_txg = spa->spa_config_txg;
 
                ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
 
+               /*
+                * Before we do any zio_write's, complete the raidz expansion
+                * scratch space copying, if necessary.
+                */
+               if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID)
+                       vdev_raidz_reflow_copy_scratch(spa);
+
                /*
                 * In case of a checkpoint rewind, log the original txg
                 * of the checkpointed uberblock.
@@ -5025,6 +5146,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
                            (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
                }
 
+               spa_import_progress_set_notes(spa, "Claiming ZIL blocks");
                /*
                 * Traverse the ZIL and claim all blocks.
                 */
@@ -5044,6 +5166,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
                 * will have been set for us by ZIL traversal operations
                 * performed above.
                 */
+               spa_import_progress_set_notes(spa, "Syncing ZIL claims");
                txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
 
                /*
@@ -5051,6 +5174,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
                 * next sync, we would update the config stored in vdev labels
                 * and the cachefile (by default /etc/zfs/zpool.cache).
                 */
+               spa_import_progress_set_notes(spa, "Updating configs");
                spa_ld_check_for_config_update(spa, config_cache_txg,
                    update_config_cache);
 
@@ -5059,6 +5183,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
                 * Then check all DTLs to see if anything needs resilvering.
                 * The resilver will be deferred if a rebuild was started.
                 */
+               spa_import_progress_set_notes(spa, "Starting resilvers");
                if (vdev_rebuild_active(spa->spa_root_vdev)) {
                        vdev_rebuild_restart(spa);
                } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
@@ -5072,6 +5197,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
                 */
                spa_history_log_version(spa, "open", NULL);
 
+               spa_import_progress_set_notes(spa,
+                   "Restarting device removals");
                spa_restart_removal(spa);
                spa_spawn_aux_threads(spa);
 
@@ -5084,19 +5211,26 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
                 * auxiliary threads above (from which the livelist
                 * deletion zthr is part of).
                 */
+               spa_import_progress_set_notes(spa,
+                   "Cleaning up inconsistent objsets");
                (void) dmu_objset_find(spa_name(spa),
                    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
 
                /*
                 * Clean up any stale temporary dataset userrefs.
                 */
+               spa_import_progress_set_notes(spa,
+                   "Cleaning up temporary userrefs");
                dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
 
                spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+               spa_import_progress_set_notes(spa, "Restarting initialize");
                vdev_initialize_restart(spa->spa_root_vdev);
+               spa_import_progress_set_notes(spa, "Restarting TRIM");
                vdev_trim_restart(spa->spa_root_vdev);
                vdev_autotrim_restart(spa);
                spa_config_exit(spa, SCL_CONFIG, FTAG);
+               spa_import_progress_set_notes(spa, "Finished importing");
        }
 
        spa_import_progress_remove(spa_guid(spa));
@@ -6829,9 +6963,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 }
 
 /*
- * Attach a device to a mirror.  The arguments are the path to any device
- * in the mirror, and the nvroot for the new device.  If the path specifies
- * a device that is not mirrored, we automatically insert the mirror vdev.
+ * Attach a device to a vdev specified by its guid.  The vdev type can be
+ * a mirror, a raidz, or a leaf device that is also a top-level vdev (e.g. a
+ * single device). When the vdev is a single device, a mirror vdev will be
+ * automatically inserted.
  *
  * If 'replacing' is specified, the new device is intended to replace the
  * existing device; in this case the two devices are made into their own
@@ -6854,7 +6989,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
        vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
        vdev_ops_t *pvops;
        char *oldvdpath, *newvdpath;
-       int newvd_isspare;
+       int newvd_isspare = B_FALSE;
        int error;
 
        ASSERT(spa_writeable(spa));
@@ -6874,25 +7009,46 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
                if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
                        return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
-               if (dsl_scan_resilvering(spa_get_dsl(spa)))
+               if (dsl_scan_resilvering(spa_get_dsl(spa)) ||
+                   dsl_scan_resilver_scheduled(spa_get_dsl(spa))) {
                        return (spa_vdev_exit(spa, NULL, txg,
                            ZFS_ERR_RESILVER_IN_PROGRESS));
+               }
        } else {
                if (vdev_rebuild_active(rvd))
                        return (spa_vdev_exit(spa, NULL, txg,
                            ZFS_ERR_REBUILD_IN_PROGRESS));
        }
 
-       if (spa->spa_vdev_removal != NULL)
-               return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+       if (spa->spa_vdev_removal != NULL) {
+               return (spa_vdev_exit(spa, NULL, txg,
+                   ZFS_ERR_DEVRM_IN_PROGRESS));
+       }
 
        if (oldvd == NULL)
                return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
-       if (!oldvd->vdev_ops->vdev_op_leaf)
+       boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops;
+
+       if (raidz) {
+               if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION))
+                       return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+               /*
+                * Can't expand a raidz while a prior expand is in progress.
+                */
+               if (spa->spa_raidz_expand != NULL) {
+                       return (spa_vdev_exit(spa, NULL, txg,
+                           ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS));
+               }
+       } else if (!oldvd->vdev_ops->vdev_op_leaf) {
                return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+       }
 
-       pvd = oldvd->vdev_parent;
+       if (raidz)
+               pvd = oldvd;
+       else
+               pvd = oldvd->vdev_parent;
 
        if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
            VDEV_ALLOC_ATTACH) != 0)
@@ -6944,11 +7100,13 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
 
        if (!replacing) {
                /*
-                * For attach, the only allowable parent is a mirror or the root
-                * vdev.
+                * For attach, the only allowable parent is a mirror or
+                * the root vdev. A raidz vdev can be attached to, but
+                * you cannot attach to a raidz child.
                 */
                if (pvd->vdev_ops != &vdev_mirror_ops &&
-                   pvd->vdev_ops != &vdev_root_ops)
+                   pvd->vdev_ops != &vdev_root_ops &&
+                   !raidz)
                        return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
                pvops = &vdev_mirror_ops;
@@ -6987,7 +7145,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
        /*
         * Make sure the new device is big enough.
         */
-       if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
+       vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd;
+       if (newvd->vdev_asize < vdev_get_min_asize(min_vdev))
                return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
 
        /*
@@ -6997,32 +7156,75 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
        if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
                return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
+       /*
+        * RAIDZ-expansion-specific checks.
+        */
+       if (raidz) {
+               if (vdev_raidz_attach_check(newvd) != 0)
+                       return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+               /*
+                * Fail early if a child is not healthy or is being replaced.
+                */
+               for (int i = 0; i < oldvd->vdev_children; i++) {
+                       if (vdev_is_dead(oldvd->vdev_child[i]) ||
+                           !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) {
+                               return (spa_vdev_exit(spa, newrootvd, txg,
+                                   ENXIO));
+                       }
+                       /* Also fail if reserved boot area is in use */
+                       if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i])
+                           != 0) {
+                               return (spa_vdev_exit(spa, newrootvd, txg,
+                                   EADDRINUSE));
+                       }
+               }
+       }
+
+       if (raidz) {
+               /*
+                * Note: oldvdpath is freed by spa_strfree(), but the
+                * kmem_asprintf() result must be freed by kmem_strfree(), so
+                * we have to copy it into a spa_strdup-ed string.
+                */
+               char *tmp = kmem_asprintf("raidz%u-%u",
+                   (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id);
+               oldvdpath = spa_strdup(tmp);
+               kmem_strfree(tmp);
+       } else {
+               oldvdpath = spa_strdup(oldvd->vdev_path);
+       }
+       newvdpath = spa_strdup(newvd->vdev_path);
+
        /*
         * If this is an in-place replacement, update oldvd's path and devid
         * to make it distinguishable from newvd, and unopenable from now on.
         */
-       if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
+       if (strcmp(oldvdpath, newvdpath) == 0) {
                spa_strfree(oldvd->vdev_path);
-               oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
+               oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5,
                    KM_SLEEP);
-               (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5,
-                   "%s/%s", newvd->vdev_path, "old");
+               (void) sprintf(oldvd->vdev_path, "%s/old",
+                   newvdpath);
                if (oldvd->vdev_devid != NULL) {
                        spa_strfree(oldvd->vdev_devid);
                        oldvd->vdev_devid = NULL;
                }
+               spa_strfree(oldvdpath);
+               oldvdpath = spa_strdup(oldvd->vdev_path);
        }
 
        /*
         * If the parent is not a mirror, or if we're replacing, insert the new
         * mirror/replacing/spare vdev above oldvd.
         */
-       if (pvd->vdev_ops != pvops)
+       if (!raidz && pvd->vdev_ops != pvops) {
                pvd = vdev_add_parent(oldvd, pvops);
+               ASSERT(pvd->vdev_ops == pvops);
+               ASSERT(oldvd->vdev_parent == pvd);
+       }
 
        ASSERT(pvd->vdev_top->vdev_parent == rvd);
-       ASSERT(pvd->vdev_ops == pvops);
-       ASSERT(oldvd->vdev_parent == pvd);
 
        /*
         * Extract the new device from its root and add it to pvd.
@@ -7050,41 +7252,66 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
         */
        dtl_max_txg = txg + TXG_CONCURRENT_STATES;
 
-       vdev_dtl_dirty(newvd, DTL_MISSING,
-           TXG_INITIAL, dtl_max_txg - TXG_INITIAL);
+       if (raidz) {
+               /*
+                * Wait for the youngest allocations and frees to sync,
+                * and then wait for the deferral of those frees to finish.
+                */
+               spa_vdev_config_exit(spa, NULL,
+                   txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
 
-       if (newvd->vdev_isspare) {
-               spa_spare_activate(newvd);
-               spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
-       }
+               vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE);
+               vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE);
+               vdev_autotrim_stop_wait(tvd);
 
-       oldvdpath = spa_strdup(oldvd->vdev_path);
-       newvdpath = spa_strdup(newvd->vdev_path);
-       newvd_isspare = newvd->vdev_isspare;
+               dtl_max_txg = spa_vdev_config_enter(spa);
 
-       /*
-        * Mark newvd's DTL dirty in this txg.
-        */
-       vdev_dirty(tvd, VDD_DTL, newvd, txg);
+               tvd->vdev_rz_expanding = B_TRUE;
 
-       /*
-        * Schedule the resilver or rebuild to restart in the future. We do
-        * this to ensure that dmu_sync-ed blocks have been stitched into the
-        * respective datasets.
-        */
-       if (rebuild) {
-               newvd->vdev_rebuild_txg = txg;
+               vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg);
+               vdev_config_dirty(tvd);
 
-               vdev_rebuild(tvd);
+               dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
+                   dtl_max_txg);
+               dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync,
+                   newvd, tx);
+               dmu_tx_commit(tx);
        } else {
-               newvd->vdev_resilver_txg = txg;
+               vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
+                   dtl_max_txg - TXG_INITIAL);
 
-               if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
-                   spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
-                       vdev_defer_resilver(newvd);
+               if (newvd->vdev_isspare) {
+                       spa_spare_activate(newvd);
+                       spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
+               }
+
+               newvd_isspare = newvd->vdev_isspare;
+
+               /*
+                * Mark newvd's DTL dirty in this txg.
+                */
+               vdev_dirty(tvd, VDD_DTL, newvd, txg);
+
+               /*
+                * Schedule the resilver or rebuild to restart in the future.
+                * We do this to ensure that dmu_sync-ed blocks have been
+                * stitched into the respective datasets.
+                */
+               if (rebuild) {
+                       newvd->vdev_rebuild_txg = txg;
+
+                       vdev_rebuild(tvd);
                } else {
-                       dsl_scan_restart_resilver(spa->spa_dsl_pool,
-                           dtl_max_txg);
+                       newvd->vdev_resilver_txg = txg;
+
+                       if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
+                           spa_feature_is_enabled(spa,
+                           SPA_FEATURE_RESILVER_DEFER)) {
+                               vdev_defer_resilver(newvd);
+                       } else {
+                               dsl_scan_restart_resilver(spa->spa_dsl_pool,
+                                   dtl_max_txg);
+                       }
                }
        }
 
@@ -7409,7 +7636,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
         */
        if (cmd_type == POOL_INITIALIZE_START &&
            (vd->vdev_initialize_thread != NULL ||
-           vd->vdev_top->vdev_removing)) {
+           vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) {
                mutex_exit(&vd->vdev_initialize_lock);
                return (SET_ERROR(EBUSY));
        } else if (cmd_type == POOL_INITIALIZE_CANCEL &&
@@ -7421,6 +7648,10 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
            vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
                mutex_exit(&vd->vdev_initialize_lock);
                return (SET_ERROR(ESRCH));
+       } else if (cmd_type == POOL_INITIALIZE_UNINIT &&
+           vd->vdev_initialize_thread != NULL) {
+               mutex_exit(&vd->vdev_initialize_lock);
+               return (SET_ERROR(EBUSY));
        }
 
        switch (cmd_type) {
@@ -7433,6 +7664,9 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
        case POOL_INITIALIZE_SUSPEND:
                vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);
                break;
+       case POOL_INITIALIZE_UNINIT:
+               vdev_uninitialize(vd);
+               break;
        default:
                panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
        }
@@ -7524,7 +7758,8 @@ spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
         * which has completed but the thread is not exited.
         */
        if (cmd_type == POOL_TRIM_START &&
-           (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) {
+           (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing ||
+           vd->vdev_top->vdev_rz_expanding)) {
                mutex_exit(&vd->vdev_trim_lock);
                return (SET_ERROR(EBUSY));
        } else if (cmd_type == POOL_TRIM_CANCEL &&
@@ -8166,6 +8401,7 @@ spa_scan_stop(spa_t *spa)
        ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
        if (dsl_scan_resilvering(spa->spa_dsl_pool))
                return (SET_ERROR(EBUSY));
+
        return (dsl_scan_cancel(spa->spa_dsl_pool));
 }
 
@@ -8191,6 +8427,10 @@ spa_scan(spa_t *spa, pool_scan_func_t func)
                return (0);
        }
 
+       if (func == POOL_SCAN_ERRORSCRUB &&
+           !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG))
+               return (SET_ERROR(ENOTSUP));
+
        return (dsl_scan(spa->spa_dsl_pool, func));
 }
 
@@ -8422,6 +8662,10 @@ spa_async_suspend(spa_t *spa)
        if (condense_thread != NULL)
                zthr_cancel(condense_thread);
 
+       zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr;
+       if (raidz_expand_thread != NULL)
+               zthr_cancel(raidz_expand_thread);
+
        zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
        if (discard_thread != NULL)
                zthr_cancel(discard_thread);
@@ -8448,6 +8692,10 @@ spa_async_resume(spa_t *spa)
        if (condense_thread != NULL)
                zthr_resume(condense_thread);
 
+       zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr;
+       if (raidz_expand_thread != NULL)
+               zthr_resume(raidz_expand_thread);
+
        zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
        if (discard_thread != NULL)
                zthr_resume(discard_thread);
@@ -8942,12 +9190,12 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
                        }
 
                        /* normalize the property name */
-                       propname = zpool_prop_to_name(prop);
-                       proptype = zpool_prop_get_type(prop);
-                       if (prop == ZPOOL_PROP_INVAL &&
-                           zfs_prop_user(elemname)) {
+                       if (prop == ZPOOL_PROP_INVAL) {
                                propname = elemname;
                                proptype = PROP_TYPE_STRING;
+                       } else {
+                               propname = zpool_prop_to_name(prop);
+                               proptype = zpool_prop_get_type(prop);
                        }
 
                        if (nvpair_type(elem) == DATA_TYPE_STRING) {
@@ -9242,6 +9490,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
                brt_sync(spa, txg);
                ddt_sync(spa, txg);
                dsl_scan_sync(dp, tx);
+               dsl_errorscrub_sync(dp, tx);
                svr_sync(spa, tx);
                spa_sync_upgrades(spa, tx);
 
@@ -9252,6 +9501,27 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
                    != NULL)
                        vdev_sync(vd, txg);
 
+               if (pass == 1) {
+                       /*
+                        * dsl_pool_sync() -> dp_sync_tasks may have dirtied
+                        * the config. If that happens, this txg should not
+                        * be a no-op. So we must sync the config to the MOS
+                        * before checking for no-op.
+                        *
+                        * Note that when the config is dirty, it will
+                        * be written to the MOS (i.e. the MOS will be
+                        * dirtied) every time we call spa_sync_config_object()
+                        * in this txg.  Therefore we can't call this after
+                        * dsl_pool_sync() every pass, because it would
+                        * prevent us from converging, since we'd dirty
+                        * the MOS every pass.
+                        *
+                        * Sync tasks can only be processed in pass 1, so
+                        * there's no need to do this in later passes.
+                        */
+                       spa_sync_config_object(spa, tx);
+               }
+
                /*
                 * Note: We need to check if the MOS is dirty because we could
                 * have marked the MOS dirty without updating the uberblock
@@ -9573,6 +9843,104 @@ spa_sync_allpools(void)
        mutex_exit(&spa_namespace_lock);
 }
 
+taskq_t *
+spa_sync_tq_create(spa_t *spa, const char *name)
+{
+       kthread_t **kthreads;
+
+       ASSERT(spa->spa_sync_tq == NULL);
+       ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus);
+
+       /*
+        * - do not allow more allocators than cpus.
+        * - there may be more cpus than allocators.
+        * - do not allow more sync taskq threads than allocators or cpus.
+        */
+       int nthreads = spa->spa_alloc_count;
+       spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) *
+           nthreads, KM_SLEEP);
+
+       spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri,
+           nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads);
+       VERIFY(spa->spa_sync_tq != NULL);
+       VERIFY(kthreads != NULL);
+
+       spa_taskqs_t *tqs =
+           &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE];
+
+       spa_syncthread_info_t *ti = spa->spa_syncthreads;
+       for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) {
+               ti->sti_thread = kthreads[i];
+               if (w == tqs->stqs_count) {
+                       w = 0;
+               }
+               ti->sti_wr_iss_tq = tqs->stqs_taskq[w];
+       }
+
+       kmem_free(kthreads, sizeof (*kthreads) * nthreads);
+       return (spa->spa_sync_tq);
+}
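
The loop above hands out write-issue taskqs to sync threads round-robin; here
is a tiny standalone sketch of that mapping (4 sync threads and 2 taskqs are
assumed purely for illustration):

#include <stdio.h>

int
main(void)
{
	/* Assumed counts: spa_alloc_count sync threads, stqs_count taskqs. */
	int nthreads = 4, stqs_count = 2;

	for (int i = 0, w = 0; i < nthreads; i++, w++) {
		if (w == stqs_count)
			w = 0;
		/* Prints 0->0, 1->1, 2->0, 3->1 with the counts above. */
		printf("sync thread %d -> wr_iss taskq %d\n", i, w);
	}
	return (0);
}
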
+
+void
+spa_sync_tq_destroy(spa_t *spa)
+{
+       ASSERT(spa->spa_sync_tq != NULL);
+
+       taskq_wait(spa->spa_sync_tq);
+       taskq_destroy(spa->spa_sync_tq);
+       kmem_free(spa->spa_syncthreads,
+           sizeof (spa_syncthread_info_t) * spa->spa_alloc_count);
+       spa->spa_sync_tq = NULL;
+}
+
+void
+spa_select_allocator(zio_t *zio)
+{
+       zbookmark_phys_t *bm = &zio->io_bookmark;
+       spa_t *spa = zio->io_spa;
+
+       ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+       /*
+        * A gang block (for example) may have inherited its parent's
+        * allocator, in which case there is nothing further to do here.
+        */
+       if (ZIO_HAS_ALLOCATOR(zio))
+               return;
+
+       ASSERT(spa != NULL);
+       ASSERT(bm != NULL);
+
+       /*
+        * First try to use an allocator assigned to the syncthread, and set
+        * the corresponding write issue taskq for the allocator.
+        * Note, we must have an open pool to do this.
+        */
+       if (spa->spa_sync_tq != NULL) {
+               spa_syncthread_info_t *ti = spa->spa_syncthreads;
+               for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
+                       if (ti->sti_thread == curthread) {
+                               zio->io_allocator = i;
+                               zio->io_wr_iss_tq = ti->sti_wr_iss_tq;
+                               return;
+                       }
+               }
+       }
+
+       /*
+        * We want to try to use as many allocators as possible to help improve
+        * performance, but we also want logically adjacent IOs to be physically
+        * adjacent to improve sequential read performance. We chunk each object
+        * into 2^20 block regions, and then hash based on the objset, object,
+        * level, and region to accomplish both of these goals.
+        */
+       uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level,
+           bm->zb_blkid >> 20);
+
+       zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
+       zio->io_wr_iss_tq = NULL;
+}
+
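
To illustrate the region hashing described in the comment above, here is a
standalone sketch; the stand-in hash, the sample bookmark values, and the
four-allocator count are assumptions for demonstration (the real code uses
cityhash4()):

#include <stdint.h>
#include <stdio.h>

static uint64_t
toy_hash4(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
{
	/* splitmix64-style mixer standing in for cityhash4(). */
	uint64_t x = a * 0x9E3779B97F4A7C15ULL + b;
	x ^= x >> 30;
	x *= 0xBF58476D1CE4E5B9ULL;
	x += c * 0x94D049BB133111EBULL + d;
	x ^= x >> 27;
	return (x);
}

int
main(void)
{
	uint64_t objset = 54, object = 7, level = 0;	/* assumed bookmark */
	unsigned alloc_count = 4;			/* assumed allocators */

	/*
	 * blkids 0 and 524288 share region 0 and therefore the same
	 * allocator; blkid 1048576 starts region 1 and may move elsewhere.
	 */
	for (uint64_t blkid = 0; blkid <= (1ULL << 20); blkid += (1ULL << 19)) {
		uint64_t hv = toy_hash4(objset, object, level, blkid >> 20);
		printf("blkid %8llu -> allocator %u\n",
		    (unsigned long long)blkid, (unsigned)(hv % alloc_count));
	}
	return (0);
}
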
 /*
  * ==========================================================================
  * Miscellaneous routines
@@ -9911,7 +10279,8 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
                    DSS_SCANNING);
                break;
        case ZPOOL_WAIT_RESILVER:
-               if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev)))
+               *in_progress = vdev_rebuild_active(spa->spa_root_vdev);
+               if (*in_progress)
                        break;
                zfs_fallthrough;
        case ZPOOL_WAIT_SCRUB:
@@ -9926,6 +10295,12 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
                    is_scrub == (activity == ZPOOL_WAIT_SCRUB));
                break;
        }
+       case ZPOOL_WAIT_RAIDZ_EXPAND:
+       {
+               vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
+               *in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING);
+               break;
+       }
        default:
                panic("unrecognized value for activity %d", activity);
        }
@@ -10116,6 +10491,9 @@ EXPORT_SYMBOL(spa_prop_clear_bootfs);
 /* asynchronous event notification */
 EXPORT_SYMBOL(spa_event_notify);
 
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW,
+       "Percentage of CPUs to run a metaslab preload taskq");
+
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
        "log2 fraction of arc that can be used by inflight I/Os when "
@@ -10163,3 +10541,6 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
        "Whether extra ALLOC blkptrs were added to a livelist entry while it "
        "was being condensed");
 /* END CSTYLED */
+
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW,
+       "Number of CPUs to run write issue taskqs");