* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
+ * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
*/
/*
#include <sys/vdev_rebuild.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_disk.h>
+#include <sys/vdev_raidz.h>
#include <sys/vdev_draid.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include "zfs_prop.h"
#include "zfs_comutil.h"
+#include <cityhash.h>
+
+/*
+ * spa_thread() existed on Illumos as a parent thread for the various worker
+ * threads that actually run the pool, as a way to both reference the entire
+ * pool work as a single object, and to share properties like scheduling
+ * options. It has not yet been adapted to Linux or FreeBSD. This define is
+ * used to mark related parts of the code to make things easier for the reader,
+ * and to compile this code out. It can be removed when someone implements it,
+ * moves it to some Illumos-specific place, or removes it entirely.
+ */
+#undef HAVE_SPA_THREAD
+
+/*
+ * The "System Duty Cycle" scheduling class is an Illumos feature to help
+ * prevent CPU-intensive kernel threads from affecting latency on interactive
+ * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is
+ * gated behind a define. On Illumos SDC depends on spa_thread(), but
+ * spa_thread() also has other uses, so this is a separate define.
+ */
+#undef HAVE_SYSDC
/*
* The interval, in seconds, at which failed configuration cache file writes
typedef enum zti_modes {
ZTI_MODE_FIXED, /* value is # of threads (min 1) */
- ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */
+ ZTI_MODE_SYNC, /* sync thread assigned */
ZTI_MODE_NULL, /* don't create a taskq */
ZTI_NMODES
} zti_modes_t;
#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
#define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
-#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
#define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 }
+#define ZTI_SYNC { ZTI_MODE_SYNC, 0, 1 }
#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }
#define ZTI_N(n) ZTI_P(n, 1)
* initializing a pool, we use this table to create an appropriately sized
* taskq. Some operations are low volume and therefore have a small, static
* number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
- * macros. Other operations process a large amount of data; the ZTI_BATCH
+ * macros. Other operations process a large amount of data; the ZTI_SCALE
* macro causes us to create a taskq oriented for throughput. Some operations
* are so high frequency and short-lived that the taskq itself can become a
* point of lock contention. The ZTI_P(#, #) macro indicates that we need an
* additional degree of parallelism specified by the number of threads per-
* taskq and the number of taskqs; when dispatching an event in this case, the
- * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH,
- * but with number of taskqs also scaling with number of CPUs.
+ * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs
+ * that scales with the number of CPUs.
*
* The different taskq priorities are to handle the different contexts (issue
* and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
{ ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */
- { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */
+ { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */
{ ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
const char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);
+/*
+ * Percentage of all CPUs that can be used by the metaslab preload taskq.
+ */
+static uint_t metaslab_preload_pct = 50;
+
static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */
static uint_t zio_taskq_batch_tpq; /* threads per taskq */
+
+#ifdef HAVE_SYSDC
static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
static const uint_t zio_taskq_basedc = 80; /* base duty cycle */
+#endif
+#ifdef HAVE_SPA_THREAD
static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
+#endif
+
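+/*
+ * Number of CPUs per write issue taskq. A value of 0 derives the taskq
+ * count from the number of allocators instead (see spa_taskqs_init()).
+ */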
+static uint_t zio_taskq_wr_iss_ncpus = 0;
/*
* Report any spa_load_verify errors found, but do not fail spa_load.
uint_t count = ztip->zti_count;
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
uint_t cpus, flags = TASKQ_DYNAMIC;
- boolean_t batch = B_FALSE;
switch (mode) {
case ZTI_MODE_FIXED:
ASSERT3U(value, >, 0);
break;
- case ZTI_MODE_BATCH:
- batch = B_TRUE;
+ case ZTI_MODE_SYNC:
+
+ /*
+ * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus',
+ * not to exceed the number of spa allocators.
+ */
+ if (zio_taskq_wr_iss_ncpus == 0) {
+ count = MAX(boot_ncpus / spa->spa_alloc_count, 1);
+ } else {
+ count = MAX(1,
+ boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus));
+ }
+ count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
+ count = MIN(count, spa->spa_alloc_count);
+
+ /*
+ * zio_taskq_batch_pct is unbounded and may exceed 100%, but no
+ * single taskq may have more threads than 100% of online cpus.
+ */
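+ /* Split the overall thread percentage evenly across the taskqs. */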
+ value = (zio_taskq_batch_pct + count / 2) / count;
+ value = MIN(value, 100);
flags |= TASKQ_THREADS_CPU_PCT;
- value = MIN(zio_taskq_batch_pct, 100);
break;
case ZTI_MODE_SCALE:
default:
panic("unrecognized mode for %s_%s taskq (%u:%u) in "
- "spa_activate()",
+ "spa_taskqs_init()",
zio_type_name[t], zio_taskq_types[q], mode, value);
break;
}
(void) snprintf(name, sizeof (name), "%s_%s",
zio_type_name[t], zio_taskq_types[q]);
+#ifdef HAVE_SYSDC
if (zio_taskq_sysdc && spa->spa_proc != &p0) {
- if (batch)
- flags |= TASKQ_DC_BATCH;
-
(void) zio_taskq_basedc;
tq = taskq_create_sysdc(name, value, 50, INT_MAX,
spa->spa_proc, zio_taskq_basedc, flags);
} else {
+#endif
pri_t pri = maxclsyspri;
/*
* The write issue taskq can be extremely CPU
}
tq = taskq_create_proc(name, value, pri, 50,
INT_MAX, spa->spa_proc, flags);
+#ifdef HAVE_SYSDC
}
+#endif
tqs->stqs_taskq[i] = tq;
}
/*
* Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
* Note that a type may have multiple discrete taskqs to avoid lock contention
- * on the taskq itself. In that case we choose which taskq at random by using
- * the low bits of gethrtime().
+ * on the taskq itself.
*/
-void
-spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
- task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
+static taskq_t *
+spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+ zio_t *zio)
{
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
taskq_t *tq;
ASSERT3P(tqs->stqs_taskq, !=, NULL);
ASSERT3U(tqs->stqs_count, !=, 0);
+ if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
+ (zio != NULL) && (zio->io_wr_iss_tq != NULL)) {
+ /* dispatch to assigned write issue taskq */
+ tq = zio->io_wr_iss_tq;
+ return (tq);
+ }
+
if (tqs->stqs_count == 1) {
tq = tqs->stqs_taskq[0];
} else {
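+ /* Pick one of the taskqs at random, using the low bits of gethrtime(). */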
tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
}
+ return (tq);
+}
+void
+spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+ task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent,
+ zio_t *zio)
+{
+ taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, zio);
taskq_dispatch_ent(tq, func, arg, flags, ent);
}
spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
task_func_t *func, void *arg, uint_t flags)
{
- spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
- taskq_t *tq;
- taskqid_t id;
-
- ASSERT3P(tqs->stqs_taskq, !=, NULL);
- ASSERT3U(tqs->stqs_count, !=, 0);
-
- if (tqs->stqs_count == 1) {
- tq = tqs->stqs_taskq[0];
- } else {
- tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
- }
-
- id = taskq_dispatch(tq, func, arg, flags);
+ taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, NULL);
+ taskqid_t id = taskq_dispatch(tq, func, arg, flags);
if (id)
taskq_wait_id(tq, id);
}
}
}
-/*
- * Disabled until spa_thread() can be adapted for Linux.
- */
-#undef HAVE_SPA_THREAD
-
#if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
static void
spa_thread(void *arg)
pool_unlock();
}
+#ifdef HAVE_SYSDC
if (zio_taskq_sysdc) {
sysdc_thread_enter(curthread, 100, 0);
}
+#endif
spa->spa_proc = curproc;
spa->spa_did = curthread->t_did;
}
#endif
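+/*
+ * Returns the metaslab_ops_t (block allocator) to use for this pool's
+ * metaslab classes.
+ */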
+extern metaslab_ops_t *metaslab_allocator(spa_t *spa);
+
/*
* Activate an uninitialized pool.
*/
static void
spa_activate(spa_t *spa, spa_mode_t mode)
{
+ metaslab_ops_t *msp = metaslab_allocator(spa);
ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
spa->spa_state = POOL_STATE_ACTIVE;
spa->spa_mode = mode;
spa->spa_read_spacemaps = spa_mode_readable_spacemaps;
- spa->spa_normal_class = metaslab_class_create(spa, &zfs_metaslab_ops);
- spa->spa_log_class = metaslab_class_create(spa, &zfs_metaslab_ops);
- spa->spa_embedded_log_class =
- metaslab_class_create(spa, &zfs_metaslab_ops);
- spa->spa_special_class = metaslab_class_create(spa, &zfs_metaslab_ops);
- spa->spa_dedup_class = metaslab_class_create(spa, &zfs_metaslab_ops);
+ spa->spa_normal_class = metaslab_class_create(spa, msp);
+ spa->spa_log_class = metaslab_class_create(spa, msp);
+ spa->spa_embedded_log_class = metaslab_class_create(spa, msp);
+ spa->spa_special_class = metaslab_class_create(spa, msp);
+ spa->spa_dedup_class = metaslab_class_create(spa, msp);
/* Try to create a covering process */
mutex_enter(&spa->spa_proc_lock);
ASSERT(spa->spa_proc == &p0);
spa->spa_did = 0;
- (void) spa_create_process;
#ifdef HAVE_SPA_THREAD
/* Only create a process if we're going to be around a while. */
if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
1, INT_MAX, 0);
+ /*
+ * The taskq to preload metaslabs.
+ */
+ spa->spa_metaslab_taskq = taskq_create("z_metaslab",
+ metaslab_preload_pct, maxclsyspri, 1, INT_MAX,
+ TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
+
/*
* Taskq dedicated to prefetcher threads: this is used to prevent the
* pool traverse code from monopolizing the global (and limited)
spa->spa_zvol_taskq = NULL;
}
+ if (spa->spa_metaslab_taskq) {
+ taskq_destroy(spa->spa_metaslab_taskq);
+ spa->spa_metaslab_taskq = NULL;
+ }
+
if (spa->spa_prefetch_taskq) {
taskq_destroy(spa->spa_prefetch_taskq);
spa->spa_prefetch_taskq = NULL;
{
void *cookie = NULL;
spa_log_sm_t *sls;
+ log_summary_entry_t *e;
+
while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
&cookie)) != NULL) {
VERIFY0(sls->sls_mscount);
kmem_free(sls, sizeof (spa_log_sm_t));
}
- for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
- e != NULL; e = list_head(&spa->spa_log_summary)) {
+ while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) {
VERIFY0(e->lse_mscount);
- list_remove(&spa->spa_log_summary, e);
kmem_free(e, sizeof (log_summary_entry_t));
}
zthr_destroy(spa->spa_livelist_condense_zthr);
spa->spa_livelist_condense_zthr = NULL;
}
+ if (spa->spa_raidz_expand_zthr != NULL) {
+ zthr_destroy(spa->spa_raidz_expand_zthr);
+ spa->spa_raidz_expand_zthr = NULL;
+ }
}
/*
* This ensures that there is no async metaslab prefetching
* while we attempt to unload the spa.
*/
- if (spa->spa_root_vdev != NULL) {
- for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
- vdev_t *vc = spa->spa_root_vdev->vdev_child[c];
- if (vc->vdev_mg != NULL)
- taskq_wait(vc->vdev_mg->mg_taskq);
- }
- }
+ taskq_wait(spa->spa_metaslab_taskq);
if (spa->spa_mmp.mmp_thread)
mmp_thread_stop(spa);
spa->spa_compatibility = NULL;
}
+ spa->spa_raidz_expand = NULL;
+
spa_config_exit(spa, SCL_ALL, spa);
}
* When damaged consider it to be a metadata error since we cannot
* trust the BP_GET_TYPE and BP_GET_LEVEL values.
*/
- if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) {
+ if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
atomic_inc_64(&sle->sle_meta_count);
return (0);
}
ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ spa_start_raidz_expansion_thread(spa);
spa_start_indirect_condensing_thread(spa);
spa_start_livelist_destroy_thread(spa);
spa_start_livelist_condensing_thread(spa);
spa->spa_load_state = state;
(void) spa_import_progress_set_state(spa_guid(spa),
spa_load_state(spa));
+ spa_import_progress_set_notes(spa, "spa_load()");
gethrestime(&spa->spa_loaded_ts);
error = spa_load_impl(spa, type, &ereport);
uint64_t mmp_config = ub->ub_mmp_config;
uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
uint64_t import_delay;
- hrtime_t import_expire;
+ hrtime_t import_expire, now;
nvlist_t *mmp_label = NULL;
vdev_t *rvd = spa->spa_root_vdev;
kcondvar_t cv;
import_expire = gethrtime() + import_delay;
- while (gethrtime() < import_expire) {
+ spa_import_progress_set_notes(spa, "Checking MMP activity, waiting "
+ "%llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
+
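+ /*
+ * Refresh the import progress note periodically while we wait out the
+ * MMP activity check, rather than on every loop iteration.
+ */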
+ int iterations = 0;
+ while ((now = gethrtime()) < import_expire) {
+ if (iterations++ % 30 == 0) {
+ spa_import_progress_set_notes(spa, "Checking MMP "
+ "activity, %llu ms remaining",
+ (u_longlong_t)NSEC2MSEC(import_expire - now));
+ }
+
(void) spa_import_progress_set_mmp_check(spa_guid(spa),
NSEC2SEC(import_expire - gethrtime()));
}
spa_load_note(spa, "using uberblock with txg=%llu",
(u_longlong_t)ub->ub_txg);
+ if (ub->ub_raidz_reflow_info != 0) {
+ spa_load_note(spa, "uberblock raidz_reflow_info: "
+ "state=%u offset=%llu",
+ (int)RRSS_GET_STATE(ub),
+ (u_longlong_t)RRSS_GET_OFFSET(ub));
+ }
/*
rvd = mrvd;
spa_config_exit(spa, SCL_ALL, FTAG);
+ /*
+ * If 'zpool import' used a cached config, then the on-disk hostid and
+ * hostname may be different to the cached config in ways that should
+ * prevent import. Userspace can't discover this without a scan, but
+ * we know, so we add these values to LOAD_INFO so the caller can know
+ * the difference.
+ *
+ * Note that we have to do this before the config is regenerated,
+ * because the new config will have the hostid and hostname for this
+ * host, in readiness for import.
+ */
+ if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID))
+ fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID,
+ fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID));
+ if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME))
+ fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME,
+ fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME));
+
/*
* We will use spa_config if we decide to reload the spa or if spa_load
* fails and we rewind. We must thus regenerate the config using the
/*
* Retrieve the checkpoint txg if the pool has a checkpoint.
*/
+ spa_import_progress_set_notes(spa, "Loading checkpoint txg");
error = spa_ld_read_checkpoint_txg(spa);
if (error != 0)
return (error);
* initiated. Otherwise we could be reading from indirect vdevs before
* we have loaded their mappings.
*/
+ spa_import_progress_set_notes(spa, "Loading indirect vdev metadata");
error = spa_ld_open_indirect_vdev_metadata(spa);
if (error != 0)
return (error);
* Retrieve the full list of active features from the MOS and check if
* they are all supported.
*/
+ spa_import_progress_set_notes(spa, "Checking feature flags");
error = spa_ld_check_features(spa, &missing_feat_write);
if (error != 0)
return (error);
* Load several special directories from the MOS needed by the dsl_pool
* layer.
*/
+ spa_import_progress_set_notes(spa, "Loading special MOS directories");
error = spa_ld_load_special_directories(spa);
if (error != 0)
return (error);
/*
* Retrieve pool properties from the MOS.
*/
+ spa_import_progress_set_notes(spa, "Loading properties");
error = spa_ld_get_props(spa);
if (error != 0)
return (error);
* Retrieve the list of auxiliary devices - cache devices and spares -
* and open them.
*/
+ spa_import_progress_set_notes(spa, "Loading AUX vdevs");
error = spa_ld_open_aux_vdevs(spa, type);
if (error != 0)
return (error);
* Load the metadata for all vdevs. Also check if unopenable devices
* should be autoreplaced.
*/
+ spa_import_progress_set_notes(spa, "Loading vdev metadata");
error = spa_ld_load_vdev_metadata(spa);
if (error != 0)
return (error);
+ spa_import_progress_set_notes(spa, "Loading dedup tables");
error = spa_ld_load_dedup_tables(spa);
if (error != 0)
return (error);
+ spa_import_progress_set_notes(spa, "Loading BRT");
error = spa_ld_load_brt(spa);
if (error != 0)
return (error);
* Verify the logs now to make sure we don't have any unexpected errors
* when we claim log blocks later.
*/
+ spa_import_progress_set_notes(spa, "Verifying Log Devices");
error = spa_ld_verify_logs(spa, type, ereport);
if (error != 0)
return (error);
* state. When performing an extreme rewind, we verify the whole pool,
* which can take a very long time.
*/
+ spa_import_progress_set_notes(spa, "Verifying pool data");
error = spa_ld_verify_pool_data(spa);
if (error != 0)
return (error);
* we write anything to the pool because we'd need to update the space
* accounting using the deflated sizes.
*/
+ spa_import_progress_set_notes(spa, "Calculating deflated space");
spa_update_dspace(spa);
/*
* pool. If we are importing the pool in read-write mode, a few
* additional steps must be performed to finish the import.
*/
+ spa_import_progress_set_notes(spa, "Starting import");
if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
spa->spa_load_max_txg == UINT64_MAX)) {
uint64_t config_cache_txg = spa->spa_config_txg;
ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
+ /*
+ * Before we do any zio_write's, complete the raidz expansion
+ * scratch space copying, if necessary.
+ */
+ if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID)
+ vdev_raidz_reflow_copy_scratch(spa);
+
/*
* In case of a checkpoint rewind, log the original txg
* of the checkpointed uberblock.
(u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
}
+ spa_import_progress_set_notes(spa, "Claiming ZIL blocks");
/*
* Traverse the ZIL and claim all blocks.
*/
* will have been set for us by ZIL traversal operations
* performed above.
*/
+ spa_import_progress_set_notes(spa, "Syncing ZIL claims");
txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
/*
* next sync, we would update the config stored in vdev labels
* and the cachefile (by default /etc/zfs/zpool.cache).
*/
+ spa_import_progress_set_notes(spa, "Updating configs");
spa_ld_check_for_config_update(spa, config_cache_txg,
update_config_cache);
* Then check all DTLs to see if anything needs resilvering.
* The resilver will be deferred if a rebuild was started.
*/
+ spa_import_progress_set_notes(spa, "Starting resilvers");
if (vdev_rebuild_active(spa->spa_root_vdev)) {
vdev_rebuild_restart(spa);
} else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
*/
spa_history_log_version(spa, "open", NULL);
+ spa_import_progress_set_notes(spa,
+ "Restarting device removals");
spa_restart_removal(spa);
spa_spawn_aux_threads(spa);
* auxiliary threads above (from which the livelist
* deletion zthr is part of).
*/
+ spa_import_progress_set_notes(spa,
+ "Cleaning up inconsistent objsets");
(void) dmu_objset_find(spa_name(spa),
dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
/*
* Clean up any stale temporary dataset userrefs.
*/
+ spa_import_progress_set_notes(spa,
+ "Cleaning up temporary userrefs");
dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ spa_import_progress_set_notes(spa, "Restarting initialize");
vdev_initialize_restart(spa->spa_root_vdev);
+ spa_import_progress_set_notes(spa, "Restarting TRIM");
vdev_trim_restart(spa->spa_root_vdev);
vdev_autotrim_restart(spa);
spa_config_exit(spa, SCL_CONFIG, FTAG);
+ spa_import_progress_set_notes(spa, "Finished importing");
}
spa_import_progress_remove(spa_guid(spa));
}
/*
- * Attach a device to a mirror. The arguments are the path to any device
- * in the mirror, and the nvroot for the new device. If the path specifies
- * a device that is not mirrored, we automatically insert the mirror vdev.
+ * Attach a device to a vdev specified by its guid. The vdev type can be
+ * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a
+ * single device). When the vdev is a single device, a mirror vdev will be
+ * automatically inserted.
*
* If 'replacing' is specified, the new device is intended to replace the
* existing device; in this case the two devices are made into their own
vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
vdev_ops_t *pvops;
char *oldvdpath, *newvdpath;
- int newvd_isspare;
+ int newvd_isspare = B_FALSE;
int error;
ASSERT(spa_writeable(spa));
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
- if (dsl_scan_resilvering(spa_get_dsl(spa)))
+ if (dsl_scan_resilvering(spa_get_dsl(spa)) ||
+ dsl_scan_resilver_scheduled(spa_get_dsl(spa))) {
return (spa_vdev_exit(spa, NULL, txg,
ZFS_ERR_RESILVER_IN_PROGRESS));
+ }
} else {
if (vdev_rebuild_active(rvd))
return (spa_vdev_exit(spa, NULL, txg,
ZFS_ERR_REBUILD_IN_PROGRESS));
}
- if (spa->spa_vdev_removal != NULL)
- return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+ if (spa->spa_vdev_removal != NULL) {
+ return (spa_vdev_exit(spa, NULL, txg,
+ ZFS_ERR_DEVRM_IN_PROGRESS));
+ }
if (oldvd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
- if (!oldvd->vdev_ops->vdev_op_leaf)
+ boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops;
+
+ if (raidz) {
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION))
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ /*
+ * Can't expand a raidz while prior expand is in progress.
+ */
+ if (spa->spa_raidz_expand != NULL) {
+ return (spa_vdev_exit(spa, NULL, txg,
+ ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS));
+ }
+ } else if (!oldvd->vdev_ops->vdev_op_leaf) {
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+ }
- pvd = oldvd->vdev_parent;
+ if (raidz)
+ pvd = oldvd;
+ else
+ pvd = oldvd->vdev_parent;
if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
VDEV_ALLOC_ATTACH) != 0)
if (!replacing) {
/*
- * For attach, the only allowable parent is a mirror or the root
- * vdev.
+ * For attach, the only allowable parent is a mirror or
+ * the root vdev. A raidz vdev can be attached to, but
+ * you cannot attach to a raidz child.
*/
if (pvd->vdev_ops != &vdev_mirror_ops &&
- pvd->vdev_ops != &vdev_root_ops)
+ pvd->vdev_ops != &vdev_root_ops &&
+ !raidz)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
pvops = &vdev_mirror_ops;
/*
* Make sure the new device is big enough.
*/
- if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
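+ /* For raidz, the new device need only match a single existing child. */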
+ vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd;
+ if (newvd->vdev_asize < vdev_get_min_asize(min_vdev))
return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
/*
if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ /*
+ * RAIDZ-expansion-specific checks.
+ */
+ if (raidz) {
+ if (vdev_raidz_attach_check(newvd) != 0)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ /*
+ * Fail early if a child is not healthy or is being replaced.
+ */
+ for (int i = 0; i < oldvd->vdev_children; i++) {
+ if (vdev_is_dead(oldvd->vdev_child[i]) ||
+ !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) {
+ return (spa_vdev_exit(spa, newrootvd, txg,
+ ENXIO));
+ }
+ /* Also fail if reserved boot area is in-use */
+ if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i])
+ != 0) {
+ return (spa_vdev_exit(spa, newrootvd, txg,
+ EADDRINUSE));
+ }
+ }
+ }
+
+ if (raidz) {
+ /*
+ * Note: oldvdpath is freed by spa_strfree(), but the string
+ * returned by kmem_asprintf() must be freed by kmem_strfree(),
+ * so we move it to a spa_strdup-ed string first.
+ */
+ char *tmp = kmem_asprintf("raidz%u-%u",
+ (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id);
+ oldvdpath = spa_strdup(tmp);
+ kmem_strfree(tmp);
+ } else {
+ oldvdpath = spa_strdup(oldvd->vdev_path);
+ }
+ newvdpath = spa_strdup(newvd->vdev_path);
+
/*
* If this is an in-place replacement, update oldvd's path and devid
* to make it distinguishable from newvd, and unopenable from now on.
*/
- if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
+ if (strcmp(oldvdpath, newvdpath) == 0) {
spa_strfree(oldvd->vdev_path);
- oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
+ oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5,
KM_SLEEP);
- (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5,
- "%s/%s", newvd->vdev_path, "old");
+ (void) snprintf(oldvd->vdev_path, strlen(newvdpath) + 5,
+ "%s/old", newvdpath);
if (oldvd->vdev_devid != NULL) {
spa_strfree(oldvd->vdev_devid);
oldvd->vdev_devid = NULL;
}
+ spa_strfree(oldvdpath);
+ oldvdpath = spa_strdup(oldvd->vdev_path);
}
/*
* If the parent is not a mirror, or if we're replacing, insert the new
* mirror/replacing/spare vdev above oldvd.
*/
- if (pvd->vdev_ops != pvops)
+ if (!raidz && pvd->vdev_ops != pvops) {
pvd = vdev_add_parent(oldvd, pvops);
+ ASSERT(pvd->vdev_ops == pvops);
+ ASSERT(oldvd->vdev_parent == pvd);
+ }
ASSERT(pvd->vdev_top->vdev_parent == rvd);
- ASSERT(pvd->vdev_ops == pvops);
- ASSERT(oldvd->vdev_parent == pvd);
/*
* Extract the new device from its root and add it to pvd.
*/
dtl_max_txg = txg + TXG_CONCURRENT_STATES;
- vdev_dtl_dirty(newvd, DTL_MISSING,
- TXG_INITIAL, dtl_max_txg - TXG_INITIAL);
+ if (raidz) {
+ /*
+ * Wait for the youngest allocations and frees to sync,
+ * and then wait for the deferral of those frees to finish.
+ */
+ spa_vdev_config_exit(spa, NULL,
+ txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
- if (newvd->vdev_isspare) {
- spa_spare_activate(newvd);
- spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
- }
+ vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE);
+ vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE);
+ vdev_autotrim_stop_wait(tvd);
- oldvdpath = spa_strdup(oldvd->vdev_path);
- newvdpath = spa_strdup(newvd->vdev_path);
- newvd_isspare = newvd->vdev_isspare;
+ dtl_max_txg = spa_vdev_config_enter(spa);
- /*
- * Mark newvd's DTL dirty in this txg.
- */
- vdev_dirty(tvd, VDD_DTL, newvd, txg);
+ tvd->vdev_rz_expanding = B_TRUE;
- /*
- * Schedule the resilver or rebuild to restart in the future. We do
- * this to ensure that dmu_sync-ed blocks have been stitched into the
- * respective datasets.
- */
- if (rebuild) {
- newvd->vdev_rebuild_txg = txg;
+ vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg);
+ vdev_config_dirty(tvd);
- vdev_rebuild(tvd);
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
+ dtl_max_txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync,
+ newvd, tx);
+ dmu_tx_commit(tx);
} else {
- newvd->vdev_resilver_txg = txg;
+ vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
+ dtl_max_txg - TXG_INITIAL);
- if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
- spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
- vdev_defer_resilver(newvd);
+ if (newvd->vdev_isspare) {
+ spa_spare_activate(newvd);
+ spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
+ }
+
+ newvd_isspare = newvd->vdev_isspare;
+
+ /*
+ * Mark newvd's DTL dirty in this txg.
+ */
+ vdev_dirty(tvd, VDD_DTL, newvd, txg);
+
+ /*
+ * Schedule the resilver or rebuild to restart in the future.
+ * We do this to ensure that dmu_sync-ed blocks have been
+ * stitched into the respective datasets.
+ */
+ if (rebuild) {
+ newvd->vdev_rebuild_txg = txg;
+
+ vdev_rebuild(tvd);
} else {
- dsl_scan_restart_resilver(spa->spa_dsl_pool,
- dtl_max_txg);
+ newvd->vdev_resilver_txg = txg;
+
+ if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
+ spa_feature_is_enabled(spa,
+ SPA_FEATURE_RESILVER_DEFER)) {
+ vdev_defer_resilver(newvd);
+ } else {
+ dsl_scan_restart_resilver(spa->spa_dsl_pool,
+ dtl_max_txg);
+ }
}
}
*/
if (cmd_type == POOL_INITIALIZE_START &&
(vd->vdev_initialize_thread != NULL ||
- vd->vdev_top->vdev_removing)) {
+ vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) {
mutex_exit(&vd->vdev_initialize_lock);
return (SET_ERROR(EBUSY));
} else if (cmd_type == POOL_INITIALIZE_CANCEL &&
vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
mutex_exit(&vd->vdev_initialize_lock);
return (SET_ERROR(ESRCH));
+ } else if (cmd_type == POOL_INITIALIZE_UNINIT &&
+ vd->vdev_initialize_thread != NULL) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ return (SET_ERROR(EBUSY));
}
switch (cmd_type) {
case POOL_INITIALIZE_SUSPEND:
vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);
break;
+ case POOL_INITIALIZE_UNINIT:
+ vdev_uninitialize(vd);
+ break;
default:
panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
}
* which has completed but the thread is not exited.
*/
if (cmd_type == POOL_TRIM_START &&
- (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) {
+ (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing ||
+ vd->vdev_top->vdev_rz_expanding)) {
mutex_exit(&vd->vdev_trim_lock);
return (SET_ERROR(EBUSY));
} else if (cmd_type == POOL_TRIM_CANCEL &&
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
if (dsl_scan_resilvering(spa->spa_dsl_pool))
return (SET_ERROR(EBUSY));
+
return (dsl_scan_cancel(spa->spa_dsl_pool));
}
return (0);
}
+ if (func == POOL_SCAN_ERRORSCRUB &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG))
+ return (SET_ERROR(ENOTSUP));
+
return (dsl_scan(spa->spa_dsl_pool, func));
}
if (condense_thread != NULL)
zthr_cancel(condense_thread);
+ zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr;
+ if (raidz_expand_thread != NULL)
+ zthr_cancel(raidz_expand_thread);
+
zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
if (discard_thread != NULL)
zthr_cancel(discard_thread);
if (condense_thread != NULL)
zthr_resume(condense_thread);
+ zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr;
+ if (raidz_expand_thread != NULL)
+ zthr_resume(raidz_expand_thread);
+
zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
if (discard_thread != NULL)
zthr_resume(discard_thread);
}
/* normalize the property name */
- propname = zpool_prop_to_name(prop);
- proptype = zpool_prop_get_type(prop);
- if (prop == ZPOOL_PROP_INVAL &&
- zfs_prop_user(elemname)) {
+ if (prop == ZPOOL_PROP_INVAL) {
propname = elemname;
proptype = PROP_TYPE_STRING;
+ } else {
+ propname = zpool_prop_to_name(prop);
+ proptype = zpool_prop_get_type(prop);
}
if (nvpair_type(elem) == DATA_TYPE_STRING) {
brt_sync(spa, txg);
ddt_sync(spa, txg);
dsl_scan_sync(dp, tx);
+ dsl_errorscrub_sync(dp, tx);
svr_sync(spa, tx);
spa_sync_upgrades(spa, tx);
!= NULL)
vdev_sync(vd, txg);
+ if (pass == 1) {
+ /*
+ * dsl_pool_sync() -> dp_sync_tasks may have dirtied
+ * the config. If that happens, this txg should not
+ * be a no-op. So we must sync the config to the MOS
+ * before checking for no-op.
+ *
+ * Note that when the config is dirty, it will
+ * be written to the MOS (i.e. the MOS will be
+ * dirtied) every time we call spa_sync_config_object()
+ * in this txg. Therefore we can't call this after
+ * dsl_pool_sync() every pass, because it would
+ * prevent us from converging, since we'd dirty
+ * the MOS every pass.
+ *
+ * Sync tasks can only be processed in pass 1, so
+ * there's no need to do this in later passes.
+ */
+ spa_sync_config_object(spa, tx);
+ }
+
/*
* Note: We need to check if the MOS is dirty because we could
* have marked the MOS dirty without updating the uberblock
mutex_exit(&spa_namespace_lock);
}
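+/*
+ * Create the pool sync taskq and record its worker threads, so that a zio
+ * issued from a given sync thread can later be dispatched to that thread's
+ * assigned write issue taskq (see spa_select_allocator()).
+ */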
+taskq_t *
+spa_sync_tq_create(spa_t *spa, const char *name)
+{
+ kthread_t **kthreads;
+
+ ASSERT(spa->spa_sync_tq == NULL);
+ ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus);
+
+ /*
+ * - do not allow more allocators than cpus.
+ * - there may be more cpus than allocators.
+ * - do not allow more sync taskq threads than allocators or cpus.
+ */
+ int nthreads = spa->spa_alloc_count;
+ spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) *
+ nthreads, KM_SLEEP);
+
+ spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri,
+ nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads);
+ VERIFY(spa->spa_sync_tq != NULL);
+ VERIFY(kthreads != NULL);
+
+ spa_taskqs_t *tqs =
+ &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE];
+
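+ /*
+ * Bind each sync thread to a write issue taskq, cycling round-robin
+ * through the taskqs if there are more sync threads than taskqs.
+ */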
+ spa_syncthread_info_t *ti = spa->spa_syncthreads;
+ for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) {
+ ti->sti_thread = kthreads[i];
+ if (w == tqs->stqs_count) {
+ w = 0;
+ }
+ ti->sti_wr_iss_tq = tqs->stqs_taskq[w];
+ }
+
+ kmem_free(kthreads, sizeof (*kthreads) * nthreads);
+ return (spa->spa_sync_tq);
+}
+
+void
+spa_sync_tq_destroy(spa_t *spa)
+{
+ ASSERT(spa->spa_sync_tq != NULL);
+
+ taskq_wait(spa->spa_sync_tq);
+ taskq_destroy(spa->spa_sync_tq);
+ kmem_free(spa->spa_syncthreads,
+ sizeof (spa_syncthread_info_t) * spa->spa_alloc_count);
+ spa->spa_sync_tq = NULL;
+}
+
+void
+spa_select_allocator(zio_t *zio)
+{
+ zbookmark_phys_t *bm = &zio->io_bookmark;
+ spa_t *spa = zio->io_spa;
+
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+ /*
+ * A gang block (for example) may have inherited its parent's
+ * allocator, in which case there is nothing further to do here.
+ */
+ if (ZIO_HAS_ALLOCATOR(zio))
+ return;
+
+ ASSERT(spa != NULL);
+ ASSERT(bm != NULL);
+
+ /*
+ * First try to use an allocator assigned to the syncthread, and set
+ * the corresponding write issue taskq for the allocator.
+ * Note, we must have an open pool to do this.
+ */
+ if (spa->spa_sync_tq != NULL) {
+ spa_syncthread_info_t *ti = spa->spa_syncthreads;
+ for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
+ if (ti->sti_thread == curthread) {
+ zio->io_allocator = i;
+ zio->io_wr_iss_tq = ti->sti_wr_iss_tq;
+ return;
+ }
+ }
+ }
+
+ /*
+ * We want to try to use as many allocators as possible to help improve
+ * performance, but we also want logically adjacent IOs to be physically
+ * adjacent to improve sequential read performance. We chunk each object
+ * into 2^20 block regions, and then hash based on the objset, object,
+ * level, and region to accomplish both of these goals.
+ */
+ uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level,
+ bm->zb_blkid >> 20);
+
+ zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
+ zio->io_wr_iss_tq = NULL;
+}
+
/*
* ==========================================================================
* Miscellaneous routines
DSS_SCANNING);
break;
case ZPOOL_WAIT_RESILVER:
- if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev)))
+ *in_progress = vdev_rebuild_active(spa->spa_root_vdev);
+ if (*in_progress)
break;
zfs_fallthrough;
case ZPOOL_WAIT_SCRUB:
is_scrub == (activity == ZPOOL_WAIT_SCRUB));
break;
}
+ case ZPOOL_WAIT_RAIDZ_EXPAND:
+ {
+ vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
+ *in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING);
+ break;
+ }
default:
panic("unrecognized value for activity %d", activity);
}
/* asynchronous event notification */
EXPORT_SYMBOL(spa_event_notify);
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW,
+ "Percentage of CPUs to run a metaslab preload taskq");
+
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
"log2 fraction of arc that can be used by inflight I/Os when "
"Whether extra ALLOC blkptrs were added to a livelist entry while it "
"was being condensed");
/* END CSTYLED */
+
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW,
+ "Number of CPUs to run write issue taskqs");