spa: make read/write queues configurable
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 95b38f09825bc43ef0f4e90b9d3a9ec11606b935..a21b0decf6a3ccff565663965af155b49a65082c 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -33,6 +33,7 @@
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
+ * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
  */
 
 /*
@@ -62,6 +63,7 @@
 #include <sys/vdev_rebuild.h>
 #include <sys/vdev_trim.h>
 #include <sys/vdev_disk.h>
+#include <sys/vdev_raidz.h>
 #include <sys/vdev_draid.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
+#include <cityhash.h>
+
+/*
+ * spa_thread() existed on Illumos as a parent thread for the various worker
+ * threads that actually run the pool, as a way to both reference the entire
+ * pool work as a single object, and to share properties like scheduling
+ * options. It has not yet been adapted to Linux or FreeBSD. This define is
+ * used to mark related parts of the code to make things easier for the reader,
+ * and to compile this code out. It can be removed when someone implements it,
+ * moves it to some Illumos-specific place, or removes it entirely.
+ */
+#undef HAVE_SPA_THREAD
+
+/*
+ * The "System Duty Cycle" scheduling class is an Illumos feature to help
+ * prevent CPU-intensive kernel threads from affecting latency on interactive
+ * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is
+ * gated behind a define. On Illumos SDC depends on spa_thread(), but
+ * spa_thread() also has other uses, so this is a separate define.
+ */
+#undef HAVE_SYSDC
 
 /*
  * The interval, in seconds, at which failed configuration cache file writes
@@ -107,16 +130,16 @@ int zfs_ccw_retry_interval = 300;
 
 typedef enum zti_modes {
        ZTI_MODE_FIXED,                 /* value is # of threads (min 1) */
-       ZTI_MODE_BATCH,                 /* cpu-intensive; value is ignored */
        ZTI_MODE_SCALE,                 /* Taskqs scale with CPUs. */
+       ZTI_MODE_SYNC,                  /* sync thread assigned */
        ZTI_MODE_NULL,                  /* don't create a taskq */
        ZTI_NMODES
 } zti_modes_t;
 
 #define        ZTI_P(n, q)     { ZTI_MODE_FIXED, (n), (q) }
 #define        ZTI_PCT(n)      { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
-#define        ZTI_BATCH       { ZTI_MODE_BATCH, 0, 1 }
 #define        ZTI_SCALE       { ZTI_MODE_SCALE, 0, 1 }
+#define        ZTI_SYNC        { ZTI_MODE_SYNC, 0, 1 }
 #define        ZTI_NULL        { ZTI_MODE_NULL, 0, 0 }
 
 #define        ZTI_N(n)        ZTI_P(n, 1)
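As an illustration (hypothetical values, not defaults from this change): ZTI_P(T, Q) describes a fixed row entry of Q taskqs with T threads each, so

    /* hypothetical: two taskqs of eight threads each for one row entry */
    ZTI_P(8, 2)     /* expands to { ZTI_MODE_FIXED, 8, 2 } */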
@@ -137,24 +160,24 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
  * initializing a pool, we use this table to create an appropriately sized
  * taskq. Some operations are low volume and therefore have a small, static
  * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
- * macros. Other operations process a large amount of data; the ZTI_BATCH
+ * macros. Other operations process a large amount of data; the ZTI_SCALE
  * macro causes us to create a taskq oriented for throughput. Some operations
  * are so high frequency and short-lived that the taskq itself can become a
  * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
  * additional degree of parallelism specified by the number of threads per-
  * taskq and the number of taskqs; when dispatching an event in this case, the
- * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH,
- * but with number of taskqs also scaling with number of CPUs.
+ * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs
+ * that scales with the number of CPUs.
  *
  * The different taskq priorities are to handle the different contexts (issue
  * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
  * need to be handled with minimum delay.
  */
-static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
+static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
        /* ISSUE        ISSUE_HIGH      INTR            INTR_HIGH */
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* NULL */
        { ZTI_N(8),     ZTI_NULL,       ZTI_SCALE,      ZTI_NULL }, /* READ */
-       { ZTI_BATCH,    ZTI_N(5),       ZTI_SCALE,      ZTI_N(5) }, /* WRITE */
+       { ZTI_SYNC,     ZTI_N(5),       ZTI_SCALE,      ZTI_N(5) }, /* WRITE */
        { ZTI_SCALE,    ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* FREE */
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* CLAIM */
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* IOCTL */
@@ -168,12 +191,24 @@ static int spa_load_impl(spa_t *spa, spa_import_type_t type,
     const char **ereport);
 static void spa_vdev_resilver_done(spa_t *spa);
 
+/*
+ * Percentage of all CPUs that can be used by the metaslab preload taskq.
+ */
+static uint_t metaslab_preload_pct = 50;
+
 static uint_t  zio_taskq_batch_pct = 80;         /* 1 thread per cpu in pset */
 static uint_t  zio_taskq_batch_tpq;              /* threads per taskq */
+
+#ifdef HAVE_SYSDC
 static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
 static const uint_t    zio_taskq_basedc = 80;    /* base duty cycle */
+#endif
 
+#ifdef HAVE_SPA_THREAD
 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
+#endif
+
+static uint_t  zio_taskq_wr_iss_ncpus = 0;
 
 /*
  * Report any spa_load_verify errors found, but do not fail spa_load.
@@ -296,6 +331,22 @@ spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval,
        nvlist_free(propval);
 }
 
+/*
+ * Add a user property (source=src, propname=propval) to an nvlist.
+ */
+static void
+spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval,
+    zprop_source_t src)
+{
+       nvlist_t *propval;
+
+       VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+       VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
+       VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
+       VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
+       nvlist_free(propval);
+}
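A minimal usage sketch (the property name and value below are hypothetical, not taken from this change): a caller filling the props nvlist passes a writable value string and a local source, e.g.

    char val[] = "backup-host-a";   /* hypothetical value; strval is non-const */
    spa_prop_add_user(*nvp, "org.example:owner", val, ZPROP_SRC_LOCAL);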
+
 /*
  * Get property values from the spa configuration.
  */
@@ -471,7 +522,8 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
                zprop_source_t src = ZPROP_SRC_DEFAULT;
                zpool_prop_t prop;
 
-               if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
+               if ((prop = zpool_name_to_prop(za.za_name)) ==
+                   ZPOOL_PROP_INVAL && !zfs_prop_user(za.za_name))
                        continue;
 
                switch (za.za_integer_length) {
@@ -514,7 +566,13 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
                                kmem_free(strval, za.za_num_integers);
                                break;
                        }
-                       spa_prop_add_list(*nvp, prop, strval, 0, src);
+                       if (prop != ZPOOL_PROP_INVAL) {
+                               spa_prop_add_list(*nvp, prop, strval, 0, src);
+                       } else {
+                               src = ZPROP_SRC_LOCAL;
+                               spa_prop_add_user(*nvp, za.za_name, strval,
+                                   src);
+                       }
                        kmem_free(strval, za.za_num_integers);
                        break;
 
@@ -556,36 +614,47 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
 
                switch (prop) {
                case ZPOOL_PROP_INVAL:
-                       if (!zpool_prop_feature(propname)) {
-                               error = SET_ERROR(EINVAL);
-                               break;
-                       }
-
                        /*
                         * Sanitize the input.
                         */
-                       if (nvpair_type(elem) != DATA_TYPE_UINT64) {
-                               error = SET_ERROR(EINVAL);
-                               break;
-                       }
+                       if (zfs_prop_user(propname)) {
+                               if (strlen(propname) >= ZAP_MAXNAMELEN) {
+                                       error = SET_ERROR(ENAMETOOLONG);
+                                       break;
+                               }
 
-                       if (nvpair_value_uint64(elem, &intval) != 0) {
-                               error = SET_ERROR(EINVAL);
-                               break;
-                       }
+                               if (strlen(fnvpair_value_string(elem)) >=
+                                   ZAP_MAXVALUELEN) {
+                                       error = SET_ERROR(E2BIG);
+                                       break;
+                               }
+                       } else if (zpool_prop_feature(propname)) {
+                               if (nvpair_type(elem) != DATA_TYPE_UINT64) {
+                                       error = SET_ERROR(EINVAL);
+                                       break;
+                               }
 
-                       if (intval != 0) {
-                               error = SET_ERROR(EINVAL);
-                               break;
-                       }
+                               if (nvpair_value_uint64(elem, &intval) != 0) {
+                                       error = SET_ERROR(EINVAL);
+                                       break;
+                               }
 
-                       fname = strchr(propname, '@') + 1;
-                       if (zfeature_lookup_name(fname, NULL) != 0) {
+                               if (intval != 0) {
+                                       error = SET_ERROR(EINVAL);
+                                       break;
+                               }
+
+                               fname = strchr(propname, '@') + 1;
+                               if (zfeature_lookup_name(fname, NULL) != 0) {
+                                       error = SET_ERROR(EINVAL);
+                                       break;
+                               }
+
+                               has_feature = B_TRUE;
+                       } else {
                                error = SET_ERROR(EINVAL);
                                break;
                        }
-
-                       has_feature = B_TRUE;
                        break;
 
                case ZPOOL_PROP_VERSION:
@@ -792,6 +861,12 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp)
                    prop == ZPOOL_PROP_READONLY)
                        continue;
 
+               if (prop == ZPOOL_PROP_INVAL &&
+                   zfs_prop_user(nvpair_name(elem))) {
+                       need_sync = B_TRUE;
+                       break;
+               }
+
                if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
                        uint64_t ver = 0;
 
@@ -983,17 +1058,34 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
        uint_t count = ztip->zti_count;
        spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
        uint_t cpus, flags = TASKQ_DYNAMIC;
-       boolean_t batch = B_FALSE;
 
        switch (mode) {
        case ZTI_MODE_FIXED:
                ASSERT3U(value, >, 0);
                break;
 
-       case ZTI_MODE_BATCH:
-               batch = B_TRUE;
+       case ZTI_MODE_SYNC:
+
+               /*
+                * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus',
+                * not to exceed the number of spa allocators.
+                */
+               if (zio_taskq_wr_iss_ncpus == 0) {
+                       count = MAX(boot_ncpus / spa->spa_alloc_count, 1);
+               } else {
+                       count = MAX(1,
+                           boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus));
+               }
+               count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
+               count = MIN(count, spa->spa_alloc_count);
+
+               /*
+                * zio_taskq_batch_pct is unbounded and may exceed 100%, but no
+                * single taskq may have more threads than 100% of online cpus.
+                */
+               value = (zio_taskq_batch_pct + count / 2) / count;
+               value = MIN(value, 100);
                flags |= TASKQ_THREADS_CPU_PCT;
-               value = MIN(zio_taskq_batch_pct, 100);
                break;
 
        case ZTI_MODE_SCALE:
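A worked example of the ZTI_MODE_SYNC sizing above, with assumed values boot_ncpus = 32, spa_alloc_count = 4, zio_taskq_batch_pct = 80 and zio_taskq_wr_iss_ncpus = 0:

    count = MAX(32 / 4, 1)            = 8
    count = MAX(8, (80 + 99) / 100)   = 8
    count = MIN(8, 4)                 = 4    /* no more taskqs than allocators */
    value = (80 + 4 / 2) / 4          = 20   /* percent of online CPUs per taskq */

so the WRITE/ISSUE row gets four sync taskqs, each capped at 20% of online CPUs, roughly 80% in total.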
@@ -1040,7 +1132,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 
        default:
                panic("unrecognized mode for %s_%s taskq (%u:%u) in "
-                   "spa_activate()",
+                   "spa_taskqs_init()",
                    zio_type_name[t], zio_taskq_types[q], mode, value);
                break;
        }
@@ -1060,14 +1152,13 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
                        (void) snprintf(name, sizeof (name), "%s_%s",
                            zio_type_name[t], zio_taskq_types[q]);
 
+#ifdef HAVE_SYSDC
                if (zio_taskq_sysdc && spa->spa_proc != &p0) {
-                       if (batch)
-                               flags |= TASKQ_DC_BATCH;
-
                        (void) zio_taskq_basedc;
                        tq = taskq_create_sysdc(name, value, 50, INT_MAX,
                            spa->spa_proc, zio_taskq_basedc, flags);
                } else {
+#endif
                        pri_t pri = maxclsyspri;
                        /*
                         * The write issue taskq can be extremely CPU
@@ -1093,7 +1184,9 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
                        }
                        tq = taskq_create_proc(name, value, pri, 50,
                            INT_MAX, spa->spa_proc, flags);
+#ifdef HAVE_SYSDC
                }
+#endif
 
                tqs->stqs_taskq[i] = tq;
        }
@@ -1118,15 +1211,300 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
        tqs->stqs_taskq = NULL;
 }
 
+#ifdef _KERNEL
+/*
+ * The READ and WRITE rows of zio_taskqs are configurable at module load time
+ * by setting zio_taskq_read or zio_taskq_write.
+ *
+ * Example (the defaults for READ and WRITE)
+ *   zio_taskq_read='fixed,1,8 null scale null'
+ *   zio_taskq_write='sync fixed,1,5 scale fixed,1,5'
+ *
+ * Each sets the entire row at a time.
+ *
+ * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number
+ * of threads per taskq.
+ *
+ * 'null' can only be set on the high-priority queues (queue selection for
+ * high-priority queues will fall back to the regular queue if the
+ * high-priority queue is NULL).
+ */
+static const char *const modes[ZTI_NMODES] = {
+       "fixed", "scale", "sync", "null"
+};
+
+/* Parse the incoming config string. Modifies cfg */
+static int
+spa_taskq_param_set(zio_type_t t, char *cfg)
+{
+       int err = 0;
+
+       zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}};
+
+       char *next = cfg, *tok, *c;
+
+       /*
+        * Parse out each element from the string and fill `row`. The entire
+        * row has to be set at once, so any errors are flagged by just
+        * breaking out of this loop early.
+        */
+       uint_t q;
+       for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
+               /* `next` is the start of the config */
+               if (next == NULL)
+                       break;
+
+               /* Eat up leading space */
+               while (isspace(*next))
+                       next++;
+               if (*next == '\0')
+                       break;
+
+               /* Mode ends at space or end of string */
+               tok = next;
+               next = strchr(tok, ' ');
+               if (next != NULL) *next++ = '\0';
+
+               /* Parameters start after a comma */
+               c = strchr(tok, ',');
+               if (c != NULL) *c++ = '\0';
+
+               /* Match mode string */
+               uint_t mode;
+               for (mode = 0; mode < ZTI_NMODES; mode++)
+                       if (strcmp(tok, modes[mode]) == 0)
+                               break;
+               if (mode == ZTI_NMODES)
+                       break;
+
+               /* Invalid canary */
+               row[q].zti_mode = ZTI_NMODES;
+
+               /* Per-mode setup */
+               switch (mode) {
+
+               /*
+                * FIXED is parameterised: number of queues, and number of
+                * threads per queue.
+                */
+               case ZTI_MODE_FIXED: {
+                       /* No parameters? */
+                       if (c == NULL || *c == '\0')
+                               break;
+
+                       /* Find next parameter */
+                       tok = c;
+                       c = strchr(tok, ',');
+                       if (c == NULL)
+                               break;
+
+                       /* Take digits and convert */
+                       unsigned long long nq;
+                       if (!(isdigit(*tok)))
+                               break;
+                       err = ddi_strtoull(tok, &tok, 10, &nq);
+                       /* Must succeed and also end at the next param sep */
+                       if (err != 0 || tok != c)
+                               break;
+
+                       /* Move past the comma */
+                       tok++;
+                       /* Need another number */
+                       if (!(isdigit(*tok)))
+                               break;
+                       /* Remember start to make sure we moved */
+                       c = tok;
+
+                       /* Take digits */
+                       unsigned long long ntpq;
+                       err = ddi_strtoull(tok, &tok, 10, &ntpq);
+                       /* Must succeed, and moved forward */
+                       if (err != 0 || tok == c || *tok != '\0')
+                               break;
+
+                       /*
+                        * sanity; zero queues/threads make no sense, and
+                        * 16K is almost certainly more than anyone will ever
+                        * need and avoids silly numbers like UINT32_MAX
+                        */
+                       if (nq == 0 || nq >= 16384 ||
+                           ntpq == 0 || ntpq >= 16384)
+                               break;
+
+                       const zio_taskq_info_t zti = ZTI_P(ntpq, nq);
+                       row[q] = zti;
+                       break;
+               }
+
+               case ZTI_MODE_SCALE: {
+                       const zio_taskq_info_t zti = ZTI_SCALE;
+                       row[q] = zti;
+                       break;
+               }
+
+               case ZTI_MODE_SYNC: {
+                       const zio_taskq_info_t zti = ZTI_SYNC;
+                       row[q] = zti;
+                       break;
+               }
+
+               case ZTI_MODE_NULL: {
+                       /*
+                        * Can only null the high-priority queues; the general-
+                        * purpose ones have to exist.
+                        */
+                       if (q != ZIO_TASKQ_ISSUE_HIGH &&
+                           q != ZIO_TASKQ_INTERRUPT_HIGH)
+                               break;
+
+                       const zio_taskq_info_t zti = ZTI_NULL;
+                       row[q] = zti;
+                       break;
+               }
+
+               default:
+                       break;
+               }
+
+               /* Ensure we set a mode */
+               if (row[q].zti_mode == ZTI_NMODES)
+                       break;
+       }
+
+       /* Didn't get a full row, fail */
+       if (q < ZIO_TASKQ_TYPES)
+               return (SET_ERROR(EINVAL));
+
+       /* Eat trailing space */
+       if (next != NULL)
+               while (isspace(*next))
+                       next++;
+
+       /* If there's anything left over then fail */
+       if (next != NULL && *next != '\0')
+               return (SET_ERROR(EINVAL));
+
+       /* Success! Copy it into the real config */
+       for (q = 0; q < ZIO_TASKQ_TYPES; q++)
+               zio_taskqs[t][q] = row[q];
+
+       return (0);
+}
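As a minimal sketch of driving the parser directly (mirroring the Linux module-parameter setters further down; the config string is just the WRITE default from the comment above):

    const char *val = "sync fixed,1,5 scale fixed,1,5"; /* example row config */
    char *cfg = kmem_strdup(val);        /* the parser modifies its input, so copy it */
    int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg); /* 0 on success, EINVAL on a bad row */
    kmem_free(cfg, strlen(val) + 1);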
+
+static int
+spa_taskq_param_get(zio_type_t t, char *buf)
+{
+       int pos = 0;
+
+       /* Build parameter string from live config */
+       const char *sep = "";
+       for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) {
+               const zio_taskq_info_t *zti = &zio_taskqs[t][q];
+               if (zti->zti_mode == ZTI_MODE_FIXED)
+                       pos += sprintf(&buf[pos], "%s%s,%u,%u", sep,
+                           modes[zti->zti_mode], zti->zti_count,
+                           zti->zti_value);
+               else
+                       pos += sprintf(&buf[pos], "%s%s", sep,
+                           modes[zti->zti_mode]);
+               sep = " ";
+       }
+
+       buf[pos++] = '\n';
+       buf[pos] = '\0';
+
+       return (pos);
+}
+
+#ifdef __linux__
+static int
+spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp)
+{
+       char *cfg = kmem_strdup(val);
+       int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg);
+       kmem_free(cfg, strlen(val)+1);
+       return (-err);
+}
+static int
+spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp)
+{
+       return (spa_taskq_param_get(ZIO_TYPE_READ, buf));
+}
+
+static int
+spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp)
+{
+       char *cfg = kmem_strdup(val);
+       int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg);
+       kmem_free(cfg, strlen(val)+1);
+       return (-err);
+}
+static int
+spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp)
+{
+       return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf));
+}
+#else
+#include <sys/sbuf.h>
+
+/*
+ * On FreeBSD load-time parameters can be set up before malloc() is available,
+ * so we have to do all the parsing work on the stack.
+ */
+#define        SPA_TASKQ_PARAM_MAX     (128)
+
+static int
+spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS)
+{
+       char buf[SPA_TASKQ_PARAM_MAX];
+       int err = 0;
+
+       if (req->newptr == NULL) {
+               int len = spa_taskq_param_get(ZIO_TYPE_READ, buf);
+               struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req);
+               sbuf_cpy(s, buf);
+               err = sbuf_finish(s);
+               sbuf_delete(s);
+               return (err);
+       }
+
+       err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+       if (err)
+               return (err);
+       return (spa_taskq_param_set(ZIO_TYPE_READ, buf));
+}
+
+static int
+spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS)
+{
+       char buf[SPA_TASKQ_PARAM_MAX];
+       int err = 0;
+
+       if (req->newptr == NULL) {
+               int len = spa_taskq_param_get(ZIO_TYPE_WRITE, buf);
+               struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req);
+               sbuf_cpy(s, buf);
+               err = sbuf_finish(s);
+               sbuf_delete(s);
+               return (err);
+       }
+
+       err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+       if (err)
+               return (err);
+       return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf));
+}
+#endif
+#endif /* _KERNEL */
+
 /*
  * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
  * Note that a type may have multiple discrete taskqs to avoid lock contention
- * on the taskq itself. In that case we choose which taskq at random by using
- * the low bits of gethrtime().
+ * on the taskq itself.
  */
-void
-spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
-    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
+static taskq_t *
+spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+    zio_t *zio)
 {
        spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
        taskq_t *tq;
@@ -1134,12 +1512,27 @@ spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
        ASSERT3P(tqs->stqs_taskq, !=, NULL);
        ASSERT3U(tqs->stqs_count, !=, 0);
 
+       if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
+           (zio != NULL) && (zio->io_wr_iss_tq != NULL)) {
+               /* dispatch to assigned write issue taskq */
+               tq = zio->io_wr_iss_tq;
+               return (tq);
+       }
+
        if (tqs->stqs_count == 1) {
                tq = tqs->stqs_taskq[0];
        } else {
                tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
        }
+       return (tq);
+}
 
+void
+spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent,
+    zio_t *zio)
+{
+       taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, zio);
        taskq_dispatch_ent(tq, func, arg, flags, ent);
 }
 
@@ -1150,20 +1543,8 @@ void
 spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
     task_func_t *func, void *arg, uint_t flags)
 {
-       spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
-       taskq_t *tq;
-       taskqid_t id;
-
-       ASSERT3P(tqs->stqs_taskq, !=, NULL);
-       ASSERT3U(tqs->stqs_count, !=, 0);
-
-       if (tqs->stqs_count == 1) {
-               tq = tqs->stqs_taskq[0];
-       } else {
-               tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
-       }
-
-       id = taskq_dispatch(tq, func, arg, flags);
+       taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, NULL);
+       taskqid_t id = taskq_dispatch(tq, func, arg, flags);
        if (id)
                taskq_wait_id(tq, id);
 }
@@ -1178,11 +1559,6 @@ spa_create_zio_taskqs(spa_t *spa)
        }
 }
 
-/*
- * Disabled until spa_thread() can be adapted for Linux.
- */
-#undef HAVE_SPA_THREAD
-
 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
 static void
 spa_thread(void *arg)
@@ -1223,9 +1599,11 @@ spa_thread(void *arg)
                pool_unlock();
        }
 
+#ifdef HAVE_SYSDC
        if (zio_taskq_sysdc) {
                sysdc_thread_enter(curthread, 100, 0);
        }
+#endif
 
        spa->spa_proc = curproc;
        spa->spa_did = curthread->t_did;
@@ -1254,24 +1632,26 @@ spa_thread(void *arg)
 }
 #endif
 
+extern metaslab_ops_t *metaslab_allocator(spa_t *spa);
+
 /*
  * Activate an uninitialized pool.
  */
 static void
 spa_activate(spa_t *spa, spa_mode_t mode)
 {
+       metaslab_ops_t *msp = metaslab_allocator(spa);
        ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 
        spa->spa_state = POOL_STATE_ACTIVE;
        spa->spa_mode = mode;
        spa->spa_read_spacemaps = spa_mode_readable_spacemaps;
 
-       spa->spa_normal_class = metaslab_class_create(spa, &zfs_metaslab_ops);
-       spa->spa_log_class = metaslab_class_create(spa, &zfs_metaslab_ops);
-       spa->spa_embedded_log_class =
-           metaslab_class_create(spa, &zfs_metaslab_ops);
-       spa->spa_special_class = metaslab_class_create(spa, &zfs_metaslab_ops);
-       spa->spa_dedup_class = metaslab_class_create(spa, &zfs_metaslab_ops);
+       spa->spa_normal_class = metaslab_class_create(spa, msp);
+       spa->spa_log_class = metaslab_class_create(spa, msp);
+       spa->spa_embedded_log_class = metaslab_class_create(spa, msp);
+       spa->spa_special_class = metaslab_class_create(spa, msp);
+       spa->spa_dedup_class = metaslab_class_create(spa, msp);
 
        /* Try to create a covering process */
        mutex_enter(&spa->spa_proc_lock);
@@ -1279,7 +1659,6 @@ spa_activate(spa_t *spa, spa_mode_t mode)
        ASSERT(spa->spa_proc == &p0);
        spa->spa_did = 0;
 
-       (void) spa_create_process;
 #ifdef HAVE_SPA_THREAD
        /* Only create a process if we're going to be around a while. */
        if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
@@ -1356,6 +1735,13 @@ spa_activate(spa_t *spa, spa_mode_t mode)
        spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
            1, INT_MAX, 0);
 
+       /*
+        * The taskq to preload metaslabs.
+        */
+       spa->spa_metaslab_taskq = taskq_create("z_metaslab",
+           metaslab_preload_pct, maxclsyspri, 1, INT_MAX,
+           TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
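For example (assumed values): with metaslab_preload_pct = 50 and TASKQ_THREADS_CPU_PCT, the taskq size is interpreted as a percentage of online CPUs, so a 16-CPU system would allow up to 8 concurrent metaslab preload threads.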
+
        /*
         * Taskq dedicated to prefetcher threads: this is used to prevent the
         * pool traverse code from monopolizing the global (and limited)
@@ -1391,6 +1777,11 @@ spa_deactivate(spa_t *spa)
                spa->spa_zvol_taskq = NULL;
        }
 
+       if (spa->spa_metaslab_taskq) {
+               taskq_destroy(spa->spa_metaslab_taskq);
+               spa->spa_metaslab_taskq = NULL;
+       }
+
        if (spa->spa_prefetch_taskq) {
                taskq_destroy(spa->spa_prefetch_taskq);
                spa->spa_prefetch_taskq = NULL;
@@ -1568,16 +1959,16 @@ spa_unload_log_sm_metadata(spa_t *spa)
 {
        void *cookie = NULL;
        spa_log_sm_t *sls;
+       log_summary_entry_t *e;
+
        while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
            &cookie)) != NULL) {
                VERIFY0(sls->sls_mscount);
                kmem_free(sls, sizeof (spa_log_sm_t));
        }
 
-       for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
-           e != NULL; e = list_head(&spa->spa_log_summary)) {
+       while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) {
                VERIFY0(e->lse_mscount);
-               list_remove(&spa->spa_log_summary, e);
                kmem_free(e, sizeof (log_summary_entry_t));
        }
 
@@ -1605,6 +1996,10 @@ spa_destroy_aux_threads(spa_t *spa)
                zthr_destroy(spa->spa_livelist_condense_zthr);
                spa->spa_livelist_condense_zthr = NULL;
        }
+       if (spa->spa_raidz_expand_zthr != NULL) {
+               zthr_destroy(spa->spa_raidz_expand_zthr);
+               spa->spa_raidz_expand_zthr = NULL;
+       }
 }
 
 /*
@@ -1663,13 +2058,7 @@ spa_unload(spa_t *spa)
         * This ensures that there is no async metaslab prefetching
         * while we attempt to unload the spa.
         */
-       if (spa->spa_root_vdev != NULL) {
-               for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
-                       vdev_t *vc = spa->spa_root_vdev->vdev_child[c];
-                       if (vc->vdev_mg != NULL)
-                               taskq_wait(vc->vdev_mg->mg_taskq);
-               }
-       }
+       taskq_wait(spa->spa_metaslab_taskq);
 
        if (spa->spa_mmp.mmp_thread)
                mmp_thread_stop(spa);
@@ -1763,6 +2152,8 @@ spa_unload(spa_t *spa)
                spa->spa_compatibility = NULL;
        }
 
+       spa->spa_raidz_expand = NULL;
+
        spa_config_exit(spa, SCL_ALL, spa);
 }
 
@@ -2347,7 +2738,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
         * When damaged consider it to be a metadata error since we cannot
         * trust the BP_GET_TYPE and BP_GET_LEVEL values.
         */
-       if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) {
+       if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
                atomic_inc_64(&sle->sle_meta_count);
                return (0);
        }
@@ -2901,6 +3292,7 @@ spa_spawn_aux_threads(spa_t *spa)
 
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
+       spa_start_raidz_expansion_thread(spa);
        spa_start_indirect_condensing_thread(spa);
        spa_start_livelist_destroy_thread(spa);
        spa_start_livelist_condensing_thread(spa);
@@ -3003,6 +3395,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
        spa->spa_load_state = state;
        (void) spa_import_progress_set_state(spa_guid(spa),
            spa_load_state(spa));
+       spa_import_progress_set_notes(spa, "spa_load()");
 
        gethrestime(&spa->spa_loaded_ts);
        error = spa_load_impl(spa, type, &ereport);
@@ -3044,6 +3437,12 @@ vdev_count_verify_zaps(vdev_t *vd)
        spa_t *spa = vd->vdev_spa;
        uint64_t total = 0;
 
+       if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) &&
+           vd->vdev_root_zap != 0) {
+               total++;
+               ASSERT0(zap_lookup_int(spa->spa_meta_objset,
+                   spa->spa_all_vdev_zaps, vd->vdev_root_zap));
+       }
        if (vd->vdev_top_zap != 0) {
                total++;
                ASSERT0(zap_lookup_int(spa->spa_meta_objset,
@@ -3225,7 +3624,7 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
        uint64_t mmp_config = ub->ub_mmp_config;
        uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
        uint64_t import_delay;
-       hrtime_t import_expire;
+       hrtime_t import_expire, now;
        nvlist_t *mmp_label = NULL;
        vdev_t *rvd = spa->spa_root_vdev;
        kcondvar_t cv;
@@ -3263,7 +3662,17 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
 
        import_expire = gethrtime() + import_delay;
 
-       while (gethrtime() < import_expire) {
+       spa_import_progress_set_notes(spa, "Checking MMP activity, waiting "
+           "%llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
+
+       int iterations = 0;
+       while ((now = gethrtime()) < import_expire) {
+               if (iterations++ % 30 == 0) {
+                       spa_import_progress_set_notes(spa, "Checking MMP "
+                           "activity, %llu ms remaining",
+                           (u_longlong_t)NSEC2MSEC(import_expire - now));
+               }
+
                (void) spa_import_progress_set_mmp_check(spa_guid(spa),
                    NSEC2SEC(import_expire - gethrtime()));
 
@@ -3649,6 +4058,12 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
        }
        spa_load_note(spa, "using uberblock with txg=%llu",
            (u_longlong_t)ub->ub_txg);
+       if (ub->ub_raidz_reflow_info != 0) {
+               spa_load_note(spa, "uberblock raidz_reflow_info: "
+                   "state=%u offset=%llu",
+                   (int)RRSS_GET_STATE(ub),
+                   (u_longlong_t)RRSS_GET_OFFSET(ub));
+       }
 
 
        /*
@@ -3873,6 +4288,24 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
        rvd = mrvd;
        spa_config_exit(spa, SCL_ALL, FTAG);
 
+       /*
+        * If 'zpool import' used a cached config, then the on-disk hostid and
+        * hostname may be different to the cached config in ways that should
+        * prevent import.  Userspace can't discover this without a scan, but
+        * we know, so we add these values to LOAD_INFO so the caller can know
+        * the difference.
+        *
+        * Note that we have to do this before the config is regenerated,
+        * because the new config will have the hostid and hostname for this
+        * host, in readiness for import.
+        */
+       if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID))
+               fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID,
+                   fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID));
+       if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME))
+               fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME,
+                   fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME));
+
        /*
         * We will use spa_config if we decide to reload the spa or if spa_load
         * fails and we rewind. We must thus regenerate the config using the
@@ -4859,6 +5292,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
        /*
         * Retrieve the checkpoint txg if the pool has a checkpoint.
         */
+       spa_import_progress_set_notes(spa, "Loading checkpoint txg");
        error = spa_ld_read_checkpoint_txg(spa);
        if (error != 0)
                return (error);
@@ -4871,6 +5305,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
         * initiated. Otherwise we could be reading from indirect vdevs before
         * we have loaded their mappings.
         */
+       spa_import_progress_set_notes(spa, "Loading indirect vdev metadata");
        error = spa_ld_open_indirect_vdev_metadata(spa);
        if (error != 0)
                return (error);
@@ -4879,6 +5314,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
         * Retrieve the full list of active features from the MOS and check if
         * they are all supported.
         */
+       spa_import_progress_set_notes(spa, "Checking feature flags");
        error = spa_ld_check_features(spa, &missing_feat_write);
        if (error != 0)
                return (error);
@@ -4887,6 +5323,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
         * Load several special directories from the MOS needed by the dsl_pool
         * layer.
         */
+       spa_import_progress_set_notes(spa, "Loading special MOS directories");
        error = spa_ld_load_special_directories(spa);
        if (error != 0)
                return (error);
@@ -4894,6 +5331,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
        /*
         * Retrieve pool properties from the MOS.
         */
+       spa_import_progress_set_notes(spa, "Loading properties");
        error = spa_ld_get_props(spa);
        if (error != 0)
                return (error);
@@ -4902,6 +5340,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
         * Retrieve the list of auxiliary devices - cache devices and spares -
         * and open them.
         */
+       spa_import_progress_set_notes(spa, "Loading AUX vdevs");
        error = spa_ld_open_aux_vdevs(spa, type);
        if (error != 0)
                return (error);
@@ -4910,14 +5349,17 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
         * Load the metadata for all vdevs. Also check if unopenable devices
         * should be autoreplaced.
         */
+       spa_import_progress_set_notes(spa, "Loading vdev metadata");
        error = spa_ld_load_vdev_metadata(spa);
        if (error != 0)
                return (error);
 
+       spa_import_progress_set_notes(spa, "Loading dedup tables");
        error = spa_ld_load_dedup_tables(spa);
        if (error != 0)
                return (error);
 
+       spa_import_progress_set_notes(spa, "Loading BRT");
        error = spa_ld_load_brt(spa);
        if (error != 0)
                return (error);
@@ -4926,6 +5368,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
         * Verify the logs now to make sure we don't have any unexpected errors
         * when we claim log blocks later.
         */
+       spa_import_progress_set_notes(spa, "Verifying Log Devices");
        error = spa_ld_verify_logs(spa, type, ereport);
        if (error != 0)
                return (error);
@@ -4947,6 +5390,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
         * state. When performing an extreme rewind, we verify the whole pool,
         * which can take a very long time.
         */
+       spa_import_progress_set_notes(spa, "Verifying pool data");
        error = spa_ld_verify_pool_data(spa);
        if (error != 0)
                return (error);
@@ -4956,6 +5400,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
         * we write anything to the pool because we'd need to update the space
         * accounting using the deflated sizes.
         */
+       spa_import_progress_set_notes(spa, "Calculating deflated space");
        spa_update_dspace(spa);
 
        /*
@@ -4963,12 +5408,20 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
         * pool. If we are importing the pool in read-write mode, a few
         * additional steps must be performed to finish the import.
         */
+       spa_import_progress_set_notes(spa, "Starting import");
        if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
            spa->spa_load_max_txg == UINT64_MAX)) {
                uint64_t config_cache_txg = spa->spa_config_txg;
 
                ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
 
+               /*
+                * Before we do any zio_write's, complete the raidz expansion
+                * scratch space copying, if necessary.
+                */
+               if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID)
+                       vdev_raidz_reflow_copy_scratch(spa);
+
                /*
                 * In case of a checkpoint rewind, log the original txg
                 * of the checkpointed uberblock.
@@ -4979,6 +5432,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
                            (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
                }
 
+               spa_import_progress_set_notes(spa, "Claiming ZIL blocks");
                /*
                 * Traverse the ZIL and claim all blocks.
                 */
@@ -4998,6 +5452,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
                 * will have been set for us by ZIL traversal operations
                 * performed above.
                 */
+               spa_import_progress_set_notes(spa, "Syncing ZIL claims");
                txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
 
                /*
@@ -5005,6 +5460,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
                 * next sync, we would update the config stored in vdev labels
                 * and the cachefile (by default /etc/zfs/zpool.cache).
                 */
+               spa_import_progress_set_notes(spa, "Updating configs");
                spa_ld_check_for_config_update(spa, config_cache_txg,
                    update_config_cache);
 
@@ -5013,6 +5469,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
                 * Then check all DTLs to see if anything needs resilvering.
                 * The resilver will be deferred if a rebuild was started.
                 */
+               spa_import_progress_set_notes(spa, "Starting resilvers");
                if (vdev_rebuild_active(spa->spa_root_vdev)) {
                        vdev_rebuild_restart(spa);
                } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
@@ -5026,6 +5483,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
                 */
                spa_history_log_version(spa, "open", NULL);
 
+               spa_import_progress_set_notes(spa,
+                   "Restarting device removals");
                spa_restart_removal(spa);
                spa_spawn_aux_threads(spa);
 
@@ -5038,19 +5497,26 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
                 * auxiliary threads above (from which the livelist
                 * deletion zthr is part of).
                 */
+               spa_import_progress_set_notes(spa,
+                   "Cleaning up inconsistent objsets");
                (void) dmu_objset_find(spa_name(spa),
                    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
 
                /*
                 * Clean up any stale temporary dataset userrefs.
                 */
+               spa_import_progress_set_notes(spa,
+                   "Cleaning up temporary userrefs");
                dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
 
                spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+               spa_import_progress_set_notes(spa, "Restarting initialize");
                vdev_initialize_restart(spa->spa_root_vdev);
+               spa_import_progress_set_notes(spa, "Restarting TRIM");
                vdev_trim_restart(spa->spa_root_vdev);
                vdev_autotrim_restart(spa);
                spa_config_exit(spa, SCL_CONFIG, FTAG);
+               spa_import_progress_set_notes(spa, "Finished importing");
        }
 
        spa_import_progress_remove(spa_guid(spa));
@@ -5393,13 +5859,15 @@ spa_add_spares(spa_t *spa, nvlist_t *config)
                for (i = 0; i < nspares; i++) {
                        guid = fnvlist_lookup_uint64(spares[i],
                            ZPOOL_CONFIG_GUID);
+                       VERIFY0(nvlist_lookup_uint64_array(spares[i],
+                           ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc));
                        if (spa_spare_exists(guid, &pool, NULL) &&
                            pool != 0ULL) {
-                               VERIFY0(nvlist_lookup_uint64_array(spares[i],
-                                   ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs,
-                                   &vsc));
                                vs->vs_state = VDEV_STATE_CANT_OPEN;
                                vs->vs_aux = VDEV_AUX_SPARED;
+                       } else {
+                               vs->vs_state =
+                                   spa->spa_spares.sav_vdevs[i]->vdev_state;
                        }
                }
        }
@@ -6330,6 +6798,16 @@ spa_tryimport(nvlist_t *tryconfig)
                spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
        }
 
+       /*
+        * spa_import() relies on a pool config fetched by spa_tryimport()
+        * for spare/cache devices. Import flags are not passed to
+        * spa_tryimport(), which makes it return early due to a missing log
+        * device, so it never retrieves the cache devices and spares.
+        * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch
+        * the correct configuration regardless of the missing log device.
+        */
+       spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG;
+
        error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
 
        /*
@@ -6771,9 +7249,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 }
 
 /*
- * Attach a device to a mirror.  The arguments are the path to any device
- * in the mirror, and the nvroot for the new device.  If the path specifies
- * a device that is not mirrored, we automatically insert the mirror vdev.
+ * Attach a device to a vdev specified by its guid.  The vdev type can be
+ * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a
+ * single device). When the vdev is a single device, a mirror vdev will be
+ * automatically inserted.
  *
  * If 'replacing' is specified, the new device is intended to replace the
  * existing device; in this case the two devices are made into their own
@@ -6796,7 +7275,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
        vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
        vdev_ops_t *pvops;
        char *oldvdpath, *newvdpath;
-       int newvd_isspare;
+       int newvd_isspare = B_FALSE;
        int error;
 
        ASSERT(spa_writeable(spa));
@@ -6816,25 +7295,46 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
                if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
                        return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
-               if (dsl_scan_resilvering(spa_get_dsl(spa)))
+               if (dsl_scan_resilvering(spa_get_dsl(spa)) ||
+                   dsl_scan_resilver_scheduled(spa_get_dsl(spa))) {
                        return (spa_vdev_exit(spa, NULL, txg,
                            ZFS_ERR_RESILVER_IN_PROGRESS));
+               }
        } else {
                if (vdev_rebuild_active(rvd))
                        return (spa_vdev_exit(spa, NULL, txg,
                            ZFS_ERR_REBUILD_IN_PROGRESS));
        }
 
-       if (spa->spa_vdev_removal != NULL)
-               return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+       if (spa->spa_vdev_removal != NULL) {
+               return (spa_vdev_exit(spa, NULL, txg,
+                   ZFS_ERR_DEVRM_IN_PROGRESS));
+       }
 
        if (oldvd == NULL)
                return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
-       if (!oldvd->vdev_ops->vdev_op_leaf)
+       boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops;
+
+       if (raidz) {
+               if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION))
+                       return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+               /*
+                * Can't expand a raidz while prior expand is in progress.
+                */
+               if (spa->spa_raidz_expand != NULL) {
+                       return (spa_vdev_exit(spa, NULL, txg,
+                           ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS));
+               }
+       } else if (!oldvd->vdev_ops->vdev_op_leaf) {
                return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+       }
 
-       pvd = oldvd->vdev_parent;
+       if (raidz)
+               pvd = oldvd;
+       else
+               pvd = oldvd->vdev_parent;
 
        if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
            VDEV_ALLOC_ATTACH) != 0)
@@ -6886,11 +7386,13 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
 
        if (!replacing) {
                /*
-                * For attach, the only allowable parent is a mirror or the root
-                * vdev.
+                * For attach, the only allowable parent is a mirror or
+                * the root vdev. A raidz vdev can be attached to, but
+                * you cannot attach to a raidz child.
                 */
                if (pvd->vdev_ops != &vdev_mirror_ops &&
-                   pvd->vdev_ops != &vdev_root_ops)
+                   pvd->vdev_ops != &vdev_root_ops &&
+                   !raidz)
                        return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
                pvops = &vdev_mirror_ops;
@@ -6929,7 +7431,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
        /*
         * Make sure the new device is big enough.
         */
-       if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
+       vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd;
+       if (newvd->vdev_asize < vdev_get_min_asize(min_vdev))
                return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
 
        /*
@@ -6939,32 +7442,75 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
        if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
                return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
+       /*
+        * RAIDZ-expansion-specific checks.
+        */
+       if (raidz) {
+               if (vdev_raidz_attach_check(newvd) != 0)
+                       return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+               /*
+                * Fail early if a child is not healthy or being replaced
+                */
+               for (int i = 0; i < oldvd->vdev_children; i++) {
+                       if (vdev_is_dead(oldvd->vdev_child[i]) ||
+                           !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) {
+                               return (spa_vdev_exit(spa, newrootvd, txg,
+                                   ENXIO));
+                       }
+                       /* Also fail if reserved boot area is in-use */
+                       if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i])
+                           != 0) {
+                               return (spa_vdev_exit(spa, newrootvd, txg,
+                                   EADDRINUSE));
+                       }
+               }
+       }
+
+       if (raidz) {
+               /*
+                * Note: oldvdpath is freed by spa_strfree(), but
+                * kmem_asprintf() is freed by kmem_strfree(), so we have to
+                * move it to a spa_strdup-ed string.
+                */
+               char *tmp = kmem_asprintf("raidz%u-%u",
+                   (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id);
+               oldvdpath = spa_strdup(tmp);
+               kmem_strfree(tmp);
+       } else {
+               oldvdpath = spa_strdup(oldvd->vdev_path);
+       }
+       newvdpath = spa_strdup(newvd->vdev_path);
+
        /*
         * If this is an in-place replacement, update oldvd's path and devid
         * to make it distinguishable from newvd, and unopenable from now on.
         */
-       if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
+       if (strcmp(oldvdpath, newvdpath) == 0) {
                spa_strfree(oldvd->vdev_path);
-               oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
+               oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5,
                    KM_SLEEP);
-               (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5,
-                   "%s/%s", newvd->vdev_path, "old");
+               (void) sprintf(oldvd->vdev_path, "%s/old",
+                   newvdpath);
                if (oldvd->vdev_devid != NULL) {
                        spa_strfree(oldvd->vdev_devid);
                        oldvd->vdev_devid = NULL;
                }
+               spa_strfree(oldvdpath);
+               oldvdpath = spa_strdup(oldvd->vdev_path);
        }
 
        /*
         * If the parent is not a mirror, or if we're replacing, insert the new
         * mirror/replacing/spare vdev above oldvd.
         */
-       if (pvd->vdev_ops != pvops)
+       if (!raidz && pvd->vdev_ops != pvops) {
                pvd = vdev_add_parent(oldvd, pvops);
+               ASSERT(pvd->vdev_ops == pvops);
+               ASSERT(oldvd->vdev_parent == pvd);
+       }
 
        ASSERT(pvd->vdev_top->vdev_parent == rvd);
-       ASSERT(pvd->vdev_ops == pvops);
-       ASSERT(oldvd->vdev_parent == pvd);
 
        /*
         * Extract the new device from its root and add it to pvd.
@@ -6992,41 +7538,66 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
         */
        dtl_max_txg = txg + TXG_CONCURRENT_STATES;
 
-       vdev_dtl_dirty(newvd, DTL_MISSING,
-           TXG_INITIAL, dtl_max_txg - TXG_INITIAL);
+       if (raidz) {
+               /*
+                * Wait for the youngest allocations and frees to sync,
+                * and then wait for the deferral of those frees to finish.
+                */
+               spa_vdev_config_exit(spa, NULL,
+                   txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
 
-       if (newvd->vdev_isspare) {
-               spa_spare_activate(newvd);
-               spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
-       }
+               vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE);
+               vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE);
+               vdev_autotrim_stop_wait(tvd);
 
-       oldvdpath = spa_strdup(oldvd->vdev_path);
-       newvdpath = spa_strdup(newvd->vdev_path);
-       newvd_isspare = newvd->vdev_isspare;
+               dtl_max_txg = spa_vdev_config_enter(spa);
 
-       /*
-        * Mark newvd's DTL dirty in this txg.
-        */
-       vdev_dirty(tvd, VDD_DTL, newvd, txg);
+               tvd->vdev_rz_expanding = B_TRUE;
 
-       /*
-        * Schedule the resilver or rebuild to restart in the future. We do
-        * this to ensure that dmu_sync-ed blocks have been stitched into the
-        * respective datasets.
-        */
-       if (rebuild) {
-               newvd->vdev_rebuild_txg = txg;
+               vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg);
+               vdev_config_dirty(tvd);
 
-               vdev_rebuild(tvd);
+               dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
+                   dtl_max_txg);
+               dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync,
+                   newvd, tx);
+               dmu_tx_commit(tx);
        } else {
-               newvd->vdev_resilver_txg = txg;
+               vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
+                   dtl_max_txg - TXG_INITIAL);
+
+               if (newvd->vdev_isspare) {
+                       spa_spare_activate(newvd);
+                       spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
+               }
+
+               newvd_isspare = newvd->vdev_isspare;
+
+               /*
+                * Mark newvd's DTL dirty in this txg.
+                */
+               vdev_dirty(tvd, VDD_DTL, newvd, txg);
+
+               /*
+                * Schedule the resilver or rebuild to restart in the future.
+                * We do this to ensure that dmu_sync-ed blocks have been
+                * stitched into the respective datasets.
+                */
+               if (rebuild) {
+                       newvd->vdev_rebuild_txg = txg;
 
-               if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
-                   spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
-                       vdev_defer_resilver(newvd);
+                       vdev_rebuild(tvd);
                } else {
-                       dsl_scan_restart_resilver(spa->spa_dsl_pool,
-                           dtl_max_txg);
+                       newvd->vdev_resilver_txg = txg;
+
+                       if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
+                           spa_feature_is_enabled(spa,
+                           SPA_FEATURE_RESILVER_DEFER)) {
+                               vdev_defer_resilver(newvd);
+                       } else {
+                               dsl_scan_restart_resilver(spa->spa_dsl_pool,
+                                   dtl_max_txg);
+                       }
                }
        }
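
A worked example makes the DTL bookkeeping in the non-raidz branch concrete. The numbers below are hypothetical (an attach assigned txg 1000), with TXG_INITIAL == 4 and TXG_CONCURRENT_STATES == 3 as in the current tree:

	/* dtl_max_txg = 1000 + TXG_CONCURRENT_STATES = 1003 */
	vdev_dtl_dirty(newvd, DTL_MISSING, 4, 1003 - 4);
	/*
	 * DTL_MISSING now covers txgs [4, 1003), i.e. every txg the pool
	 * has ever synced, which is why the resilver or rebuild scheduled
	 * above has to copy the whole device.
	 */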
 
@@ -7056,7 +7627,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
  * Detach a device from a mirror or replacing vdev.
  *
  * If 'replace_done' is specified, only detach if the parent
- * is a replacing vdev.
+ * is a replacing or a spare vdev.
  */
 int
 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
@@ -7351,7 +7922,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
         */
        if (cmd_type == POOL_INITIALIZE_START &&
            (vd->vdev_initialize_thread != NULL ||
-           vd->vdev_top->vdev_removing)) {
+           vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) {
                mutex_exit(&vd->vdev_initialize_lock);
                return (SET_ERROR(EBUSY));
        } else if (cmd_type == POOL_INITIALIZE_CANCEL &&
@@ -7363,6 +7934,10 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
            vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
                mutex_exit(&vd->vdev_initialize_lock);
                return (SET_ERROR(ESRCH));
+       } else if (cmd_type == POOL_INITIALIZE_UNINIT &&
+           vd->vdev_initialize_thread != NULL) {
+               mutex_exit(&vd->vdev_initialize_lock);
+               return (SET_ERROR(EBUSY));
        }
 
        switch (cmd_type) {
@@ -7375,6 +7950,9 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
        case POOL_INITIALIZE_SUSPEND:
                vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);
                break;
+       case POOL_INITIALIZE_UNINIT:
+               vdev_uninitialize(vd);
+               break;
        default:
                panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
        }
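
Because the new UNINIT guard above returns EBUSY while vdev_initialize_thread is still alive, a caller must let any running initialize stop (cancel, suspend, or completion) before uninitializing. A minimal sketch of that ordering, using only names visible in this hunk and not the real ioctl path:

	mutex_enter(&vd->vdev_initialize_lock);
	if (vd->vdev_initialize_thread == NULL)
		vdev_uninitialize(vd);	/* as dispatched by POOL_INITIALIZE_UNINIT above */
	mutex_exit(&vd->vdev_initialize_lock);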
@@ -7466,7 +8044,8 @@ spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
         * which has completed but the thread is not exited.
         */
        if (cmd_type == POOL_TRIM_START &&
-           (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) {
+           (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing ||
+           vd->vdev_top->vdev_rz_expanding)) {
                mutex_exit(&vd->vdev_trim_lock);
                return (SET_ERROR(EBUSY));
        } else if (cmd_type == POOL_TRIM_CANCEL &&
@@ -8108,6 +8687,7 @@ spa_scan_stop(spa_t *spa)
        ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
        if (dsl_scan_resilvering(spa->spa_dsl_pool))
                return (SET_ERROR(EBUSY));
+
        return (dsl_scan_cancel(spa->spa_dsl_pool));
 }
 
@@ -8133,6 +8713,10 @@ spa_scan(spa_t *spa, pool_scan_func_t func)
                return (0);
        }
 
+       if (func == POOL_SCAN_ERRORSCRUB &&
+           !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG))
+               return (SET_ERROR(ENOTSUP));
+
        return (dsl_scan(spa->spa_dsl_pool, func));
 }
 
@@ -8279,7 +8863,8 @@ spa_async_thread(void *arg)
         * If any devices are done replacing, detach them.
         */
        if (tasks & SPA_ASYNC_RESILVER_DONE ||
-           tasks & SPA_ASYNC_REBUILD_DONE) {
+           tasks & SPA_ASYNC_REBUILD_DONE ||
+           tasks & SPA_ASYNC_DETACH_SPARE) {
                spa_vdev_resilver_done(spa);
        }
 
@@ -8363,6 +8948,10 @@ spa_async_suspend(spa_t *spa)
        if (condense_thread != NULL)
                zthr_cancel(condense_thread);
 
+       zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr;
+       if (raidz_expand_thread != NULL)
+               zthr_cancel(raidz_expand_thread);
+
        zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
        if (discard_thread != NULL)
                zthr_cancel(discard_thread);
@@ -8389,6 +8978,10 @@ spa_async_resume(spa_t *spa)
        if (condense_thread != NULL)
                zthr_resume(condense_thread);
 
+       zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr;
+       if (raidz_expand_thread != NULL)
+               zthr_resume(raidz_expand_thread);
+
        zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
        if (discard_thread != NULL)
                zthr_resume(discard_thread);
@@ -8623,6 +9216,11 @@ spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
 {
        spa_t *spa = vd->vdev_spa;
 
+       if (vd->vdev_root_zap != 0 &&
+           spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) {
+               VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
+                   vd->vdev_root_zap, tx));
+       }
        if (vd->vdev_top_zap != 0) {
                VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
                    vd->vdev_top_zap, tx));
@@ -8786,24 +9384,11 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
                const char *strval, *fname;
                zpool_prop_t prop;
                const char *propname;
+               const char *elemname = nvpair_name(elem);
                zprop_type_t proptype;
                spa_feature_t fid;
 
-               switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
-               case ZPOOL_PROP_INVAL:
-                       /*
-                        * We checked this earlier in spa_prop_validate().
-                        */
-                       ASSERT(zpool_prop_feature(nvpair_name(elem)));
-
-                       fname = strchr(nvpair_name(elem), '@') + 1;
-                       VERIFY0(zfeature_lookup_name(fname, &fid));
-
-                       spa_feature_enable(spa, fid, tx);
-                       spa_history_log_internal(spa, "set", tx,
-                           "%s=enabled", nvpair_name(elem));
-                       break;
-
+               switch (prop = zpool_name_to_prop(elemname)) {
                case ZPOOL_PROP_VERSION:
                        intval = fnvpair_value_uint64(elem);
                        /*
@@ -8846,7 +9431,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
                                spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
                        }
                        spa_history_log_internal(spa, "set", tx,
-                           "%s=%s", nvpair_name(elem), strval);
+                           "%s=%s", elemname, strval);
                        break;
                case ZPOOL_PROP_COMPATIBILITY:
                        strval = fnvpair_value_string(elem);
@@ -8865,6 +9450,20 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
                            "%s=%s", nvpair_name(elem), strval);
                        break;
 
+               case ZPOOL_PROP_INVAL:
+                       if (zpool_prop_feature(elemname)) {
+                               fname = strchr(elemname, '@') + 1;
+                               VERIFY0(zfeature_lookup_name(fname, &fid));
+
+                               spa_feature_enable(spa, fid, tx);
+                               spa_history_log_internal(spa, "set", tx,
+                                   "%s=enabled", elemname);
+                               break;
+                       } else if (!zfs_prop_user(elemname)) {
+                               ASSERT(zpool_prop_feature(elemname));
+                               break;
+                       }
+                       zfs_fallthrough;
                default:
                        /*
                         * Set pool property values in the poolprops mos object.
@@ -8877,8 +9476,13 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
                        }
 
                        /* normalize the property name */
-                       propname = zpool_prop_to_name(prop);
-                       proptype = zpool_prop_get_type(prop);
+                       if (prop == ZPOOL_PROP_INVAL) {
+                               propname = elemname;
+                               proptype = PROP_TYPE_STRING;
+                       } else {
+                               propname = zpool_prop_to_name(prop);
+                               proptype = zpool_prop_get_type(prop);
+                       }
 
                        if (nvpair_type(elem) == DATA_TYPE_STRING) {
                                ASSERT(proptype == PROP_TYPE_STRING);
@@ -8887,7 +9491,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
                                    spa->spa_pool_props_object, propname,
                                    1, strlen(strval) + 1, strval, tx));
                                spa_history_log_internal(spa, "set", tx,
-                                   "%s=%s", nvpair_name(elem), strval);
+                                   "%s=%s", elemname, strval);
                        } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
                                intval = fnvpair_value_uint64(elem);
 
@@ -8900,7 +9504,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
                                    spa->spa_pool_props_object, propname,
                                    8, 1, &intval, tx));
                                spa_history_log_internal(spa, "set", tx,
-                                   "%s=%lld", nvpair_name(elem),
+                                   "%s=%lld", elemname,
                                    (longlong_t)intval);
 
                                switch (prop) {
@@ -9172,6 +9776,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
                brt_sync(spa, txg);
                ddt_sync(spa, txg);
                dsl_scan_sync(dp, tx);
+               dsl_errorscrub_sync(dp, tx);
                svr_sync(spa, tx);
                spa_sync_upgrades(spa, tx);
 
@@ -9182,6 +9787,27 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
                    != NULL)
                        vdev_sync(vd, txg);
 
+               if (pass == 1) {
+                       /*
+                        * dsl_pool_sync() -> dp_sync_tasks may have dirtied
+                        * the config. If that happens, this txg should not
+                        * be a no-op. So we must sync the config to the MOS
+                        * before checking for no-op.
+                        *
+                        * Note that when the config is dirty, it will
+                        * be written to the MOS (i.e. the MOS will be
+                        * dirtied) every time we call spa_sync_config_object()
+                        * in this txg.  Therefore we can't call this after
+                        * dsl_pool_sync() every pass, because it would
+                        * prevent us from converging, since we'd dirty
+                        * the MOS every pass.
+                        *
+                        * Sync tasks can only be processed in pass 1, so
+                        * there's no need to do this in later passes.
+                        */
+                       spa_sync_config_object(spa, tx);
+               }
+
                /*
                 * Note: We need to check if the MOS is dirty because we could
                 * have marked the MOS dirty without updating the uberblock
@@ -9447,6 +10073,9 @@ spa_sync(spa_t *spa, uint64_t txg)
 
        spa_update_dspace(spa);
 
+       if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON)
+               vdev_autotrim_kick(spa);
+
        /*
         * It had better be the case that we didn't dirty anything
         * since vdev_config_sync().
@@ -9500,6 +10129,104 @@ spa_sync_allpools(void)
        mutex_exit(&spa_namespace_lock);
 }
 
+taskq_t *
+spa_sync_tq_create(spa_t *spa, const char *name)
+{
+       kthread_t **kthreads;
+
+       ASSERT(spa->spa_sync_tq == NULL);
+       ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus);
+
+       /*
+        * - do not allow more allocators than cpus.
+        * - there may be more cpus than allocators.
+        * - do not allow more sync taskq threads than allocators or cpus.
+        */
+       int nthreads = spa->spa_alloc_count;
+       spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) *
+           nthreads, KM_SLEEP);
+
+       spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri,
+           nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads);
+       VERIFY(spa->spa_sync_tq != NULL);
+       VERIFY(kthreads != NULL);
+
+       spa_taskqs_t *tqs =
+           &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE];
+
+       spa_syncthread_info_t *ti = spa->spa_syncthreads;
+       for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) {
+               ti->sti_thread = kthreads[i];
+               if (w == tqs->stqs_count) {
+                       w = 0;
+               }
+               ti->sti_wr_iss_tq = tqs->stqs_taskq[w];
+       }
+
+       kmem_free(kthreads, sizeof (*kthreads) * nthreads);
+       return (spa->spa_sync_tq);
+}
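
The wrap-around in the mapping loop is easier to see with numbers. Assuming, hypothetically, spa_alloc_count == 6 sync threads and a ZIO_TYPE_WRITE/ZIO_TASKQ_ISSUE set with stqs_count == 2, the loop produces:

	/* sync thread i always issues its writes through taskq (i % stqs_count) */
	ASSERT3P(spa->spa_syncthreads[0].sti_wr_iss_tq, ==, tqs->stqs_taskq[0]);
	ASSERT3P(spa->spa_syncthreads[1].sti_wr_iss_tq, ==, tqs->stqs_taskq[1]);
	ASSERT3P(spa->spa_syncthreads[2].sti_wr_iss_tq, ==, tqs->stqs_taskq[0]);
	ASSERT3P(spa->spa_syncthreads[5].sti_wr_iss_tq, ==, tqs->stqs_taskq[1]);

spa_select_allocator() below relies on this mapping when it matches curthread against sti_thread to pick both the allocator index and the write issue taskq.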
+
+void
+spa_sync_tq_destroy(spa_t *spa)
+{
+       ASSERT(spa->spa_sync_tq != NULL);
+
+       taskq_wait(spa->spa_sync_tq);
+       taskq_destroy(spa->spa_sync_tq);
+       kmem_free(spa->spa_syncthreads,
+           sizeof (spa_syncthread_info_t) * spa->spa_alloc_count);
+       spa->spa_sync_tq = NULL;
+}
+
+void
+spa_select_allocator(zio_t *zio)
+{
+       zbookmark_phys_t *bm = &zio->io_bookmark;
+       spa_t *spa = zio->io_spa;
+
+       ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+       /*
+        * A gang block (for example) may have inherited its parent's
+        * allocator, in which case there is nothing further to do here.
+        */
+       if (ZIO_HAS_ALLOCATOR(zio))
+               return;
+
+       ASSERT(spa != NULL);
+       ASSERT(bm != NULL);
+
+       /*
+        * First try to use an allocator assigned to the syncthread, and set
+        * the corresponding write issue taskq for the allocator.
+        * Note that we must have an open pool to do this.
+        */
+       if (spa->spa_sync_tq != NULL) {
+               spa_syncthread_info_t *ti = spa->spa_syncthreads;
+               for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
+                       if (ti->sti_thread == curthread) {
+                               zio->io_allocator = i;
+                               zio->io_wr_iss_tq = ti->sti_wr_iss_tq;
+                               return;
+                       }
+               }
+       }
+
+       /*
+        * We want to try to use as many allocators as possible to help improve
+        * performance, but we also want logically adjacent IOs to be physically
+        * adjacent to improve sequential read performance. We chunk each object
+        * into 2^20-block regions, and then hash based on the objset, object,
+        * level, and region to accomplish both of these goals.
+        */
+       uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level,
+           bm->zb_blkid >> 20);
+
+       zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
+       zio->io_wr_iss_tq = NULL;
+}
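
A small illustration of the region hashing (the objset/object numbers are made up): block ids that fall in the same 2^20-block region hash to the same value and therefore to the same allocator, while the first block of the next region hashes independently.

	uint64_t objset = 54, object = 7, level = 0;
	uint64_t h1 = cityhash4(objset, object, level, 123456ULL >> 20);	/* region 0 */
	uint64_t h2 = cityhash4(objset, object, level, 999999ULL >> 20);	/* region 0 */
	uint64_t h3 = cityhash4(objset, object, level, 1048576ULL >> 20);	/* region 1 */
	ASSERT3U((uint_t)h1 % spa->spa_alloc_count, ==,
	    (uint_t)h2 % spa->spa_alloc_count);	/* same allocator */
	(void) h3;	/* may or may not select a different allocator */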
+
 /*
  * ==========================================================================
  * Miscellaneous routines
@@ -9838,7 +10565,8 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
                    DSS_SCANNING);
                break;
        case ZPOOL_WAIT_RESILVER:
-               if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev)))
+               *in_progress = vdev_rebuild_active(spa->spa_root_vdev);
+               if (*in_progress)
                        break;
                zfs_fallthrough;
        case ZPOOL_WAIT_SCRUB:
@@ -9853,6 +10581,12 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
                    is_scrub == (activity == ZPOOL_WAIT_SCRUB));
                break;
        }
+       case ZPOOL_WAIT_RAIDZ_EXPAND:
+       {
+               vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
+               *in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING);
+               break;
+       }
        default:
                panic("unrecognized value for activity %d", activity);
        }
@@ -10043,6 +10777,9 @@ EXPORT_SYMBOL(spa_prop_clear_bootfs);
 /* asynchronous event notification */
 EXPORT_SYMBOL(spa_event_notify);
 
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW,
+       "Percentage of CPUs to run a metaslab preload taskq");
+
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
        "log2 fraction of arc that can be used by inflight I/Os when "
@@ -10089,4 +10826,16 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
        ZMOD_RW,
        "Whether extra ALLOC blkptrs were added to a livelist entry while it "
        "was being condensed");
+
+#ifdef _KERNEL
+ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
+       spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD,
+       "Configure IO queues for read IO");
+ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
+       spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD,
+       "Configure IO queues for write IO");
+#endif
 /* END CSTYLED */
+
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW,
+       "Number of CPUs to run write issue taskqs");