]> git.proxmox.com Git - mirror_zfs.git/blobdiff - module/zfs/spa.c
Fix stack frame size: spa_livelist_delete_cb()
[mirror_zfs.git] / module / zfs / spa.c
index 71744139e7cd032fb6c4277cc477b327421a6669..8c662f6b066668b5ad57abbe05234472e9b3d121 100644 (file)
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2018, Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
@@ -29,7 +29,7 @@
  * Copyright 2016 Toomas Soome <tsoome@me.com>
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  * Copyright 2018 Joyent, Inc.
- * Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2017, Intel Corporation.
  */
@@ -57,6 +57,8 @@
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/vdev_indirect_births.h>
 #include <sys/vdev_initialize.h>
+#include <sys/vdev_rebuild.h>
+#include <sys/vdev_trim.h>
 #include <sys/vdev_disk.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
@@ -89,6 +91,7 @@
 #include <sys/fm/util.h>
 #include <sys/callb.h>
 #include <sys/zone.h>
+#include <sys/vmsystm.h>
 #endif /* _KERNEL */
 
 #include "zfs_prop.h"
@@ -132,7 +135,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
  * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
  * macros. Other operations process a large amount of data; the ZTI_BATCH
  * macro causes us to create a taskq oriented for throughput. Some operations
- * are so high frequency and short-lived that the taskq itself can become a a
+ * are so high frequency and short-lived that the taskq itself can become a
  * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
  * additional degree of parallelism specified by the number of threads per-
  * taskq and the number of taskqs; when dispatching an event in this case, the
@@ -150,6 +153,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
        { ZTI_P(12, 8), ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* FREE */
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* CLAIM */
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* IOCTL */
+       { ZTI_N(4),     ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* TRIM */
 };
 
 static void spa_sync_version(void *arg, dmu_tx_t *tx);
@@ -230,6 +234,27 @@ uint64_t   zfs_max_missing_tvds_scan = 0;
  */
 boolean_t      zfs_pause_spa_sync = B_FALSE;
 
+/*
+ * Variables that tell the livelist condense zthr function to wait at certain
+ * points for the livelist to be removed; used to test condense/destroy races.
+ */
+int zfs_livelist_condense_zthr_pause = 0;
+int zfs_livelist_condense_sync_pause = 0;
+
+/*
+ * Variables to track whether or not condense cancellation has been
+ * triggered in testing.
+ */
+int zfs_livelist_condense_sync_cancel = 0;
+int zfs_livelist_condense_zthr_cancel = 0;
+
+/*
+ * Variable to track whether or not extra ALLOC blkptrs were added to a
+ * livelist entry while it was being condensed (caused by the way we track
+ * remapped blkptrs in dbuf_remap_impl)
+ */
+int zfs_livelist_condense_new_alloc = 0;
+
 /*
  * ==========================================================================
  * SPA properties routines
@@ -295,7 +320,7 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
                spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
                    metaslab_class_expandable_space(mc), src);
                spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
-                   (spa_mode(spa) == FREAD), src);
+                   (spa_mode(spa) == SPA_MODE_READ), src);
 
                cap = (size == 0) ? 0 : (alloc * 100 / size);
                spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
@@ -389,12 +414,15 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
        objset_t *mos = spa->spa_meta_objset;
        zap_cursor_t zc;
        zap_attribute_t za;
+       dsl_pool_t *dp;
        int err;
 
        err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP);
        if (err)
                return (err);
 
+       dp = spa_get_dsl(spa);
+       dsl_pool_config_enter(dp, FTAG);
        mutex_enter(&spa->spa_props_lock);
 
        /*
@@ -403,10 +431,8 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
        spa_prop_get_config(spa, nvp);
 
        /* If no pool property object, no more prop to get. */
-       if (mos == NULL || spa->spa_pool_props_object == 0) {
-               mutex_exit(&spa->spa_props_lock);
+       if (mos == NULL || spa->spa_pool_props_object == 0)
                goto out;
-       }
 
        /*
         * Get properties from the MOS pool property object.
@@ -430,23 +456,17 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
                                src = ZPROP_SRC_LOCAL;
 
                        if (prop == ZPOOL_PROP_BOOTFS) {
-                               dsl_pool_t *dp;
                                dsl_dataset_t *ds = NULL;
 
-                               dp = spa_get_dsl(spa);
-                               dsl_pool_config_enter(dp, FTAG);
                                err = dsl_dataset_hold_obj(dp,
                                    za.za_first_integer, FTAG, &ds);
-                               if (err != 0) {
-                                       dsl_pool_config_exit(dp, FTAG);
+                               if (err != 0)
                                        break;
-                               }
 
                                strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
                                    KM_SLEEP);
                                dsl_dataset_name(ds, strval);
                                dsl_dataset_rele(ds, FTAG);
-                               dsl_pool_config_exit(dp, FTAG);
                        } else {
                                strval = NULL;
                                intval = za.za_first_integer;
@@ -477,8 +497,9 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
                }
        }
        zap_cursor_fini(&zc);
-       mutex_exit(&spa->spa_props_lock);
 out:
+       mutex_exit(&spa->spa_props_lock);
+       dsl_pool_config_exit(dp, FTAG);
        if (err && err != ENOENT) {
                nvlist_free(*nvp);
                *nvp = NULL;
@@ -554,6 +575,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
                case ZPOOL_PROP_AUTOREPLACE:
                case ZPOOL_PROP_LISTSNAPS:
                case ZPOOL_PROP_AUTOEXPAND:
+               case ZPOOL_PROP_AUTOTRIM:
                        error = nvpair_value_uint64(elem, &intval);
                        if (!error && intval > 1)
                                error = SET_ERROR(EINVAL);
@@ -564,8 +586,13 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
                        if (!error && intval > 1)
                                error = SET_ERROR(EINVAL);
 
-                       if (!error && !spa_get_hostid())
-                               error = SET_ERROR(ENOTSUP);
+                       if (!error) {
+                               uint32_t hostid = zone_get_hostid(NULL);
+                               if (hostid)
+                                       spa->spa_hostid = hostid;
+                               else
+                                       error = SET_ERROR(ENOTSUP);
+                       }
 
                        break;
 
@@ -594,7 +621,6 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
 
                        if (!error) {
                                objset_t *os;
-                               uint64_t propval;
 
                                if (strval == NULL || strval[0] == '\0') {
                                        objnum = zpool_prop_default_numeric(
@@ -606,27 +632,9 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
                                if (error != 0)
                                        break;
 
-                               /*
-                                * Must be ZPL, and its property settings
-                                * must be supported by GRUB (compression
-                                * is not gzip, and large blocks or large
-                                * dnodes are not used).
-                                */
-
+                               /* Must be ZPL. */
                                if (dmu_objset_type(os) != DMU_OST_ZFS) {
                                        error = SET_ERROR(ENOTSUP);
-                               } else if ((error =
-                                   dsl_prop_get_int_ds(dmu_objset_ds(os),
-                                   zfs_prop_to_name(ZFS_PROP_COMPRESSION),
-                                   &propval)) == 0 &&
-                                   !BOOTFS_COMPRESS_VALID(propval)) {
-                                       error = SET_ERROR(ENOTSUP);
-                               } else if ((error =
-                                   dsl_prop_get_int_ds(dmu_objset_ds(os),
-                                   zfs_prop_to_name(ZFS_PROP_DNODESIZE),
-                                   &propval)) == 0 &&
-                                   propval != ZFS_DNSIZE_LEGACY) {
-                                       error = SET_ERROR(ENOTSUP);
                                } else {
                                        objnum = dmu_objset_id(os);
                                }
@@ -691,16 +699,6 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
                                error = SET_ERROR(E2BIG);
                        break;
 
-               case ZPOOL_PROP_DEDUPDITTO:
-                       if (spa_version(spa) < SPA_VERSION_DEDUP)
-                               error = SET_ERROR(ENOTSUP);
-                       else
-                               error = nvpair_value_uint64(elem, &intval);
-                       if (error == 0 &&
-                           intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
-                               error = SET_ERROR(EINVAL);
-                       break;
-
                default:
                        break;
                }
@@ -709,6 +707,9 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
                        break;
        }
 
+       (void) nvlist_remove_all(props,
+           zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO));
+
        if (!error && reset_bootfs) {
                error = nvlist_remove(props,
                    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
@@ -824,7 +825,7 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 static int
 spa_change_guid_check(void *arg, dmu_tx_t *tx)
 {
-       ASSERTV(uint64_t *newguid = arg);
+       uint64_t *newguid __maybe_unused = arg;
        spa_t *spa = dmu_tx_pool(tx)->dp_spa;
        vdev_t *rvd = spa->spa_root_vdev;
        uint64_t vdev_state;
@@ -864,7 +865,7 @@ spa_change_guid_sync(void *arg, dmu_tx_t *tx)
        spa_config_exit(spa, SCL_STATE, FTAG);
 
        spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
-           oldguid, *newguid);
+           (u_longlong_t)oldguid, (u_longlong_t)*newguid);
 }
 
 /*
@@ -916,7 +917,7 @@ spa_error_entry_compare(const void *a, const void *b)
        ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
            sizeof (zbookmark_phys_t));
 
-       return (AVL_ISIGN(ret));
+       return (TREE_ISIGN(ret));
 }
 
 /*
@@ -999,13 +1000,25 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
                        /*
                         * The write issue taskq can be extremely CPU
                         * intensive.  Run it at slightly less important
-                        * priority than the other taskqs.  Under Linux this
-                        * means incrementing the priority value on platforms
-                        * like illumos it should be decremented.
+                        * priority than the other taskqs.
+                        *
+                        * Under Linux and FreeBSD this means incrementing
+                        * the priority value as opposed to platforms like
+                        * illumos where it should be decremented.
+                        *
+                        * On FreeBSD, if priorities divided by four (RQ_PPQ)
+                        * are equal then a difference between them is
+                        * insignificant.
                         */
-                       if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
+                       if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) {
+#if defined(__linux__)
                                pri++;
-
+#elif defined(__FreeBSD__)
+                               pri += 4;
+#else
+#error "unknown OS"
+#endif
+                       }
                        tq = taskq_create_proc(name, value, pri, 50,
                            INT_MAX, spa->spa_proc, flags);
                }
@@ -1173,7 +1186,7 @@ spa_thread(void *arg)
  * Activate an uninitialized pool.
  */
 static void
-spa_activate(spa_t *spa, int mode)
+spa_activate(spa_t *spa, spa_mode_t mode)
 {
        ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 
@@ -1383,7 +1396,7 @@ spa_deactivate(spa_t *spa)
  * in the CLOSED state.  This will prep the pool before open/creation/import.
  * All vdev validation is done by the vdev_alloc() routine.
  */
-static int
+int
 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
     uint_t id, int atype)
 {
@@ -1424,26 +1437,122 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
        return (0);
 }
 
+/*
+ * Returns B_TRUE only when every one of the following holds: the log
+ * space map feature is active, the pool is writeable, syncing is on, the
+ * pool state is POOL_STATE_EXPORTED, and the
+ * zfs_keep_log_spacemaps_at_export tunable is not set.
+ */
+static boolean_t
+spa_should_flush_logs_on_unload(spa_t *spa)
+{
+       /* The checks short-circuit in the order listed above. */
+       if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) &&
+           spa_writeable(spa) &&
+           spa->spa_sync_on &&
+           spa_state(spa) == POOL_STATE_EXPORTED &&
+           !zfs_keep_log_spacemaps_at_export)
+               return (B_TRUE);
+
+       return (B_FALSE);
+}
+
+/*
+ * Assign a transaction against the MOS directory, record its txg in
+ * spa_log_flushall_txg (the flag that instructs spa_sync to attempt to
+ * flush all the metaslabs for that txg), commit the transaction and then
+ * wait for that txg to be synced.
+ */
+static void
+spa_unload_log_sm_flush_all(spa_t *spa)
+{
+       dsl_pool_t *dp = spa_get_dsl(spa);
+       dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir);
+
+       VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+       /* The flush-all txg must not already be set. */
+       ASSERT3U(spa->spa_log_flushall_txg, ==, 0);
+       spa->spa_log_flushall_txg = dmu_tx_get_txg(tx);
+       dmu_tx_commit(tx);
+
+       txg_wait_synced(dp, spa->spa_log_flushall_txg);
+}
+
+/*
+ * Free the in-core log space map bookkeeping: every node of the
+ * spa_sm_logs_by_txg AVL tree, every entry of the spa_log_summary list,
+ * and the unflushed statistics counters.  Each freed node or entry is
+ * verified to have a zero metaslab count.
+ */
+static void
+spa_unload_log_sm_metadata(spa_t *spa)
+{
+       void *avl_cookie = NULL;
+       spa_log_sm_t *log_sm;
+       log_summary_entry_t *entry;
+
+       while ((log_sm = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
+           &avl_cookie)) != NULL) {
+               VERIFY0(log_sm->sls_mscount);
+               kmem_free(log_sm, sizeof (spa_log_sm_t));
+       }
+
+       /* Pop entries off the head of the summary list until it is empty. */
+       while ((entry = list_head(&spa->spa_log_summary)) != NULL) {
+               VERIFY0(entry->lse_mscount);
+               list_remove(&spa->spa_log_summary, entry);
+               kmem_free(entry, sizeof (log_summary_entry_t));
+       }
+
+       spa->spa_unflushed_stats.sus_nblocks = 0;
+       spa->spa_unflushed_stats.sus_memused = 0;
+       spa->spa_unflushed_stats.sus_blocklimit = 0;
+}
+
+/*
+ * Destroy each of the pool's auxiliary zthrs that currently exists
+ * (condense, checkpoint discard, livelist delete, livelist condense, in
+ * that order) and reset the corresponding spa_t pointer to NULL.
+ */
+static void
+spa_destroy_aux_threads(spa_t *spa)
+{
+       zthr_t **aux_zthrs[] = {
+               &spa->spa_condense_zthr,
+               &spa->spa_checkpoint_discard_zthr,
+               &spa->spa_livelist_delete_zthr,
+               &spa->spa_livelist_condense_zthr
+       };
+       size_t nzthrs = sizeof (aux_zthrs) / sizeof (aux_zthrs[0]);
+
+       for (size_t i = 0; i < nzthrs; i++) {
+               zthr_t **slot = aux_zthrs[i];
+
+               /* Never created, or already torn down. */
+               if (*slot == NULL)
+                       continue;
+
+               zthr_destroy(*slot);
+               *slot = NULL;
+       }
+}
+
 /*
  * Opposite of spa_load().
  */
 static void
 spa_unload(spa_t *spa)
 {
-       int i;
-
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);
 
+       spa_import_progress_remove(spa_guid(spa));
        spa_load_note(spa, "UNLOADING");
 
+       spa_wake_waiters(spa);
+
+       /*
+        * If the log space map feature is enabled and the pool is getting
+        * exported (but not destroyed), we want to spend some time flushing
+        * as many metaslabs as we can in an attempt to destroy log space
+        * maps and save import time.
+        */
+       if (spa_should_flush_logs_on_unload(spa))
+               spa_unload_log_sm_flush_all(spa);
+
        /*
         * Stop async tasks.
         */
        spa_async_suspend(spa);
 
        if (spa->spa_root_vdev) {
-               vdev_initialize_stop_all(spa->spa_root_vdev,
-                   VDEV_INITIALIZE_ACTIVE);
+               vdev_t *root_vdev = spa->spa_root_vdev;
+               vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE);
+               vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
+               vdev_autotrim_stop_all(spa);
+               vdev_rebuild_stop_all(spa);
        }
 
        /*
@@ -1455,16 +1564,15 @@ spa_unload(spa_t *spa)
        }
 
        /*
-        * Even though vdev_free() also calls vdev_metaslab_fini, we need
-        * to call it earlier, before we wait for async i/o to complete.
-        * This ensures that there is no async metaslab prefetching, by
-        * calling taskq_wait(mg_taskq).
+        * This ensures that there is no async metaslab prefetching
+        * while we attempt to unload the spa.
         */
        if (spa->spa_root_vdev != NULL) {
-               spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
-               for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
-                       vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
-               spa_config_exit(spa, SCL_ALL, spa);
+               for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
+                       vdev_t *vc = spa->spa_root_vdev->vdev_child[c];
+                       if (vc->vdev_mg != NULL)
+                               taskq_wait(vc->vdev_mg->mg_taskq);
+               }
        }
 
        if (spa->spa_mmp.mmp_thread)
@@ -1485,15 +1593,7 @@ spa_unload(spa_t *spa)
                spa->spa_vdev_removal = NULL;
        }
 
-       if (spa->spa_condense_zthr != NULL) {
-               zthr_destroy(spa->spa_condense_zthr);
-               spa->spa_condense_zthr = NULL;
-       }
-
-       if (spa->spa_checkpoint_discard_zthr != NULL) {
-               zthr_destroy(spa->spa_checkpoint_discard_zthr);
-               spa->spa_checkpoint_discard_zthr = NULL;
-       }
+       spa_destroy_aux_threads(spa);
 
        spa_condense_fini(spa);
 
@@ -1518,13 +1618,14 @@ spa_unload(spa_t *spa)
        }
 
        ddt_unload(spa);
+       spa_unload_log_sm_metadata(spa);
 
        /*
         * Drop and purge level 2 cache
         */
        spa_l2cache_drop(spa);
 
-       for (i = 0; i < spa->spa_spares.sav_count; i++)
+       for (int i = 0; i < spa->spa_spares.sav_count; i++)
                vdev_free(spa->spa_spares.sav_vdevs[i]);
        if (spa->spa_spares.sav_vdevs) {
                kmem_free(spa->spa_spares.sav_vdevs,
@@ -1537,7 +1638,7 @@ spa_unload(spa_t *spa)
        }
        spa->spa_spares.sav_count = 0;
 
-       for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
+       for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
                vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
                vdev_free(spa->spa_l2cache.sav_vdevs[i]);
        }
@@ -1786,6 +1887,15 @@ spa_load_l2cache(spa_t *spa)
 
                        if (!vdev_is_dead(vd))
                                l2arc_add_vdev(spa, vd);
+
+                       /*
+                        * Upon cache device addition to a pool or pool
+                        * creation with a cache device or if the header
+                        * of the device is invalid we issue an async
+                        * TRIM command for the whole device which will
+                        * execute if l2arc_trim_ahead > 0.
+                        */
+                       spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
                }
        }
 
@@ -2095,16 +2205,16 @@ spa_load_verify_done(zio_t *zio)
        }
 
        mutex_enter(&spa->spa_scrub_lock);
-       spa->spa_load_verify_ios--;
+       spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
        cv_broadcast(&spa->spa_scrub_io_cv);
        mutex_exit(&spa->spa_scrub_lock);
 }
 
 /*
- * Maximum number of concurrent scrub i/os to create while verifying
- * a pool while importing it.
+ * Maximum number of inflight bytes is the log2 fraction of the arc size.
+ * By default, we set it to 1/16th of the arc.
  */
-int spa_load_verify_maxinflight = 10000;
+int spa_load_verify_shift = 4;
 int spa_load_verify_metadata = B_TRUE;
 int spa_load_verify_data = B_TRUE;
 
@@ -2113,7 +2223,8 @@ static int
 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
-       if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+       if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
+           BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
                return (0);
        /*
         * Note: normally this routine will not be called if
@@ -2125,13 +2236,15 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
        if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
                return (0);
 
+       uint64_t maxinflight_bytes =
+           arc_target_bytes() >> spa_load_verify_shift;
        zio_t *rio = arg;
        size_t size = BP_GET_PSIZE(bp);
 
        mutex_enter(&spa->spa_scrub_lock);
-       while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight)
+       while (spa->spa_load_verify_bytes >= maxinflight_bytes)
                cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-       spa->spa_load_verify_ios++;
+       spa->spa_load_verify_bytes += size;
        mutex_exit(&spa->spa_scrub_lock);
 
        zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
@@ -2142,7 +2255,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 }
 
 /* ARGSUSED */
-int
+static int
 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
        if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
@@ -2184,12 +2297,14 @@ spa_load_verify(spa_t *spa)
                            "spa_load_verify_metadata=%u)",
                            spa_load_verify_data, spa_load_verify_metadata);
                }
+
                error = traverse_pool(spa, spa->spa_verify_min_txg,
                    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
                    TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio);
        }
 
        (void) zio_wait(rio);
+       ASSERT0(spa->spa_load_verify_bytes);
 
        spa->spa_load_meta_errors = sle.sle_meta_count;
        spa->spa_load_data_errors = sle.sle_data_count;
@@ -2266,6 +2381,381 @@ spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
        return (SET_ERROR(err));
 }
 
+/*
+ * Report whether spa_livelists_to_delete is non-zero, i.e. whether there
+ * are still livelists left to delete for this pool.
+ */
+boolean_t
+spa_livelist_delete_check(spa_t *spa)
+{
+       if (spa->spa_livelists_to_delete == 0)
+               return (B_FALSE);
+
+       return (B_TRUE);
+}
+
+/*
+ * zthr check function for the livelist delete thread: arg is the spa_t;
+ * the zthr handle itself is unused.
+ */
+/* ARGSUSED */
+static boolean_t
+spa_livelist_delete_cb_check(void *arg, zthr_t *z)
+{
+       return (spa_livelist_delete_check((spa_t *)arg));
+}
+
+/*
+ * bplist iteration callback: free one block pointer in tx's txg and
+ * subtract its sizes from the pool's free dir (DD_USED_HEAD) accounting.
+ * arg is the spa_t.  Always returns 0.
+ */
+static int
+delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+       spa_t *spa = arg;
+
+       zio_free(spa, tx->tx_txg, bp);
+
+       /* Sizes are negated because the space is being given back. */
+       uint64_t dsize = bp_get_dsize_sync(spa, bp);
+       uint64_t psize = BP_GET_PSIZE(bp);
+       uint64_t ucsize = BP_GET_UCSIZE(bp);
+
+       dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
+           -dsize, -psize, -ucsize, tx);
+
+       return (0);
+}
+
+/*
+ * Retrieve the first attribute of the ZAP object zap_obj in os and, on
+ * success, store its first integer value in *llp.  Returns 0 on success or
+ * the error from zap_cursor_retrieve(), in which case *llp is untouched.
+ */
+static int
+dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp)
+{
+       zap_cursor_t cursor;
+       zap_attribute_t attr;
+       int error;
+
+       zap_cursor_init(&cursor, os, zap_obj);
+       error = zap_cursor_retrieve(&cursor, &attr);
+       zap_cursor_fini(&cursor);
+
+       if (error != 0)
+               return (error);
+
+       *llp = attr.za_first_integer;
+       return (0);
+}
+
+/*
+ * Components of livelist deletion that must be performed in syncing
+ * context: freeing block pointers and updating the pool-wide data
+ * structures to indicate how much work is left to do.
+ *
+ * sublist_delete_arg_t carries the arguments of sublist_delete_sync().
+ */
+typedef struct sublist_delete_arg {
+       spa_t *spa;             /* handed to delete_blkptr_cb for each bp */
+       dsl_deadlist_t *ll;     /* deadlist the entry is removed from */
+       uint64_t key;           /* key of the entry to remove from ll */
+       bplist_t *to_free;      /* block pointers to free */
+} sublist_delete_arg_t;
+
+/*
+ * Sync task: free every block pointer on the to_free list via
+ * delete_blkptr_cb(), then remove the entry identified by key from the
+ * deadlist.  arg is a sublist_delete_arg_t.
+ */
+static void
+sublist_delete_sync(void *arg, dmu_tx_t *tx)
+{
+       sublist_delete_arg_t *sda = arg;
+
+       bplist_iterate(sda->to_free, delete_blkptr_cb, sda->spa, tx);
+       dsl_deadlist_remove_entry(sda->ll, sda->key, tx);
+}
+
+typedef struct livelist_delete_arg {
+       spa_t *spa;             /* pool whose livelist is being deleted */
+       uint64_t ll_obj;        /* livelist object freed by the sync task */
+       uint64_t zap_obj;       /* ZAP object from which ll_obj is removed */
+} livelist_delete_arg_t;
+
+/*
+ * Sync task: remove ll_obj from the ZAP zap_obj, free the livelist and
+ * decrement the livelist feature count.  If that leaves the ZAP empty,
+ * also remove the DMU_POOL_DELETED_CLONES directory entry, destroy the
+ * ZAP, clear spa_livelists_to_delete and notify waiters.  arg is a
+ * livelist_delete_arg_t.
+ */
+static void
+livelist_delete_sync(void *arg, dmu_tx_t *tx)
+{
+       livelist_delete_arg_t *lda = arg;
+       spa_t *spa = lda->spa;
+       objset_t *mos = spa->spa_meta_objset;
+       uint64_t remaining;
+
+       /* free the livelist and decrement the feature count */
+       VERIFY0(zap_remove_int(mos, lda->zap_obj, lda->ll_obj, tx));
+       dsl_deadlist_free(mos, lda->ll_obj, tx);
+       spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
+
+       VERIFY0(zap_count(mos, lda->zap_obj, &remaining));
+       if (remaining != 0)
+               return;
+
+       /* no more livelists to delete */
+       VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_DELETED_CLONES, tx));
+       VERIFY0(zap_destroy(mos, lda->zap_obj, tx));
+       spa->spa_livelists_to_delete = 0;
+       spa_notify_waiters(spa);
+}
+
+/*
+ * Load in the value for the livelist to be removed and open it. Then,
+ * load its first sublist and determine which block pointers should actually
+ * be freed. Then, call a synctask which performs the actual frees and updates
+ * the pool-wide livelist data.
+ *
+ * arg is the spa_t of the pool; z is the zthr running this callback and is
+ * handed on to dsl_process_sub_livelist(). Once zap_count() reports that the
+ * livelist has no entries left, a second synctask (livelist_delete_sync)
+ * frees the livelist itself.
+ */
+/* ARGSUSED */
+static void
+spa_livelist_delete_cb(void *arg, zthr_t *z)
+{
+       spa_t *spa = arg;
+       uint64_t ll_obj = 0, count;
+       objset_t *mos = spa->spa_meta_objset;
+       uint64_t zap_obj = spa->spa_livelists_to_delete;
+       /*
+        * Determine the next livelist to delete. This function should only
+        * be called if there is at least one deleted clone.
+        */
+       VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj));
+       VERIFY0(zap_count(mos, ll_obj, &count));
+       if (count > 0) {
+               dsl_deadlist_t *ll;
+               dsl_deadlist_entry_t *dle;
+               bplist_t to_free;
+               /*
+                * The deadlist is allocated from the heap rather than
+                * declared as a local to keep this function's stack frame
+                * small.
+                */
+               ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP);
+               dsl_deadlist_open(ll, mos, ll_obj);
+               dle = dsl_deadlist_first(ll);
+               ASSERT3P(dle, !=, NULL);
+               bplist_create(&to_free);
+               int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
+                   z, NULL);
+               if (err == 0) {
+                       sublist_delete_arg_t sync_arg = {
+                           .spa = spa,
+                           .ll = ll,
+                           .key = dle->dle_mintxg,
+                           .to_free = &to_free
+                       };
+                       /*
+                        * The arguments are uint64_t; cast them so they
+                        * match the long long conversions on every
+                        * platform. count is > 0 here, so count - 1 cannot
+                        * wrap.
+                        */
+                       zfs_dbgmsg("deleting sublist (id %llu) from"
+                           " livelist %llu, %llu remaining",
+                           (u_longlong_t)dle->dle_bpobj.bpo_object,
+                           (u_longlong_t)ll_obj,
+                           (u_longlong_t)(count - 1));
+                       VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+                           sublist_delete_sync, &sync_arg, 0,
+                           ZFS_SPACE_CHECK_DESTROY));
+               } else {
+                       /* EINTR is the only failure tolerated here. */
+                       VERIFY3U(err, ==, EINTR);
+               }
+               bplist_clear(&to_free);
+               bplist_destroy(&to_free);
+               dsl_deadlist_close(ll);
+               kmem_free(ll, sizeof (dsl_deadlist_t));
+       } else {
+               livelist_delete_arg_t sync_arg = {
+                   .spa = spa,
+                   .ll_obj = ll_obj,
+                   .zap_obj = zap_obj
+               };
+               zfs_dbgmsg("deletion of livelist %llu completed",
+                   (u_longlong_t)ll_obj);
+               VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync,
+                   &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY));
+       }
+}
+
+/*
+ * Create the "z_livelist_destroy" zthr for this pool and store it in
+ * spa_livelist_delete_zthr, which must not already be set.
+ */
+static void
+spa_start_livelist_destroy_thread(spa_t *spa)
+{
+       zthr_t *destroy_zthr;
+
+       ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL);
+
+       destroy_zthr = zthr_create("z_livelist_destroy",
+           spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa);
+       spa->spa_livelist_delete_zthr = destroy_zthr;
+}
+
+typedef struct livelist_new_arg {
+       bplist_t *allocs;       /* receives bps seen with bp_freed false */
+       bplist_t *frees;        /* receives bps seen with bp_freed true */
+} livelist_new_arg_t;
+
+/*
+ * Iteration callback that sorts a block pointer into one of the two
+ * bplists in the livelist_new_arg_t passed as arg: frees when bp_freed is
+ * set, allocs otherwise.  Each alloc also bumps the
+ * zfs_livelist_condense_new_alloc counter.  tx must be NULL.  Always
+ * returns 0.
+ */
+static int
+livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
+{
+       ASSERT(tx == NULL);
+       livelist_new_arg_t *lna = arg;
+
+       if (bp_freed) {
+               bplist_append(lna->frees, bp);
+               return (0);
+       }
+
+       bplist_append(lna->allocs, bp);
+       zfs_livelist_condense_new_alloc++;
+       return (0);
+}
+
+typedef struct livelist_condense_arg {
+       spa_t *spa;             /* pool holding the spa_to_condense state */
+       bplist_t to_keep;       /* bps re-inserted as ALLOCs when syncing */
+       uint64_t first_size;    /* saved blkptr count of the first entry */
+       uint64_t next_size;     /* saved blkptr count of the next entry */
+} livelist_condense_arg_t;
+
+/*
+ * Sync function that condenses the two livelist entries recorded in
+ * spa_to_condense (first and next) into one.  arg is a
+ * livelist_condense_arg_t; this function destroys its to_keep list and
+ * frees the structure.
+ *
+ * If the condense has been cancelled, only the cleanup at "out" runs.
+ * Otherwise any blkptrs appended to either entry beyond the sizes saved in
+ * lca are collected, the first entry is cleared, the next entry is
+ * removed, and the kept ALLOCs plus the newly seen FREEs are inserted back
+ * into the livelist.
+ *
+ * In all cases the hold on the dataset's dbuf is released and
+ * spa_to_condense.ds and spa_to_condense.syncing are reset.
+ */
+static void
+spa_livelist_condense_sync(void *arg, dmu_tx_t *tx)
+{
+       livelist_condense_arg_t *lca = arg;
+       spa_t *spa = lca->spa;
+       bplist_t new_frees;
+       dsl_dataset_t *ds = spa->spa_to_condense.ds;
+
+       /* Have we been cancelled? */
+       if (spa->spa_to_condense.cancelled) {
+               zfs_livelist_condense_sync_cancel++;
+               goto out;
+       }
+
+       dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
+       dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
+       dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
+
+       /*
+        * It's possible that the livelist was changed while the zthr was
+        * running. Therefore, we need to check for new blkptrs in the two
+        * entries being condensed and continue to track them in the livelist.
+        * Because of the way we handle remapped blkptrs (see dbuf_remap_impl),
+        * it's possible that the newly added blkptrs are FREEs or ALLOCs so
+        * we need to sort them into two different bplists.
+        */
+       uint64_t first_obj = first->dle_bpobj.bpo_object;
+       uint64_t next_obj = next->dle_bpobj.bpo_object;
+       uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs;
+       uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
+
+       bplist_create(&new_frees);
+       livelist_new_arg_t new_bps = {
+           .allocs = &lca->to_keep,
+           .frees = &new_frees,
+       };
+
+       /* Only walk the tail that was appended after the sizes were saved. */
+       if (cur_first_size > lca->first_size) {
+               VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj,
+                   livelist_track_new_cb, &new_bps, lca->first_size));
+       }
+       if (cur_next_size > lca->next_size) {
+               VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj,
+                   livelist_track_new_cb, &new_bps, lca->next_size));
+       }
+
+       dsl_deadlist_clear_entry(first, ll, tx);
+       ASSERT(bpobj_is_empty(&first->dle_bpobj));
+       dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx);
+
+       bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx);
+       bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx);
+       bplist_destroy(&new_frees);
+
+       char dsname[ZFS_MAX_DATASET_NAME_LEN];
+       dsl_dataset_name(ds, dsname);
+       /*
+        * Every numeric argument is a uint64_t; cast each one so it matches
+        * the %llu conversion regardless of how uint64_t is defined.
+        */
+       zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu "
+           "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu "
+           "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname,
+           (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj,
+           (u_longlong_t)cur_first_size, (u_longlong_t)next_obj,
+           (u_longlong_t)cur_next_size,
+           (u_longlong_t)first->dle_bpobj.bpo_object,
+           (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs);
+out:
+       dmu_buf_rele(ds->ds_dbuf, spa);
+       spa->spa_to_condense.ds = NULL;
+       bplist_clear(&lca->to_keep);
+       bplist_destroy(&lca->to_keep);
+       kmem_free(lca, sizeof (livelist_condense_arg_t));
+       spa->spa_to_condense.syncing = B_FALSE;
+}
+
+/*
+ * Work function of the livelist condense zthr ('arg' is the spa_t).
+ *
+ * Processes the two entries recorded in spa->spa_to_condense in open
+ * context, collecting the result in a freshly allocated
+ * livelist_condense_arg_t, and then dispatches
+ * spa_livelist_condense_sync() as a sync task, which takes over ownership
+ * of that argument.  If processing fails or no tx can be assigned, the
+ * argument is freed here, the dataset dbuf hold is released and
+ * spa_to_condense.ds is cleared.
+ */
+static void
+spa_livelist_condense_cb(void *arg, zthr_t *t)
+{
+       while (zfs_livelist_condense_zthr_pause &&
+           !(zthr_has_waiters(t) || zthr_iscancelled(t)))
+               delay(1);
+
+       spa_t *spa = arg;
+       dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
+       dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
+       uint64_t first_size, next_size;
+
+       livelist_condense_arg_t *lca =
+           kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP);
+       bplist_create(&lca->to_keep);
+
+       /*
+        * Process the livelists (matching FREEs and ALLOCs) in open context
+        * so we have minimal work in syncing context to condense.
+        *
+        * We save bpobj sizes (first_size and next_size) to use later in
+        * syncing context to determine if entries were added to these sublists
+        * while in open context. This is possible because the clone is still
+        * active and open for normal writes and we want to make sure the new,
+        * unprocessed blockpointers are inserted into the livelist normally.
+        *
+        * Note that dsl_process_sub_livelist() both stores the number of
+        * blockpointers and iterates over them while the bpobj's lock is
+        * held, so the sizes returned to us are consistent with what was
+        * actually processed.
+        */
+       int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
+           &first_size);
+       if (err == 0)
+               err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
+                   t, &next_size);
+
+       if (err == 0) {
+               while (zfs_livelist_condense_sync_pause &&
+                   !(zthr_has_waiters(t) || zthr_iscancelled(t)))
+                       delay(1);
+
+               dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+               dmu_tx_mark_netfree(tx);
+               dmu_tx_hold_space(tx, 1);
+               err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE);
+               if (err == 0) {
+                       /*
+                        * Prevent the condense zthr restarting before
+                        * the synctask completes.
+                        */
+                       spa->spa_to_condense.syncing = B_TRUE;
+                       lca->spa = spa;
+                       lca->first_size = first_size;
+                       lca->next_size = next_size;
+                       dsl_sync_task_nowait(spa_get_dsl(spa),
+                           spa_livelist_condense_sync, lca, tx);
+                       dmu_tx_commit(tx);
+                       return;
+               }
+               /*
+                * The tx could not be assigned, so it will never be
+                * committed; abort it here so that it is not leaked.
+                */
+               dmu_tx_abort(tx);
+       }
+       /*
+        * Condensing can not continue: either it was externally stopped or
+        * we were unable to assign to a tx because the pool has run out of
+        * space. In the second case, we'll just end up trying to condense
+        * again in a later txg.
+        */
+       ASSERT(err != 0);
+       bplist_clear(&lca->to_keep);
+       bplist_destroy(&lca->to_keep);
+       kmem_free(lca, sizeof (livelist_condense_arg_t));
+       dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
+       spa->spa_to_condense.ds = NULL;
+       if (err == EINTR)
+               zfs_livelist_condense_zthr_cancel++;
+}
+
+/*
+ * Check function handed to zthr_create() for the livelist condense zthr.
+ * Returns B_TRUE only when a dataset is queued in spa_to_condense and that
+ * condense is neither already syncing nor cancelled; B_FALSE otherwise.
+ */
+/* ARGSUSED */
+static boolean_t
+spa_livelist_condense_cb_check(void *arg, zthr_t *z)
+{
+       spa_t *spa = arg;
+
+       /* Nothing is queued for condensing. */
+       if (spa->spa_to_condense.ds == NULL)
+               return (B_FALSE);
+
+       /* A dispatched condense has not finished syncing yet. */
+       if (spa->spa_to_condense.syncing != B_FALSE)
+               return (B_FALSE);
+
+       /* Condensing has been cancelled. */
+       if (spa->spa_to_condense.cancelled != B_FALSE)
+               return (B_FALSE);
+
+       return (B_TRUE);
+}
+
+/*
+ * Clear every field of spa->spa_to_condense and create the
+ * "z_livelist_condense" zthr, storing it in
+ * spa->spa_livelist_condense_zthr.  The zthr must not exist yet.
+ */
+static void
+spa_start_livelist_condensing_thread(spa_t *spa)
+{
+       ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL);
+
+       /* Nothing queued, nothing in flight, nothing cancelled. */
+       spa->spa_to_condense.cancelled = B_FALSE;
+       spa->spa_to_condense.syncing = B_FALSE;
+       spa->spa_to_condense.next = NULL;
+       spa->spa_to_condense.first = NULL;
+       spa->spa_to_condense.ds = NULL;
+
+       zthr_t *condense_zthr = zthr_create("z_livelist_condense",
+           spa_livelist_condense_cb_check, spa_livelist_condense_cb, spa);
+       spa->spa_livelist_condense_zthr = condense_zthr;
+}
+
 static void
 spa_spawn_aux_threads(spa_t *spa)
 {
@@ -2274,10 +2764,13 @@ spa_spawn_aux_threads(spa_t *spa)
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
        spa_start_indirect_condensing_thread(spa);
+       spa_start_livelist_destroy_thread(spa);
+       spa_start_livelist_condensing_thread(spa);
 
        ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
        spa->spa_checkpoint_discard_zthr =
-           zthr_create(spa_checkpoint_discard_thread_check,
+           zthr_create("z_checkpoint_discard",
+           spa_checkpoint_discard_thread_check,
            spa_checkpoint_discard_thread, spa);
 }
 
@@ -2370,6 +2863,8 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
        int error;
 
        spa->spa_load_state = state;
+       (void) spa_import_progress_set_state(spa_guid(spa),
+           spa_load_state(spa));
 
        gethrestime(&spa->spa_loaded_ts);
        error = spa_load_impl(spa, type, &ereport);
@@ -2386,12 +2881,16 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
                        spa->spa_loaded_ts.tv_nsec = 0;
                }
                if (error != EBADF) {
-                       zfs_ereport_post(ereport, spa, NULL, NULL, NULL, 0, 0);
+                       (void) zfs_ereport_post(ereport, spa,
+                           NULL, NULL, NULL, 0);
                }
        }
        spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
        spa->spa_ena = 0;
 
+       (void) spa_import_progress_set_state(spa_guid(spa),
+           spa_load_state(spa));
+
        return (error);
 }
 
@@ -2464,6 +2963,7 @@ spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
         */
        if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0)
                return (B_FALSE);
+
        /*
         * If the tryconfig_ values are nonzero, they are the results of an
         * earlier tryimport.  If they all match the uberblock we just found,
@@ -2484,7 +2984,7 @@ spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
        if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID))
                hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID);
 
-       if (hostid == spa_get_hostid())
+       if (hostid == spa_get_hostid(spa))
                return (B_FALSE);
 
        /*
@@ -2545,7 +3045,7 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
 
        } else if (MMP_VALID(ub)) {
                /*
-                * zfs-0.7 compatability case
+                * zfs-0.7 compatibility case
                 */
 
                import_delay = MAX(import_delay, (multihost_interval +
@@ -2612,10 +3112,14 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
        import_delay = spa_activity_check_duration(spa, ub);
 
        /* Add a small random factor in case of simultaneous imports (0-25%) */
-       import_expire = gethrtime() + import_delay +
-           (import_delay * spa_get_random(250) / 1000);
+       import_delay += import_delay * spa_get_random(250) / 1000;
+
+       import_expire = gethrtime() + import_delay;
 
        while (gethrtime() < import_expire) {
+               (void) spa_import_progress_set_mmp_check(spa_guid(spa),
+                   NSEC2SEC(import_expire - gethrtime()));
+
                vdev_uberblock_load(rvd, ub, &mmp_label);
 
                if (txg != ub->ub_txg || timestamp != ub->ub_timestamp ||
@@ -2711,7 +3215,8 @@ spa_verify_host(spa_t *spa, nvlist_t *mos_config)
                        cmn_err(CE_WARN, "pool '%s' could not be "
                            "loaded as it was last accessed by "
                            "another system (host: %s hostid: 0x%llx). "
-                           "See: http://illumos.org/msg/ZFS-8000-EY",
+                           "See: https://openzfs.github.io/openzfs-docs/msg/"
+                           "ZFS-8000-EY",
                            spa_name(spa), hostname, (u_longlong_t)hostid);
                        spa_load_failed(spa, "hostid verification failed: pool "
                            "last accessed by host: %s (hostid: 0x%llx)",
@@ -2864,7 +3369,7 @@ spa_ld_open_vdevs(spa_t *spa)
        if (spa->spa_missing_tvds != 0) {
                spa_load_note(spa, "vdev tree has %lld missing top-level "
                    "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
-               if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) {
+               if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) {
                        /*
                         * Although theoretically we could allow users to open
                         * incomplete pools in RW mode, we'd need to add a lot
@@ -2982,6 +3487,10 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
        }
 
+       if (spa->spa_load_max_txg != UINT64_MAX) {
+               (void) spa_import_progress_set_max_txg(spa_guid(spa),
+                   (u_longlong_t)spa->spa_load_max_txg);
+       }
        spa_load_note(spa, "using uberblock with txg=%llu",
            (u_longlong_t)ub->ub_txg);
 
@@ -2995,7 +3504,7 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
            spa->spa_config);
        if (activity_check) {
                if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
-                   spa_get_hostid() == 0) {
+                   spa_get_hostid(spa) == 0) {
                        nvlist_free(label);
                        fnvlist_add_uint64(spa->spa_load_info,
                            ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
@@ -3520,6 +4029,15 @@ spa_ld_get_props(spa_t *spa)
        if (error != 0 && error != ENOENT)
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
+       /*
+        * Load the livelist deletion field. If a livelist is queued for
+        * deletion, indicate that in the spa.
+        */
+       error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES,
+           &spa->spa_livelists_to_delete, B_FALSE);
+       if (error != 0 && error != ENOENT)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
        /*
         * Load the history object.  If we have an older pool, this
         * will not be present.
@@ -3583,9 +4101,7 @@ spa_ld_get_props(spa_t *spa)
                spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
                spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
                spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
-               spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
-                   &spa->spa_dedup_ditto);
-
+               spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
                spa->spa_autoreplace = (autoreplace != 0);
        }
 
@@ -3675,7 +4191,7 @@ spa_ld_load_vdev_metadata(spa_t *spa)
         * be imported when the system hostid is zero.  The exception to
         * this rule is zdb which is always allowed to access pools.
         */
-       if (spa_multihost(spa) && spa_get_hostid() == 0 &&
+       if (spa_multihost(spa) && spa_get_hostid(spa) == 0 &&
            (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) {
                fnvlist_add_uint64(spa->spa_load_info,
                    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
@@ -3711,11 +4227,18 @@ spa_ld_load_vdev_metadata(spa_t *spa)
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
        }
 
+       error = spa_ld_log_spacemaps(spa);
+       if (error != 0) {
+               spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]",
+                   error);
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+       }
+
        /*
         * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
         */
        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-       vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
+       vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE);
        spa_config_exit(spa, SCL_ALL, FTAG);
 
        return (0);
@@ -3832,7 +4355,7 @@ spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
                        need_update = B_TRUE;
 
        /*
-        * Update the config cache asychronously in case we're the
+        * Update the config cache asynchronously in case we're the
         * root pool, in which case the config cache isn't writable yet.
         */
        if (need_update)
@@ -3842,7 +4365,7 @@ spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
 static void
 spa_ld_prepare_for_reload(spa_t *spa)
 {
-       int mode = spa->spa_mode;
+       spa_mode_t mode = spa->spa_mode;
        int async_suspended = spa->spa_async_suspended;
 
        spa_unload(spa);
@@ -3911,6 +4434,8 @@ spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
        if (error != 0)
                return (error);
 
+       spa_import_progress_add(spa);
+
        /*
         * Now that we have the vdev tree, try to open each vdev. This involves
         * opening the underlying physical device, retrieving its geometry and
@@ -4143,7 +4668,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
                        return (error);
 
                /*
-                * Redo the loading process process again with the
+                * Redo the loading process again with the
                 * checkpointed uberblock.
                 */
                spa_ld_prepare_for_reload(spa);
@@ -4302,11 +4827,16 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
                    update_config_cache);
 
                /*
-                * Check all DTLs to see if anything needs resilvering.
+                * Check if a rebuild was in progress and if so resume it.
+                * Then check all DTLs to see if anything needs resilvering.
+                * The resilver will be deferred if a rebuild was started.
                 */
-               if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
-                   vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+               if (vdev_rebuild_active(spa->spa_root_vdev)) {
+                       vdev_rebuild_restart(spa);
+               } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
+                   vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
                        spa_async_request(spa, SPA_ASYNC_RESILVER);
+               }
 
                /*
                 * Log the fact that we booted up (so that we can detect if
@@ -4336,9 +4866,14 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
 
                spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
                vdev_initialize_restart(spa->spa_root_vdev);
+               vdev_trim_restart(spa->spa_root_vdev);
+               vdev_autotrim_restart(spa);
                spa_config_exit(spa, SCL_CONFIG, FTAG);
        }
 
+       spa_import_progress_remove(spa_guid(spa));
+       spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
+
        spa_load_note(spa, "LOADED");
 
        return (0);
@@ -4347,7 +4882,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
 static int
 spa_load_retry(spa_t *spa, spa_load_state_t state)
 {
-       int mode = spa->spa_mode;
+       spa_mode_t mode = spa->spa_mode;
 
        spa_unload(spa);
        spa_deactivate(spa);
@@ -4399,6 +4934,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
                 * from previous txgs when spa_load fails.
                 */
                ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+               spa_import_progress_remove(spa_guid(spa));
                return (load_error);
        }
 
@@ -4410,6 +4946,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
 
        if (rewind_flags & ZPOOL_NEVER_REWIND) {
                nvlist_free(config);
+               spa_import_progress_remove(spa_guid(spa));
                return (load_error);
        }
 
@@ -4452,6 +4989,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
 
        if (state == SPA_LOAD_RECOVER) {
                ASSERT3P(loadinfo, ==, NULL);
+               spa_import_progress_remove(spa_guid(spa));
                return (rewind_error);
        } else {
                /* Store the rewind info as part of the initial load info */
@@ -4462,6 +5000,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
                fnvlist_free(spa->spa_load_info);
                spa->spa_load_info = loadinfo;
 
+               spa_import_progress_remove(spa_guid(spa));
                return (load_error);
        }
 }
@@ -4589,7 +5128,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
        }
 
        if (firstopen)
-               zvol_create_minors(spa, spa_name(spa), B_TRUE);
+               zvol_create_minors_recursive(spa_name(spa));
 
        *spapp = spa;
 
@@ -5095,6 +5634,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
        uint64_t version, obj;
        boolean_t has_features;
        boolean_t has_encryption;
+       boolean_t has_allocclass;
        spa_feature_t feat;
        char *feat_name;
        char *poolname;
@@ -5139,6 +5679,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 
        has_features = B_FALSE;
        has_encryption = B_FALSE;
+       has_allocclass = B_FALSE;
        for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
            elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
                if (zpool_prop_feature(nvpair_name(elem))) {
@@ -5148,6 +5689,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
                        VERIFY0(zfeature_lookup_name(feat_name, &feat));
                        if (feat == SPA_FEATURE_ENCRYPTION)
                                has_encryption = B_TRUE;
+                       if (feat == SPA_FEATURE_ALLOCATION_CLASSES)
+                               has_allocclass = B_TRUE;
                }
        }
 
@@ -5161,6 +5704,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
                        return (error);
                }
        }
+       if (!has_allocclass && zfs_special_devs(nvroot, NULL)) {
+               spa_deactivate(spa);
+               spa_remove(spa);
+               mutex_exit(&spa_namespace_lock);
+               return (ENOTSUP);
+       }
 
        if (has_features || nvlist_lookup_uint64(props,
            zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
@@ -5176,6 +5725,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
        spa->spa_removing_phys.sr_state = DSS_NONE;
        spa->spa_removing_phys.sr_removing_vdev = -1;
        spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
+       spa->spa_indirect_vdevs_loaded = B_TRUE;
 
        /*
         * Create "The Godfather" zio to hold all async IOs
@@ -5212,6 +5762,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
                for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
                        vdev_t *vd = rvd->vdev_child[c];
 
+                       vdev_ashift_optimize(vd);
                        vdev_metaslab_set_size(vd);
                        vdev_expand(vd, txg);
                }
@@ -5338,6 +5889,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
        spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
        spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
        spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
+       spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
 
        if (props != NULL) {
                spa_configfile_set(spa, props, B_FALSE);
@@ -5378,7 +5930,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
        char *altroot = NULL;
        spa_load_state_t state = SPA_LOAD_IMPORT;
        zpool_load_policy_t policy;
-       uint64_t mode = spa_mode_global;
+       spa_mode_t mode = spa_mode_global;
        uint64_t readonly = B_FALSE;
        int error;
        nvlist_t *nvroot;
@@ -5402,7 +5954,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
        (void) nvlist_lookup_uint64(props,
            zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
        if (readonly)
-               mode = FREAD;
+               mode = SPA_MODE_READ;
        spa = spa_add(pool, config, altroot);
        spa->spa_import_flags = flags;
 
@@ -5544,10 +6096,10 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
 
        spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
 
-       zvol_create_minors(spa, pool, B_TRUE);
-
        mutex_exit(&spa_namespace_lock);
 
+       zvol_create_minors_recursive(pool);
+
        return (0);
 }
 
@@ -5572,7 +6124,7 @@ spa_tryimport(nvlist_t *tryconfig)
         */
        mutex_enter(&spa_namespace_lock);
        spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
-       spa_activate(spa, FREAD);
+       spa_activate(spa, SPA_MODE_READ);
 
        /*
         * Rewind pool if a max txg was provided.
@@ -5682,7 +6234,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
        if (oldconfig)
                *oldconfig = NULL;
 
-       if (!(spa_mode_global & FWRITE))
+       if (!(spa_mode_global & SPA_MODE_WRITE))
                return (SET_ERROR(EROFS));
 
        mutex_enter(&spa_namespace_lock);
@@ -5691,8 +6243,15 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
                return (SET_ERROR(ENOENT));
        }
 
-       /*
-        * Put a hold on the pool, drop the namespace lock, stop async tasks,
+       if (spa->spa_is_exporting) {
+               /* the pool is being exported by another thread */
+               mutex_exit(&spa_namespace_lock);
+               return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS));
+       }
+       spa->spa_is_exporting = B_TRUE;
+
+       /*
+        * Put a hold on the pool, drop the namespace lock, stop async tasks,
         * reacquire the namespace lock, and see if we can export.
         */
        spa_open_ref(spa, FTAG);
@@ -5726,6 +6285,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
            (spa->spa_inject_ref != 0 &&
            new_state != POOL_STATE_UNINITIALIZED)) {
                spa_async_resume(spa);
+               spa->spa_is_exporting = B_FALSE;
                mutex_exit(&spa_namespace_lock);
                return (SET_ERROR(EBUSY));
        }
@@ -5740,20 +6300,24 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
                if (!force && new_state == POOL_STATE_EXPORTED &&
                    spa_has_active_shared_spare(spa)) {
                        spa_async_resume(spa);
+                       spa->spa_is_exporting = B_FALSE;
                        mutex_exit(&spa_namespace_lock);
                        return (SET_ERROR(EXDEV));
                }
 
                /*
                 * We're about to export or destroy this pool. Make sure
-                * we stop all initializtion activity here before we
-                * set the spa_final_txg. This will ensure that all
+                * we stop all initialization and trim activity here before
+                * we set the spa_final_txg. This will ensure that all
                 * dirty data resulting from the initialization is
                 * committed to disk before we unload the pool.
                 */
                if (spa->spa_root_vdev != NULL) {
-                       vdev_initialize_stop_all(spa->spa_root_vdev,
-                           VDEV_INITIALIZE_ACTIVE);
+                       vdev_t *rvd = spa->spa_root_vdev;
+                       vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
+                       vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
+                       vdev_autotrim_stop_all(spa);
+                       vdev_rebuild_stop_all(spa);
                }
 
                /*
@@ -5789,9 +6353,16 @@ export_spa:
                if (!hardforce)
                        spa_write_cachefile(spa, B_TRUE, B_TRUE);
                spa_remove(spa);
+       } else {
+               /*
+                * If spa_remove() is not called for this spa_t and
+                * there is any possibility that it can be reused,
+                * we make sure to reset the exporting flag.
+                */
+               spa->spa_is_exporting = B_FALSE;
        }
-       mutex_exit(&spa_namespace_lock);
 
+       mutex_exit(&spa_namespace_lock);
        return (0);
 }
 
@@ -5839,7 +6410,7 @@ spa_reset(char *pool)
 int
 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 {
-       uint64_t txg, id;
+       uint64_t txg;
        int error;
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *vd, *tvd;
@@ -5914,19 +6485,9 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
        }
 
        for (int c = 0; c < vd->vdev_children; c++) {
-
-               /*
-                * Set the vdev id to the first hole, if one exists.
-                */
-               for (id = 0; id < rvd->vdev_children; id++) {
-                       if (rvd->vdev_child[id]->vdev_ishole) {
-                               vdev_free(rvd->vdev_child[id]);
-                               break;
-                       }
-               }
                tvd = vd->vdev_child[c];
                vdev_remove_child(vd, tvd);
-               tvd->vdev_id = id;
+               tvd->vdev_id = rvd->vdev_children;
                vdev_add_child(rvd, tvd);
                vdev_config_dirty(tvd);
        }
@@ -5980,12 +6541,17 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
  * extra rules: you can't attach to it after it's been created, and upon
  * completion of resilvering, the first disk (the one being replaced)
  * is automatically detached.
+ *
+ * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild)
+ * should be performed instead of traditional healing reconstruction.  From
+ * an administrator's perspective these are both resilver operations.
  */
 int
-spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
+spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
+    int rebuild)
 {
        uint64_t txg, dtl_max_txg;
-       ASSERTV(vdev_t *rvd = spa->spa_root_vdev);
+       vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
        vdev_ops_t *pvops;
        char *oldvdpath, *newvdpath;
@@ -6005,6 +6571,19 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
                return (spa_vdev_exit(spa, NULL, txg, error));
        }
 
+       if (rebuild) {
+               if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
+                       return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+               if (dsl_scan_resilvering(spa_get_dsl(spa)))
+                       return (spa_vdev_exit(spa, NULL, txg,
+                           ZFS_ERR_RESILVER_IN_PROGRESS));
+       } else {
+               if (vdev_rebuild_active(rvd))
+                       return (spa_vdev_exit(spa, NULL, txg,
+                           ZFS_ERR_REBUILD_IN_PROGRESS));
+       }
+
        if (spa->spa_vdev_removal != NULL)
                return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
@@ -6037,6 +6616,18 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
        if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
                return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
+       if (rebuild) {
+               /*
+                * For rebuilds, the parent vdev must support reconstruction
+                * using only space maps.  This means the only allowable
+                * parents are the root vdev or a mirror vdev.
+                */
+               if (pvd->vdev_ops != &vdev_mirror_ops &&
+                   pvd->vdev_ops != &vdev_root_ops) {
+                       return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+               }
+       }
+
        if (!replacing) {
                /*
                 * For attach, the only allowable parent is a mirror or the root
@@ -6090,7 +6681,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
         * than the top-level vdev.
         */
        if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
-               return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
+               return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
        /*
         * If this is an in-place replacement, update oldvd's path and devid
@@ -6100,17 +6691,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
                spa_strfree(oldvd->vdev_path);
                oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
                    KM_SLEEP);
-               (void) sprintf(oldvd->vdev_path, "%s/%s",
-                   newvd->vdev_path, "old");
+               (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5,
+                   "%s/%s", newvd->vdev_path, "old");
                if (oldvd->vdev_devid != NULL) {
                        spa_strfree(oldvd->vdev_devid);
                        oldvd->vdev_devid = NULL;
                }
        }
 
-       /* mark the device being resilvered */
-       newvd->vdev_resilver_txg = txg;
-
        /*
         * If the parent is not a mirror, or if we're replacing, insert the new
         * mirror/replacing/spare vdev above oldvd.
@@ -6148,8 +6736,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
         */
        dtl_max_txg = txg + TXG_CONCURRENT_STATES;
 
-       vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
-           dtl_max_txg - TXG_INITIAL);
+       vdev_dtl_dirty(newvd, DTL_MISSING,
+           TXG_INITIAL, dtl_max_txg - TXG_INITIAL);
 
        if (newvd->vdev_isspare) {
                spa_spare_activate(newvd);
@@ -6166,16 +6754,25 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
        vdev_dirty(tvd, VDD_DTL, newvd, txg);
 
        /*
-        * Schedule the resilver to restart in the future. We do this to
-        * ensure that dmu_sync-ed blocks have been stitched into the
-        * respective datasets. We do not do this if resilvers have been
-        * deferred.
+        * Schedule the resilver or rebuild to restart in the future. We do
+        * this to ensure that dmu_sync-ed blocks have been stitched into the
+        * respective datasets.
         */
-       if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
-           spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
-               vdev_set_deferred_resilver(spa, newvd);
-       else
-               dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+       if (rebuild) {
+               newvd->vdev_rebuild_txg = txg;
+
+               vdev_rebuild(tvd);
+       } else {
+               newvd->vdev_resilver_txg = txg;
+
+               if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
+                   spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
+                       vdev_defer_resilver(newvd);
+               } else {
+                       dsl_scan_restart_resilver(spa->spa_dsl_pool,
+                           dtl_max_txg);
+               }
+       }
 
        if (spa->spa_bootfs)
                spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
@@ -6210,7 +6807,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 {
        uint64_t txg;
        int error;
-       ASSERTV(vdev_t *rvd = spa->spa_root_vdev);
+       vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
        vdev_t *vd, *pvd, *cvd, *tvd;
        boolean_t unspare = B_FALSE;
        uint64_t unspare_guid = 0;
@@ -6218,7 +6815,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 
        ASSERT(spa_writeable(spa));
 
-       txg = spa_vdev_enter(spa);
+       txg = spa_vdev_detach_enter(spa, guid);
 
        vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
@@ -6376,7 +6973,6 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
                vdev_remove_parent(cvd);
        }
 
-
        /*
         * We don't set tvd until now because the parent we just removed
         * may have been the previous top-level vdev.
@@ -6416,6 +7012,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
        vdev_dirty(tvd, VDD_DTL, vd, txg);
 
        spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
+       spa_notify_waiters(spa);
 
        /* hang on to the spa before we release the lock */
        spa_open_ref(spa, FTAG);
@@ -6490,7 +7087,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
         * a previous initialization process which has completed but
         * the thread is not exited.
         */
-       if (cmd_type == POOL_INITIALIZE_DO &&
+       if (cmd_type == POOL_INITIALIZE_START &&
            (vd->vdev_initialize_thread != NULL ||
            vd->vdev_top->vdev_removing)) {
                mutex_exit(&vd->vdev_initialize_lock);
@@ -6507,7 +7104,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
        }
 
        switch (cmd_type) {
-       case POOL_INITIALIZE_DO:
+       case POOL_INITIALIZE_START:
                vdev_initialize(vd);
                break;
        case POOL_INITIALIZE_CANCEL:
@@ -6571,6 +7168,126 @@ spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
        return (total_errors);
 }
 
+/*
+ * Apply one manual TRIM command (start, cancel, or suspend) to the vdev
+ * identified by guid.  The caller must hold spa_namespace_lock.
+ *
+ * Returns 0 on success, or an errno explaining why the request was rejected.
+ * vd_list is handed to vdev_trim_stop() for cancel and suspend requests.
+ */
+static int
+spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
+    uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list)
+{
+       int error = 0;
+
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+       spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+
+       /* Find the vdev and check that it is a leaf which can be trimmed. */
+       vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+       if (vd == NULL || vd->vdev_detached)
+               error = SET_ERROR(ENODEV);
+       else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd))
+               error = SET_ERROR(EINVAL);
+       else if (!vdev_writeable(vd))
+               error = SET_ERROR(EROFS);
+       else if (!vd->vdev_has_trim)
+               error = SET_ERROR(EOPNOTSUPP);
+       else if (secure && !vd->vdev_has_securetrim)
+               error = SET_ERROR(EOPNOTSUPP);
+
+       if (error != 0) {
+               spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+               return (error);
+       }
+
+       /* Take the per-vdev TRIM lock before dropping the config locks. */
+       mutex_enter(&vd->vdev_trim_lock);
+       spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+       /*
+        * A start request tests vdev_trim_thread rather than
+        * vdev_trim_state, since a previous TRIM process may have
+        * completed while its thread has not exited yet.
+        */
+       switch (cmd_type) {
+       case POOL_TRIM_START:
+               if (vd->vdev_trim_thread != NULL ||
+                   vd->vdev_top->vdev_removing)
+                       error = SET_ERROR(EBUSY);
+               else
+                       vdev_trim(vd, rate, partial, secure);
+               break;
+       case POOL_TRIM_CANCEL:
+               if (vd->vdev_trim_state != VDEV_TRIM_ACTIVE &&
+                   vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)
+                       error = SET_ERROR(ESRCH);
+               else
+                       vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list);
+               break;
+       case POOL_TRIM_SUSPEND:
+               if (vd->vdev_trim_state != VDEV_TRIM_ACTIVE)
+                       error = SET_ERROR(ESRCH);
+               else
+                       vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list);
+               break;
+       default:
+               panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
+       }
+       mutex_exit(&vd->vdev_trim_lock);
+
+       return (error);
+}
+
+/*
+ * Run a manual TRIM command against each vdev whose guid is listed in nv.
+ * Starting a TRIM kicks off an individual TRIM thread for each vdev.  These
+ * threads pass over all of the free space in the vdev's metaslabs and issue
+ * TRIM commands for that space.
+ *
+ * Each per-vdev failure is recorded in vdev_errlist, keyed by the vdev guid
+ * formatted as a decimal string.  Returns the number of vdevs that failed.
+ */
+int
+spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
+    boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist)
+{
+       list_t stopped;
+       int failures = 0;
+
+       list_create(&stopped, sizeof (vdev_t),
+           offsetof(vdev_t, vdev_trim_node));
+
+       /*
+        * The namespace lock is held across the whole function so that
+        * the pool cannot change while TRIM is being started or stopped.
+        * The config and state locks are taken per vdev so that the vdev
+        * state can be assessed before committing to the operation.
+        */
+       mutex_enter(&spa_namespace_lock);
+
+       nvpair_t *elem = nvlist_next_nvpair(nv, NULL);
+       while (elem != NULL) {
+               uint64_t guid = fnvpair_value_uint64(elem);
+               int err = spa_vdev_trim_impl(spa, guid, cmd_type, rate,
+                   partial, secure, &stopped);
+
+               if (err != 0) {
+                       char key[MAXNAMELEN];
+
+                       (void) snprintf(key, sizeof (key), "%llu",
+                           (unsigned long long)guid);
+                       fnvlist_add_int64(vdev_errlist, key, err);
+                       failures++;
+               }
+
+               elem = nvlist_next_nvpair(nv, elem);
+       }
+
+       /* Block until every TRIM thread that was asked to stop has done so. */
+       vdev_trim_stop_wait(spa, &stopped);
+
+       /* Push the updated TRIM state out to disk. */
+       txg_wait_synced(spa->spa_dsl_pool, 0);
+       mutex_exit(&spa_namespace_lock);
+
+       list_destroy(&stopped);
+
+       return (failures);
+}
+
 /*
  * Split a set of devices from their mirrors, and create a new pool from them.
  */
@@ -6630,7 +7347,8 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
                vdev_t *vd = rvd->vdev_child[c];
 
                /* don't count the holes & logs as children */
-               if (vd->vdev_islog || !vdev_is_concrete(vd)) {
+               if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops &&
+                   !vdev_is_concrete(vd))) {
                        if (lastlog == 0)
                                lastlog = c;
                        continue;
@@ -6666,6 +7384,11 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
                        }
                }
 
+               /* deal with indirect vdevs */
+               if (spa->spa_root_vdev->vdev_child[c]->vdev_ops ==
+                   &vdev_indirect_ops)
+                       continue;
+
                /* which disk is going to be split? */
                if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
                    &glist[c]) != 0) {
@@ -6780,26 +7503,39 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
        spa_async_suspend(newspa);
 
        /*
-        * Temporarily stop the initializing activity. We set the state to
-        * ACTIVE so that we know to resume the initializing once the split
-        * has completed.
+        * Temporarily stop the initializing and TRIM activity.  We set the
+        * state to ACTIVE so that we know to resume initializing or TRIM
+        * once the split has completed.
         */
-       list_t vd_list;
-       list_create(&vd_list, sizeof (vdev_t),
+       list_t vd_initialize_list;
+       list_create(&vd_initialize_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_initialize_node));
 
+       list_t vd_trim_list;
+       list_create(&vd_trim_list, sizeof (vdev_t),
+           offsetof(vdev_t, vdev_trim_node));
+
        for (c = 0; c < children; c++) {
-               if (vml[c] != NULL) {
+               if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
                        mutex_enter(&vml[c]->vdev_initialize_lock);
-                       vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE,
-                           &vd_list);
+                       vdev_initialize_stop(vml[c],
+                           VDEV_INITIALIZE_ACTIVE, &vd_initialize_list);
                        mutex_exit(&vml[c]->vdev_initialize_lock);
+
+                       mutex_enter(&vml[c]->vdev_trim_lock);
+                       vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list);
+                       mutex_exit(&vml[c]->vdev_trim_lock);
                }
        }
-       vdev_initialize_stop_wait(spa, &vd_list);
-       list_destroy(&vd_list);
+
+       vdev_initialize_stop_wait(spa, &vd_initialize_list);
+       vdev_trim_stop_wait(spa, &vd_trim_list);
+
+       list_destroy(&vd_initialize_list);
+       list_destroy(&vd_trim_list);
 
        newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
+       newspa->spa_is_splitting = B_TRUE;
 
        /* create the new pool from the disks of the original pool */
        error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
@@ -6841,7 +7577,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
        if (error != 0)
                dmu_tx_abort(tx);
        for (c = 0; c < children; c++) {
-               if (vml[c] != NULL) {
+               if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
                        vdev_t *tvd = vml[c]->vdev_top;
 
                        /*
@@ -6877,6 +7613,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
        spa_history_log_internal(newspa, "split", NULL,
            "from pool %s", spa_name(spa));
 
+       newspa->spa_is_splitting = B_FALSE;
        kmem_free(vml, children * sizeof (vdev_t *));
 
        /* if we're not going to mount the filesystems in userland, export */
@@ -6899,8 +7636,10 @@ out:
                        vml[c]->vdev_offline = B_FALSE;
        }
 
-       /* restart initializing disks as necessary */
+       /* restart initializing or trimming disks as necessary */
        spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+       spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
+       spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
 
        vdev_reopen(spa->spa_root_vdev);
 
@@ -7030,12 +7769,18 @@ spa_vdev_resilver_done(spa_t *spa)
        }
 
        spa_config_exit(spa, SCL_ALL, FTAG);
+
+       /*
+        * If a detach was not performed above, replace waiters will not have
+        * been notified, in which case we must do so now.
+        */
+       spa_notify_waiters(spa);
 }
 
 /*
  * Update the stored path or FRU for this vdev.
  */
-int
+static int
 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
     boolean_t ispath)
 {
@@ -7117,6 +7862,10 @@ spa_scan(spa_t *spa, pool_scan_func_t func)
        if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
                return (SET_ERROR(ENOTSUP));
 
+       if (func == POOL_SCAN_RESILVER &&
+           !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
+               return (SET_ERROR(ENOTSUP));
+
        /*
         * If a resilver was requested, but there is no DTL on a
         * writeable leaf device, we have nothing to do.
@@ -7229,7 +7978,8 @@ spa_async_thread(void *arg)
                if (new_space != old_space) {
                        spa_history_log_internal(spa, "vdev online", NULL,
                            "pool '%s' size: %llu(+%llu)",
-                           spa_name(spa), new_space, new_space - old_space);
+                           spa_name(spa), (u_longlong_t)new_space,
+                           (u_longlong_t)(new_space - old_space));
                }
        }
 
@@ -7267,13 +8017,25 @@ spa_async_thread(void *arg)
        if (tasks & SPA_ASYNC_RESILVER_DONE)
                spa_vdev_resilver_done(spa);
 
+       /*
+        * If any devices are done replacing, detach them.  Then if no
+        * top-level vdevs are rebuilding attempt to kick off a scrub.
+        */
+       if (tasks & SPA_ASYNC_REBUILD_DONE) {
+               spa_vdev_resilver_done(spa);
+
+               if (!vdev_rebuild_active(spa->spa_root_vdev))
+                       (void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB);
+       }
+
        /*
         * Kick off a resilver.
         */
        if (tasks & SPA_ASYNC_RESILVER &&
+           !vdev_rebuild_active(spa->spa_root_vdev) &&
            (!dsl_scan_resilvering(dp) ||
            !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
-               dsl_resilver_restart(dp, 0);
+               dsl_scan_restart_resilver(dp, 0);
 
        if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
                mutex_enter(&spa_namespace_lock);
@@ -7283,6 +8045,44 @@ spa_async_thread(void *arg)
                mutex_exit(&spa_namespace_lock);
        }
 
+       if (tasks & SPA_ASYNC_TRIM_RESTART) {
+               mutex_enter(&spa_namespace_lock);
+               spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+               vdev_trim_restart(spa->spa_root_vdev);
+               spa_config_exit(spa, SCL_CONFIG, FTAG);
+               mutex_exit(&spa_namespace_lock);
+       }
+
+       if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) {
+               mutex_enter(&spa_namespace_lock);
+               spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+               vdev_autotrim_restart(spa);
+               spa_config_exit(spa, SCL_CONFIG, FTAG);
+               mutex_exit(&spa_namespace_lock);
+       }
+
+       /*
+        * Kick off L2 cache whole device TRIM.
+        */
+       if (tasks & SPA_ASYNC_L2CACHE_TRIM) {
+               mutex_enter(&spa_namespace_lock);
+               spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+               vdev_trim_l2arc(spa);
+               spa_config_exit(spa, SCL_CONFIG, FTAG);
+               mutex_exit(&spa_namespace_lock);
+       }
+
+       /*
+        * Kick off L2 cache rebuilding.
+        */
+       if (tasks & SPA_ASYNC_L2CACHE_REBUILD) {
+               mutex_enter(&spa_namespace_lock);
+               spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER);
+               l2arc_spa_rebuild_start(spa);
+               spa_config_exit(spa, SCL_L2ARC, FTAG);
+               mutex_exit(&spa_namespace_lock);
+       }
+
        /*
         * Let the world know that we're done.
         */
@@ -7311,6 +8111,14 @@ spa_async_suspend(spa_t *spa)
        zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
        if (discard_thread != NULL)
                zthr_cancel(discard_thread);
+
+       zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
+       if (ll_delete_thread != NULL)
+               zthr_cancel(ll_delete_thread);
+
+       zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
+       if (ll_condense_thread != NULL)
+               zthr_cancel(ll_condense_thread);
 }
 
 void
@@ -7329,6 +8137,14 @@ spa_async_resume(spa_t *spa)
        zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
        if (discard_thread != NULL)
                zthr_resume(discard_thread);
+
+       zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
+       if (ll_delete_thread != NULL)
+               zthr_resume(ll_delete_thread);
+
+       zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
+       if (ll_condense_thread != NULL)
+               zthr_resume(ll_condense_thread);
 }
 
 static boolean_t
@@ -7357,8 +8173,7 @@ spa_async_dispatch(spa_t *spa)
        mutex_enter(&spa->spa_async_lock);
        if (spa_async_tasks_pending(spa) &&
            !spa->spa_async_suspended &&
-           spa->spa_async_thread == NULL &&
-           rootdir != NULL)
+           spa->spa_async_thread == NULL)
                spa->spa_async_thread = thread_create(NULL, 0,
                    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
        mutex_exit(&spa->spa_async_lock);
@@ -7373,30 +8188,58 @@ spa_async_request(spa_t *spa, int task)
        mutex_exit(&spa->spa_async_lock);
 }
 
+/*
+ * Return the current value of spa->spa_async_tasks.  The field is read
+ * here without taking spa_async_lock.
+ */
+int
+spa_async_tasks(spa_t *spa)
+{
+       int tasks = spa->spa_async_tasks;
+
+       return (tasks);
+}
+
 /*
  * ==========================================================================
  * SPA syncing routines
  * ==========================================================================
  */
 
+
 static int
-bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
 {
        bpobj_t *bpo = arg;
-       bpobj_enqueue(bpo, bp, tx);
+       bpobj_enqueue(bpo, bp, bp_freed, tx);
        return (0);
 }
 
+/*
+ * Callback wrapper: forwards to bpobj_enqueue_cb() with bp_freed fixed to
+ * B_FALSE and returns its result.
+ */
+int
+bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+       const boolean_t bp_freed = B_FALSE;
+
+       return (bpobj_enqueue_cb(arg, bp, bp_freed, tx));
+}
+
+/*
+ * Callback wrapper: forwards to bpobj_enqueue_cb() with bp_freed fixed to
+ * B_TRUE and returns its result.
+ */
+int
+bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+       const boolean_t bp_freed = B_TRUE;
+
+       return (bpobj_enqueue_cb(arg, bp, bp_freed, tx));
+}
+
 static int
 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
-       zio_t *zio = arg;
+       zio_t *pio = arg;
 
-       zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
-           zio->io_flags));
+       zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp,
+           pio->io_flags));
        return (0);
 }
 
+/*
+ * Adapter with the four-argument callback signature (including bp_freed)
+ * that hands the block pointer to spa_free_sync_cb().  Entries passed here
+ * are asserted not to be flagged as freed.
+ */
+static int
+bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
+{
+       int rc;
+
+       ASSERT(!bp_freed);
+       rc = spa_free_sync_cb(arg, bp, tx);
+
+       return (rc);
+}
+
 /*
  * Note: this simple function is not inlined to make it easier to dtrace the
  * amount of time spent syncing frees.
@@ -7419,9 +8262,21 @@ spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
        if (spa_sync_pass(spa) != 1)
                return;
 
+       /*
+        * Note:
+        * If the log space map feature is active, we stop deferring
+        * frees to the next TXG and therefore running this function
+        * would be considered a no-op as spa_deferred_bpobj should
+        * not have any entries.
+        *
+        * That said we run this function anyway (instead of returning
+        * immediately) for the edge-case scenario where we just
+        * activated the log space map feature in this TXG but we have
+        * deferred frees from the previous TXG.
+        */
        zio_t *zio = zio_root(spa, NULL, NULL, 0);
        VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
-           spa_free_sync_cb, zio, tx), ==, 0);
+           bpobj_spa_free_sync_cb, zio, tx), ==, 0);
        VERIFY0(zio_wait(zio));
 }
 
@@ -7653,7 +8508,8 @@ spa_sync_version(void *arg, dmu_tx_t *tx)
 
        spa->spa_uberblock.ub_version = version;
        vdev_config_dirty(spa->spa_root_vdev);
-       spa_history_log_internal(spa, "set", tx, "version=%lld", version);
+       spa_history_log_internal(spa, "set", tx, "version=%lld",
+           (longlong_t)version);
 }
 
 /*
@@ -7712,7 +8568,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
                case ZPOOL_PROP_READONLY:
                case ZPOOL_PROP_CACHEFILE:
                        /*
-                        * 'readonly' and 'cachefile' are also non-persisitent
+                        * 'readonly' and 'cachefile' are also non-persistent
                         * properties.
                         */
                        break;
@@ -7767,7 +8623,8 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
                                    spa->spa_pool_props_object, propname,
                                    8, 1, &intval, tx));
                                spa_history_log_internal(spa, "set", tx,
-                                   "%s=%lld", nvpair_name(elem), intval);
+                                   "%s=%lld", nvpair_name(elem),
+                                   (longlong_t)intval);
                        } else {
                                ASSERT(0); /* not allowed */
                        }
@@ -7782,6 +8639,11 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
                        case ZPOOL_PROP_FAILUREMODE:
                                spa->spa_failmode = intval;
                                break;
+                       case ZPOOL_PROP_AUTOTRIM:
+                               spa->spa_autotrim = intval;
+                               spa_async_request(spa,
+                                   SPA_ASYNC_AUTOTRIM_RESTART);
+                               break;
                        case ZPOOL_PROP_AUTOEXPAND:
                                spa->spa_autoexpand = intval;
                                if (tx->tx_txg != TXG_INITIAL)
@@ -7791,9 +8653,6 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
                        case ZPOOL_PROP_MULTIHOST:
                                spa->spa_multihost = intval;
                                break;
-                       case ZPOOL_PROP_DEDUPDITTO:
-                               spa->spa_dedup_ditto = intval;
-                               break;
                        default:
                                break;
                        }
@@ -7881,8 +8740,8 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
 static void
 vdev_indirect_state_sync_verify(vdev_t *vd)
 {
-       ASSERTV(vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping);
-       ASSERTV(vdev_indirect_births_t *vib = vd->vdev_indirect_births);
+       vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping;
+       vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births;
 
        if (vd->vdev_ops == &vdev_indirect_ops) {
                ASSERT(vim != NULL);
@@ -7946,13 +8805,14 @@ spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
                 * allocations look at mg_max_alloc_queue_depth, and async
                 * allocations all happen from spa_sync().
                 */
-               for (int i = 0; i < spa->spa_alloc_count; i++)
+               for (int i = 0; i < mg->mg_allocators; i++) {
                        ASSERT0(zfs_refcount_count(
-                           &(mg->mg_alloc_queue_depth[i])));
+                           &(mg->mg_allocator[i].mga_alloc_queue_depth)));
+               }
                mg->mg_max_alloc_queue_depth = max_queue_depth;
 
-               for (int i = 0; i < spa->spa_alloc_count; i++) {
-                       mg->mg_cur_max_alloc_queue_depth[i] =
+               for (int i = 0; i < mg->mg_allocators; i++) {
+                       mg->mg_allocator[i].mga_cur_max_alloc_queue_depth =
                            zfs_vdev_def_queue_depth;
                }
                slots_per_allocator += zfs_vdev_def_queue_depth;
@@ -8007,7 +8867,14 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
                spa_errlog_sync(spa, txg);
                dsl_pool_sync(dp, txg);
 
-               if (pass < zfs_sync_pass_deferred_free) {
+               if (pass < zfs_sync_pass_deferred_free ||
+                   spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
+                       /*
+                        * If the log space map feature is active we don't
+                        * care about deferred frees and the deferred bpobj
+                        * as the log space map should effectively have the
+                        * same results (i.e. appending only to one object).
+                        */
                        spa_sync_frees(spa, free_bpl, tx);
                } else {
                        /*
@@ -8015,7 +8882,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
                         * we sync the deferred frees later in pass 1.
                         */
                        ASSERT3U(pass, >, 1);
-                       bplist_iterate(free_bpl, bpobj_enqueue_cb,
+                       bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb,
                            &spa->spa_deferred_bpobj, tx);
                }
 
@@ -8024,6 +8891,8 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
                svr_sync(spa, tx);
                spa_sync_upgrades(spa, tx);
 
+               spa_flush_metaslabs(spa, tx);
+
                vdev_t *vd = NULL;
                while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
                    != NULL)
@@ -8274,6 +9143,11 @@ spa_sync(spa_t *spa, uint64_t txg)
            != NULL)
                vdev_sync_done(vd, txg);
 
+       metaslab_class_evict_old(spa->spa_normal_class, txg);
+       metaslab_class_evict_old(spa->spa_log_class, txg);
+
+       spa_sync_close_syncing_log_sm(spa);
+
        spa_update_dspace(spa);
 
        /*
@@ -8459,6 +9333,308 @@ spa_has_active_shared_spare(spa_t *spa)
        return (B_FALSE);
 }
 
+uint64_t
+spa_total_metaslabs(spa_t *spa)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+       /* Sum vdev_ms_count over the root's children that are concrete. */
+       uint64_t m = 0;
+       for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *vd = rvd->vdev_child[c];
+               if (!vdev_is_concrete(vd))
+                       continue;
+               m += vd->vdev_ms_count;
+       }
+       return (m);
+}
+
+/*
+ * Notify any waiting threads that some activity has switched from being in-
+ * progress to not-in-progress so that each of them can wake up and determine
+ * whether it is finished waiting.
+ */
+void
+spa_notify_waiters(spa_t *spa)
+{
+       /*
+        * Acquiring spa_activities_lock here prevents the cv_broadcast from
+        * happening between the waiting thread's check and cv_wait.
+        */
+       mutex_enter(&spa->spa_activities_lock);
+       cv_broadcast(&spa->spa_activities_cv);
+       mutex_exit(&spa->spa_activities_lock);
+}
+
+/*
+ * Notify any waiting threads that the pool is exporting (spa_waiters_cancel),
+ * and then block until spa_waiters is 0, i.e. they are done with the spa_t.
+ */
+void
+spa_wake_waiters(spa_t *spa)
+{
+       mutex_enter(&spa->spa_activities_lock);
+       spa->spa_waiters_cancel = B_TRUE;
+       cv_broadcast(&spa->spa_activities_cv);
+       while (spa->spa_waiters != 0)
+               cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock);
+       spa->spa_waiters_cancel = B_FALSE;
+       mutex_exit(&spa->spa_activities_lock);
+}
+
+/* Whether the vdev or any of its descendants are being initialized/trimmed. */
+static boolean_t
+spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity)
+{
+       spa_t *spa = vd->vdev_spa;
+
+       ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER));
+       ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
+       ASSERT(activity == ZPOOL_WAIT_INITIALIZE ||
+           activity == ZPOOL_WAIT_TRIM);
+
+       kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ?
+           &vd->vdev_initialize_lock : &vd->vdev_trim_lock;
+       /* Drop spa_activities_lock so the per-vdev lock is acquired first. */
+       mutex_exit(&spa->spa_activities_lock);
+       mutex_enter(lock);
+       mutex_enter(&spa->spa_activities_lock);
+
+       boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ?
+           (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) :
+           (vd->vdev_trim_state == VDEV_TRIM_ACTIVE);
+       mutex_exit(lock);
+
+       if (in_progress)
+               return (B_TRUE);
+       /* This vdev is not active; recurse into its children, if any. */
+       for (int i = 0; i < vd->vdev_children; i++) {
+               if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i],
+                   activity))
+                       return (B_TRUE);
+       }
+
+       return (B_FALSE);
+}
+
+/*
+ * If use_guid is true, this checks whether the leaf vdev specified by guid is
+ * being initialized/trimmed (EINVAL if there is no such leaf). Otherwise, it
+ * checks every vdev in the pool. The caller must hold spa_activities_lock; it
+ * is dropped and retaken here so the config lock can be acquired first.
+ */
+static int
+spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid,
+    zpool_wait_activity_t activity, boolean_t *in_progress)
+{
+       mutex_exit(&spa->spa_activities_lock);
+       spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+       mutex_enter(&spa->spa_activities_lock);
+
+       vdev_t *vd;
+       if (use_guid) {
+               vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+               if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) {
+                       spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+                       return (EINVAL);
+               }
+       } else {
+               vd = spa->spa_root_vdev;
+       }
+
+       *in_progress = spa_vdev_activity_in_progress_impl(vd, activity);
+
+       spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+       return (0);
+}
+
+/*
+ * Locking for waiting threads
+ * ---------------------------
+ *
+ * Waiting threads need a way to check whether a given activity is in progress,
+ * and then, if it is, wait for it to complete. Each activity will have some
+ * in-memory representation of the relevant on-disk state which can be used to
+ * determine whether or not the activity is in progress. The in-memory state and
+ * the locking used to protect it will be different for each activity, and may
+ * not be suitable for use with a cvar (e.g., some state is protected by the
+ * config lock). To allow waiting threads to wait without any races, another
+ * lock, spa_activities_lock, is used.
+ *
+ * When the state is checked, both the activity-specific lock (if there is one)
+ * and spa_activities_lock are held. In some cases, the activity-specific lock
+ * is acquired explicitly (e.g. the config lock). In others, the locking is
+ * internal to some check (e.g. bpobj_is_empty). After checking, the waiting
+ * thread releases the activity-specific lock and, if the activity is in
+ * progress, then cv_waits using spa_activities_lock.
+ *
+ * The waiting thread is woken when another thread, one completing some
+ * activity, updates the state of the activity and then calls
+ * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only
+ * needs to hold its activity-specific lock when updating the state, and this
+ * lock can (but doesn't have to) be dropped before calling spa_notify_waiters.
+ *
+ * Because spa_notify_waiters acquires spa_activities_lock before broadcasting,
+ * and because it is held when the waiting thread checks the state of the
+ * activity, it can never be the case that the completing thread both updates
+ * the activity state and cv_broadcasts in between the waiting thread's check
+ * and cv_wait. Thus, a waiting thread can never miss a wakeup.
+ *
+ * In order to prevent deadlock, when the waiting thread does its check, in some
+ * cases it will temporarily drop spa_activities_lock in order to acquire the
+ * activity-specific lock. The order in which spa_activities_lock and the
+ * activity-specific lock are acquired in the waiting thread is determined by
+ * the order in which they are acquired in the completing thread; if the
+ * completing thread calls spa_notify_waiters with the activity-specific lock
+ * held, then the waiting thread must also acquire the activity-specific lock
+ * first.
+ */
+
+static int
+spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
+    boolean_t use_tag, uint64_t tag, boolean_t *in_progress)
+{
+       int error = 0;
+
+       ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
+       /* Each case below computes *in_progress for one kind of activity. */
+       switch (activity) {
+       case ZPOOL_WAIT_CKPT_DISCARD:
+               *in_progress =
+                   (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) &&
+                   zap_contains(spa_meta_objset(spa),
+                   DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) ==
+                   ENOENT);
+               break;
+       case ZPOOL_WAIT_FREE:
+               *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS &&
+                   !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) ||
+                   spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) ||
+                   spa_livelist_delete_check(spa));
+               break;
+       case ZPOOL_WAIT_INITIALIZE:
+       case ZPOOL_WAIT_TRIM:
+               error = spa_vdev_activity_in_progress(spa, use_tag, tag,
+                   activity, in_progress);
+               break;
+       case ZPOOL_WAIT_REPLACE:
+               mutex_exit(&spa->spa_activities_lock);
+               spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+               mutex_enter(&spa->spa_activities_lock);
+               /* The config lock is acquired before spa_activities_lock. */
+               *in_progress = vdev_replace_in_progress(spa->spa_root_vdev);
+               spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+               break;
+       case ZPOOL_WAIT_REMOVE:
+               *in_progress = (spa->spa_removing_phys.sr_state ==
+                   DSS_SCANNING);
+               break;
+       case ZPOOL_WAIT_RESILVER:
+               if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev)))
+                       break;
+               /* fall through */
+       case ZPOOL_WAIT_SCRUB:
+       {
+               boolean_t scanning, paused, is_scrub;
+               dsl_scan_t *scn =  spa->spa_dsl_pool->dp_scan;
+
+               is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB);
+               scanning = (scn->scn_phys.scn_state == DSS_SCANNING);
+               paused = dsl_scan_is_paused_scrub(scn);
+               *in_progress = (scanning && !paused &&
+                   is_scrub == (activity == ZPOOL_WAIT_SCRUB));
+               break;
+       }
+       default:
+               panic("unrecognized value for activity %d", activity);
+       }
+
+       return (error);
+}
+
+static int
+spa_wait_common(const char *pool, zpool_wait_activity_t activity,
+    boolean_t use_tag, uint64_t tag, boolean_t *waited)
+{
+       /*
+        * The tag is used to distinguish between instances of an activity.
+        * 'initialize' and 'trim' are the only activities that we use this for.
+        * The other activities can only have a single instance in progress in a
+        * pool at one time, making the tag unnecessary.
+        *
+        * There can be multiple devices being replaced at once, but since they
+        * all finish once resilvering finishes, we don't bother keeping track
+        * of them individually, we just wait for them all to finish.
+        */
+       if (use_tag && activity != ZPOOL_WAIT_INITIALIZE &&
+           activity != ZPOOL_WAIT_TRIM)
+               return (EINVAL);
+
+       if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES)
+               return (EINVAL);
+
+       spa_t *spa;
+       int error = spa_open(pool, &spa, FTAG);
+       if (error != 0)
+               return (error);
+
+       /*
+        * Increment the spa's waiter count so that we can call spa_close and
+        * still ensure that the spa_t doesn't get freed before this thread is
+        * finished with it when the pool is exported. We want to call spa_close
+        * before we start waiting because otherwise the additional ref would
+        * prevent the pool from being exported or destroyed throughout the
+        * potentially long wait.
+        */
+       mutex_enter(&spa->spa_activities_lock);
+       spa->spa_waiters++;
+       spa_close(spa, FTAG);
+
+       *waited = B_FALSE;
+       for (;;) {
+               boolean_t in_progress;
+               error = spa_activity_in_progress(spa, activity, use_tag, tag,
+                   &in_progress);
+               /* Stop on error, completion, or spa_wake_waiters' cancel. */
+               if (error || !in_progress || spa->spa_waiters_cancel)
+                       break;
+
+               *waited = B_TRUE;
+
+               if (cv_wait_sig(&spa->spa_activities_cv,
+                   &spa->spa_activities_lock) == 0) {
+                       error = EINTR;
+                       break;
+               }
+       }
+       /* Let spa_wake_waiters know this thread is done with the spa_t. */
+       spa->spa_waiters--;
+       cv_signal(&spa->spa_waiters_cv);
+       mutex_exit(&spa->spa_activities_lock);
+
+       return (error);
+}
+
+/*
+ * Wait for a particular instance of the specified activity to complete, where
+ * the instance is identified by 'tag'.
+ */
+int
+spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
+    boolean_t *waited)
+{
+       return (spa_wait_common(pool, activity, B_TRUE, tag, waited));
+}
+
+/*
+ * Wait for all instances of the specified activity to complete.
+ */
+int
+spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
+{
+
+       return (spa_wait_common(pool, activity, B_FALSE, 0, waited));
+}
+
 sysevent_t *
 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
 {
@@ -8499,7 +9675,6 @@ spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
        spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
 }
 
-#if defined(_KERNEL)
 /* state manipulation functions */
 EXPORT_SYMBOL(spa_open);
 EXPORT_SYMBOL(spa_open_rewind);
@@ -8518,7 +9693,7 @@ EXPORT_SYMBOL(spa_inject_delref);
 EXPORT_SYMBOL(spa_scan_stat_init);
 EXPORT_SYMBOL(spa_scan_get_stats);
 
-/* device maniion */
+/* device manipulation */
 EXPORT_SYMBOL(spa_vdev_add);
 EXPORT_SYMBOL(spa_vdev_attach);
 EXPORT_SYMBOL(spa_vdev_detach);
@@ -8554,35 +9729,41 @@ EXPORT_SYMBOL(spa_prop_clear_bootfs);
 
 /* asynchronous event notification */
 EXPORT_SYMBOL(spa_event_notify);
-#endif
 
-#if defined(_KERNEL)
-module_param(spa_load_verify_maxinflight, int, 0644);
-MODULE_PARM_DESC(spa_load_verify_maxinflight,
-       "Max concurrent traversal I/Os while verifying pool during import -X");
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, INT, ZMOD_RW,
+       "log2(fraction of arc that can be used by inflight I/Os when "
+       "verifying pool during import)");
 
-module_param(spa_load_verify_metadata, int, 0644);
-MODULE_PARM_DESC(spa_load_verify_metadata,
+ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW,
        "Set to traverse metadata on pool import");
 
-module_param(spa_load_verify_data, int, 0644);
-MODULE_PARM_DESC(spa_load_verify_data,
+ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
        "Set to traverse data on pool import");
 
-module_param(spa_load_print_vdev_tree, int, 0644);
-MODULE_PARM_DESC(spa_load_print_vdev_tree,
+ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
        "Print vdev tree to zfs_dbgmsg during pool import");
 
-/* CSTYLED */
-module_param(zio_taskq_batch_pct, uint, 0444);
-MODULE_PARM_DESC(zio_taskq_batch_pct,
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
        "Percentage of CPUs to run an IO worker thread");
 
-/* BEGIN CSTYLED */
-module_param(zfs_max_missing_tvds, ulong, 0644);
-MODULE_PARM_DESC(zfs_max_missing_tvds,
-       "Allow importing pool with up to this number of missing top-level vdevs"
-       " (in read-only mode)");
-/* END CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW,
+       "Allow importing pool with up to this number of missing top-level "
+       "vdevs (in read-only mode)");
 
-#endif
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW,
+       "Set the livelist condense zthr to pause");
+
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, ZMOD_RW,
+       "Set the livelist condense synctask to pause");
+
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, INT, ZMOD_RW,
+       "Whether livelist condensing was canceled in the synctask");
+
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT, ZMOD_RW,
+       "Whether livelist condensing was canceled in the zthr function");
+
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW,
+       "Whether extra ALLOC blkptrs were added to a livelist entry while it "
+       "was being condensed");
+/* END CSTYLED */