]> git.proxmox.com Git - mirror_zfs.git/blobdiff - module/zfs/spa.c
Prefix all refcount functions with zfs_
[mirror_zfs.git] / module / zfs / spa.c
index 08fc7bbda71d97204c6b0f72bdddf158adce186f..eaca2d29e640870cc2d283bcce2007a60d98d21a 100644 (file)
@@ -22,8 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2015, Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2013, 2014, Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2018, Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
@@ -31,6 +30,7 @@
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  * Copyright (c) 2017 Datto Inc.
  * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017, Intel Corporation.
  */
 
 /*
 #ifdef _KERNEL
 #include <sys/fm/protocol.h>
 #include <sys/fm/util.h>
-#include <sys/bootprops.h>
 #include <sys/callb.h>
-#include <sys/cpupart.h>
-#include <sys/pool.h>
-#include <sys/sysdc.h>
 #include <sys/zone.h>
 #endif /* _KERNEL */
 
@@ -157,24 +153,81 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 static void spa_sync_version(void *arg, dmu_tx_t *tx);
 static void spa_sync_props(void *arg, dmu_tx_t *tx);
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
-static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
-    spa_load_state_t state, spa_import_type_t type, boolean_t trust_config,
-    char **ereport);
+static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport);
 static void spa_vdev_resilver_done(spa_t *spa);
 
 uint_t         zio_taskq_batch_pct = 75;       /* 1 thread per cpu in pset */
-id_t           zio_taskq_psrset_bind = PS_NONE;
 boolean_t      zio_taskq_sysdc = B_TRUE;       /* use SDC scheduling class */
 uint_t         zio_taskq_basedc = 80;          /* base duty cycle */
 
 boolean_t      spa_create_process = B_TRUE;    /* no process ==> no sysdc */
 
+/*
+ * Report any spa_load_verify errors found, but do not fail spa_load.
+ * This is used by zdb to analyze non-idle pools.
+ */
+boolean_t      spa_load_verify_dryrun = B_FALSE;
+
 /*
  * This (illegal) pool name is used when temporarily importing a spa_t in order
  * to get the vdev stats associated with the imported devices.
  */
 #define        TRYIMPORT_NAME  "$import"
 
+/*
+ * For debugging purposes: print out vdev tree during pool import.
+ */
+int            spa_load_print_vdev_tree = B_FALSE;
+
+/*
+ * A non-zero value for zfs_max_missing_tvds means that we allow importing
+ * pools with missing top-level vdevs. This is strictly intended for advanced
+ * pool recovery cases since missing data is almost inevitable. Pools with
+ * missing devices can only be imported read-only for safety reasons, and their
+ * fail-mode will be automatically set to "continue".
+ *
+ * With 1 missing vdev we should be able to import the pool and mount all
+ * datasets. User data that was not modified after the missing device has been
+ * added should be recoverable. This means that snapshots created prior to the
+ * addition of that device should be completely intact.
+ *
+ * With 2 missing vdevs, some datasets may fail to mount since there are
+ * dataset statistics that are stored as regular metadata. Some data might be
+ * recoverable if those vdevs were added recently.
+ *
+ * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
+ * may be missing entirely. Chances of data recovery are very low. Note that
+ * there are also risks of performing an inadvertent rewind as we might be
+ * missing all the vdevs with the latest uberblocks.
+ */
+unsigned long  zfs_max_missing_tvds = 0;
+
+/*
+ * The parameters below are similar to zfs_max_missing_tvds but are only
+ * intended for a preliminary open of the pool with an untrusted config which
+ * might be incomplete or out-dated.
+ *
+ * We are more tolerant of pools opened from a cachefile since we could have
+ * an out-dated cachefile where a device removal was not registered.
+ * We could have set the limit arbitrarily high but in the case where devices
+ * are really missing we would want to return the proper error codes; we chose
+ * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
+ * and we get a chance to retrieve the trusted config.
+ */
+uint64_t       zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
+
+/*
+ * In the case where config was assembled by scanning device paths (/dev/dsk
+ * by default) we are less tolerant since all the existing devices should have
+ * been detected and we want spa_load to return the right error codes.
+ */
+uint64_t       zfs_max_missing_tvds_scan = 0;
+
+/*
+ * Debugging aid that pauses spa_sync() towards the end.
+ */
+boolean_t      zfs_pause_spa_sync = B_FALSE;
+
 /*
  * ==========================================================================
  * SPA properties routines
@@ -219,13 +272,21 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
        ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 
        if (rvd != NULL) {
-               alloc = metaslab_class_get_alloc(spa_normal_class(spa));
-               size = metaslab_class_get_space(spa_normal_class(spa));
+               alloc = metaslab_class_get_alloc(mc);
+               alloc += metaslab_class_get_alloc(spa_special_class(spa));
+               alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
+
+               size = metaslab_class_get_space(mc);
+               size += metaslab_class_get_space(spa_special_class(spa));
+               size += metaslab_class_get_space(spa_dedup_class(spa));
+
                spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
                spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
                spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
                spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
                    size - alloc, src);
+               spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
+                   spa->spa_checkpoint_info.sci_dspace, src);
 
                spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
                    metaslab_class_fragmentation(mc), src);
@@ -251,6 +312,8 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
                        spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
                            version, ZPROP_SRC_LOCAL);
                }
+               spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID,
+                   NULL, spa_load_guid(spa), src);
        }
 
        if (pool != NULL) {
@@ -763,6 +826,12 @@ spa_change_guid_check(void *arg, dmu_tx_t *tx)
        vdev_t *rvd = spa->spa_root_vdev;
        uint64_t vdev_state;
 
+       if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+               int error = (spa_has_checkpoint(spa)) ?
+                   ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+               return (SET_ERROR(error));
+       }
+
        spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
        vdev_state = rvd->vdev_state;
        spa_config_exit(spa, SCL_STATE, FTAG);
@@ -875,7 +944,6 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
        uint_t value = ztip->zti_value;
        uint_t count = ztip->zti_count;
        spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
-       char name[32];
        uint_t flags = 0;
        boolean_t batch = B_FALSE;
 
@@ -912,14 +980,10 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 
        for (uint_t i = 0; i < count; i++) {
                taskq_t *tq;
+               char name[32];
 
-               if (count > 1) {
-                       (void) snprintf(name, sizeof (name), "%s_%s_%u",
-                           zio_type_name[t], zio_taskq_types[q], i);
-               } else {
-                       (void) snprintf(name, sizeof (name), "%s_%s",
-                           zio_type_name[t], zio_taskq_types[q]);
-               }
+               (void) snprintf(name, sizeof (name), "%s_%s",
+                   zio_type_name[t], zio_taskq_types[q]);
 
                if (zio_taskq_sysdc && spa->spa_proc != &p0) {
                        if (batch)
@@ -1035,6 +1099,7 @@ spa_create_zio_taskqs(spa_t *spa)
 static void
 spa_thread(void *arg)
 {
+       psetid_t zio_taskq_psrset_bind = PS_NONE;
        callb_cpr_t cprinfo;
 
        spa_t *spa = arg;
@@ -1114,6 +1179,8 @@ spa_activate(spa_t *spa, int mode)
 
        spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
        spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
+       spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops);
+       spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops);
 
        /* Try to create a covering process */
        mutex_enter(&spa->spa_proc_lock);
@@ -1261,6 +1328,12 @@ spa_deactivate(spa_t *spa)
        metaslab_class_destroy(spa->spa_log_class);
        spa->spa_log_class = NULL;
 
+       metaslab_class_destroy(spa->spa_special_class);
+       spa->spa_special_class = NULL;
+
+       metaslab_class_destroy(spa->spa_dedup_class);
+       spa->spa_dedup_class = NULL;
+
        /*
         * If this was part of an import or the open otherwise failed, we may
         * still have errors left in the queues.  Empty them just in case.
@@ -1356,6 +1429,8 @@ spa_unload(spa_t *spa)
 
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
+       spa_load_note(spa, "UNLOADING");
+
        /*
         * Stop async tasks.
         */
@@ -1400,6 +1475,18 @@ spa_unload(spa_t *spa)
                spa->spa_vdev_removal = NULL;
        }
 
+       if (spa->spa_condense_zthr != NULL) {
+               ASSERT(!zthr_isrunning(spa->spa_condense_zthr));
+               zthr_destroy(spa->spa_condense_zthr);
+               spa->spa_condense_zthr = NULL;
+       }
+
+       if (spa->spa_checkpoint_discard_zthr != NULL) {
+               ASSERT(!zthr_isrunning(spa->spa_checkpoint_discard_zthr));
+               zthr_destroy(spa->spa_checkpoint_discard_zthr);
+               spa->spa_checkpoint_discard_zthr = NULL;
+       }
+
        spa_condense_fini(spa);
 
        bpobj_close(&spa->spa_deferred_bpobj);
@@ -1483,6 +1570,18 @@ spa_load_spares(spa_t *spa)
        int i;
        vdev_t *vd, *tvd;
 
+#ifndef _KERNEL
+       /*
+        * zdb opens both the current state of the pool and the
+        * checkpointed state (if present), with a different spa_t.
+        *
+        * As spare vdevs are shared among open pools, we skip loading
+        * them when we load the checkpointed state of the pool.
+        */
+       if (!spa_writeable(spa))
+               return;
+#endif
+
        ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
        /*
@@ -1602,6 +1701,19 @@ spa_load_l2cache(spa_t *spa)
        vdev_t *vd, **oldvdevs, **newvdevs;
        spa_aux_vdev_t *sav = &spa->spa_l2cache;
 
+#ifndef _KERNEL
+       /*
+        * zdb opens both the current state of the pool and the
+        * checkpointed state (if present), with a different spa_t.
+        *
+        * As L2 caches are part of the ARC which is shared among open
+        * pools, we skip loading them when we load the checkpointed
+        * state of the pool.
+        */
+       if (!spa_writeable(spa))
+               return;
+#endif
+
        ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
        oldvdevs = sav->sav_vdevs;
@@ -1742,6 +1854,27 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
        return (error);
 }
 
+/*
+ * Concrete top-level vdevs that are not missing and are not logs. At every
+ * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
+ */
+static uint64_t
+spa_healthy_core_tvds(spa_t *spa)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+       uint64_t tvds = 0;
+
+       for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+               vdev_t *vd = rvd->vdev_child[i];
+               if (vd->vdev_islog)
+                       continue;
+               if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
+                       tvds++;
+       }
+
+       return (tvds);
+}
+
 /*
  * Checks to see if the given vdev could not be opened, in which case we post a
  * sysevent to notify the autoreplace code that the device has been removed.
@@ -1749,7 +1882,7 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
 static void
 spa_check_removed(vdev_t *vd)
 {
-       for (int c = 0; c < vd->vdev_children; c++)
+       for (uint64_t c = 0; c < vd->vdev_children; c++)
                spa_check_removed(vd->vdev_child[c]);
 
        if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
@@ -1759,38 +1892,14 @@ spa_check_removed(vdev_t *vd)
        }
 }
 
-static void
-spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd)
-{
-       ASSERT3U(vd->vdev_children, ==, mvd->vdev_children);
-
-       vd->vdev_top_zap = mvd->vdev_top_zap;
-       vd->vdev_leaf_zap = mvd->vdev_leaf_zap;
-
-       for (uint64_t i = 0; i < vd->vdev_children; i++) {
-               spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]);
-       }
-}
-
-/*
- * Validate the current config against the MOS config
- */
-static boolean_t
-spa_config_valid(spa_t *spa, nvlist_t *config)
+static int
+spa_check_for_missing_logs(spa_t *spa)
 {
-       vdev_t *mrvd, *rvd = spa->spa_root_vdev;
-       nvlist_t *nv;
-
-       VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
-
-       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-       VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
-
-       ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
+       vdev_t *rvd = spa->spa_root_vdev;
 
        /*
         * If we're doing a normal import, then build up any additional
-        * diagnostic information about missing devices in this config.
+        * diagnostic information about missing log devices.
         * We'll pass this up to the user for further processing.
         */
        if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
@@ -1801,109 +1910,54 @@ spa_config_valid(spa_t *spa, nvlist_t *config)
                    KM_SLEEP);
                VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
-               for (int c = 0; c < rvd->vdev_children; c++) {
+               for (uint64_t c = 0; c < rvd->vdev_children; c++) {
                        vdev_t *tvd = rvd->vdev_child[c];
-                       vdev_t *mtvd  = mrvd->vdev_child[c];
 
-                       if (tvd->vdev_ops == &vdev_missing_ops &&
-                           mtvd->vdev_ops != &vdev_missing_ops &&
-                           mtvd->vdev_islog)
-                               child[idx++] = vdev_config_generate(spa, mtvd,
-                                   B_FALSE, 0);
+                       /*
+                        * We consider a device as missing only if it failed
+                        * to open (i.e. offline or faulted is not considered
+                        * as missing).
+                        */
+                       if (tvd->vdev_islog &&
+                           tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
+                               child[idx++] = vdev_config_generate(spa, tvd,
+                                   B_FALSE, VDEV_CONFIG_MISSING);
+                       }
                }
 
-               if (idx) {
-                       VERIFY(nvlist_add_nvlist_array(nv,
-                           ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
-                       VERIFY(nvlist_add_nvlist(spa->spa_load_info,
-                           ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
+               if (idx > 0) {
+                       fnvlist_add_nvlist_array(nv,
+                           ZPOOL_CONFIG_CHILDREN, child, idx);
+                       fnvlist_add_nvlist(spa->spa_load_info,
+                           ZPOOL_CONFIG_MISSING_DEVICES, nv);
 
-                       for (int i = 0; i < idx; i++)
+                       for (uint64_t i = 0; i < idx; i++)
                                nvlist_free(child[i]);
                }
                nvlist_free(nv);
                kmem_free(child, rvd->vdev_children * sizeof (char **));
-       }
-
-       /*
-        * Compare the root vdev tree with the information we have
-        * from the MOS config (mrvd). Check each top-level vdev
-        * with the corresponding MOS config top-level (mtvd).
-        */
-       for (int c = 0; c < rvd->vdev_children; c++) {
-               vdev_t *tvd = rvd->vdev_child[c];
-               vdev_t *mtvd  = mrvd->vdev_child[c];
-
-               /*
-                * Resolve any "missing" vdevs in the current configuration.
-                * Also trust the MOS config about any "indirect" vdevs.
-                * If we find that the MOS config has more accurate information
-                * about the top-level vdev then use that vdev instead.
-                */
-               if ((tvd->vdev_ops == &vdev_missing_ops &&
-                   mtvd->vdev_ops != &vdev_missing_ops) ||
-                   (mtvd->vdev_ops == &vdev_indirect_ops &&
-                   tvd->vdev_ops != &vdev_indirect_ops)) {
 
-                       /*
-                        * Device specific actions.
-                        */
-                       if (mtvd->vdev_islog) {
-                               if (!(spa->spa_import_flags &
-                                   ZFS_IMPORT_MISSING_LOG)) {
-                                       continue;
-                               }
+               if (idx > 0) {
+                       spa_load_failed(spa, "some log devices are missing");
+                       vdev_dbgmsg_print_tree(rvd, 2);
+                       return (SET_ERROR(ENXIO));
+               }
+       } else {
+               for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+                       vdev_t *tvd = rvd->vdev_child[c];
 
+                       if (tvd->vdev_islog &&
+                           tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
                                spa_set_log_state(spa, SPA_LOG_CLEAR);
-                       } else if (mtvd->vdev_ops != &vdev_indirect_ops) {
-                               continue;
-                       }
-
-                       /*
-                        * Swap the missing vdev with the data we were
-                        * able to obtain from the MOS config.
-                        */
-                       vdev_remove_child(rvd, tvd);
-                       vdev_remove_child(mrvd, mtvd);
-
-                       vdev_add_child(rvd, mtvd);
-                       vdev_add_child(mrvd, tvd);
-
-                       vdev_reopen(rvd);
-               } else {
-                       if (mtvd->vdev_islog) {
-                               /*
-                                * Load the slog device's state from the MOS
-                                * config since it's possible that the label
-                                * does not contain the most up-to-date
-                                * information.
-                                */
-                               vdev_load_log_state(tvd, mtvd);
-                               vdev_reopen(tvd);
+                               spa_load_note(spa, "some log devices are "
+                                   "missing, ZIL is dropped.");
+                               vdev_dbgmsg_print_tree(rvd, 2);
+                               break;
                        }
-
-                       /*
-                        * Per-vdev ZAP info is stored exclusively in the MOS.
-                        */
-                       spa_config_valid_zaps(tvd, mtvd);
                }
-
-               /*
-                * Never trust this info from userland; always use what's
-                * in the MOS.  This prevents it from getting out of sync
-                * with the rest of the info in the MOS.
-                */
-               tvd->vdev_removing = mtvd->vdev_removing;
-               tvd->vdev_indirect_config = mtvd->vdev_indirect_config;
        }
 
-       vdev_free(mrvd);
-       spa_config_exit(spa, SCL_ALL, FTAG);
-
-       /*
-        * Ensure we were able to validate the config.
-        */
-       return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
+       return (0);
 }
 
 /*
@@ -2094,13 +2148,13 @@ spa_load_verify(spa_t *spa)
 {
        zio_t *rio;
        spa_load_error_t sle = { 0 };
-       zpool_rewind_policy_t policy;
+       zpool_load_policy_t policy;
        boolean_t verify_ok = B_FALSE;
        int error = 0;
 
-       zpool_get_rewind_policy(spa->spa_config, &policy);
+       zpool_get_load_policy(spa->spa_config, &policy);
 
-       if (policy.zrp_request & ZPOOL_NEVER_REWIND)
+       if (policy.zlp_rewind & ZPOOL_NEVER_REWIND)
                return (0);
 
        dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
@@ -2115,6 +2169,13 @@ spa_load_verify(spa_t *spa)
            ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
 
        if (spa_load_verify_metadata) {
+               if (spa->spa_extreme_rewind) {
+                       spa_load_note(spa, "performing a complete scan of the "
+                           "pool since extreme rewind is on. This may take "
+                           "a very long time.\n  (spa_load_verify_data=%u, "
+                           "spa_load_verify_metadata=%u)",
+                           spa_load_verify_data, spa_load_verify_metadata);
+               }
                error = traverse_pool(spa, spa->spa_verify_min_txg,
                    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
                    TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio);
@@ -2125,8 +2186,15 @@ spa_load_verify(spa_t *spa)
        spa->spa_load_meta_errors = sle.sle_meta_count;
        spa->spa_load_data_errors = sle.sle_data_count;
 
-       if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
-           sle.sle_data_count <= policy.zrp_maxdata) {
+       if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
+               spa_load_note(spa, "spa_load_verify found %llu metadata errors "
+                   "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
+                   (u_longlong_t)sle.sle_data_count);
+       }
+
+       if (spa_load_verify_dryrun ||
+           (!error && sle.sle_meta_count <= policy.zlp_maxmeta &&
+           sle.sle_data_count <= policy.zlp_maxdata)) {
                int64_t loss = 0;
 
                verify_ok = B_TRUE;
@@ -2144,6 +2212,9 @@ spa_load_verify(spa_t *spa)
                spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
        }
 
+       if (spa_load_verify_dryrun)
+               return (0);
+
        if (error) {
                if (error != ENXIO && error != EIO)
                        error = SET_ERROR(EIO);
@@ -2167,10 +2238,17 @@ spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
  * Find a value in the pool directory object.
  */
 static int
-spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
+spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
 {
-       return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-           name, sizeof (uint64_t), 1, val));
+       int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+           name, sizeof (uint64_t), 1, val);
+
+       if (error != 0 && (error != ENOENT || log_enoent)) {
+               spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
+                   "[error=%d]", name, error);
+       }
+
+       return (error);
 }
 
 static int
@@ -2180,6 +2258,21 @@ spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
        return (SET_ERROR(err));
 }
 
+static void
+spa_spawn_aux_threads(spa_t *spa)
+{
+       ASSERT(spa_writeable(spa));
+
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+       spa_start_indirect_condensing_thread(spa);
+
+       ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
+       spa->spa_checkpoint_discard_zthr =
+           zthr_create(spa_checkpoint_discard_thread_check,
+           spa_checkpoint_discard_thread, spa);
+}
+
 /*
  * Fix up config after a partly-completed split.  This is done with the
  * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
@@ -2263,60 +2356,22 @@ spa_try_repair(spa_t *spa, nvlist_t *config)
 }
 
 static int
-spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
-    boolean_t mosconfig)
+spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
 {
-       nvlist_t *config = spa->spa_config;
        char *ereport = FM_EREPORT_ZFS_POOL;
-       char *comment;
        int error;
-       uint64_t pool_guid;
-       nvlist_t *nvl;
-
-       if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
-               return (SET_ERROR(EINVAL));
-
-       ASSERT(spa->spa_comment == NULL);
-       if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
-               spa->spa_comment = spa_strdup(comment);
-
-       /*
-        * Versioning wasn't explicitly added to the label until later, so if
-        * it's not present treat it as the initial version.
-        */
-       if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
-           &spa->spa_ubsync.ub_version) != 0)
-               spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
-
-       (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
-           &spa->spa_config_txg);
 
-       if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
-           spa_guid_exists(pool_guid, 0)) {
-               error = SET_ERROR(EEXIST);
-       } else {
-               spa->spa_config_guid = pool_guid;
-
-               if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
-                   &nvl) == 0) {
-                       VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
-                           KM_SLEEP) == 0);
-               }
-
-               nvlist_free(spa->spa_load_info);
-               spa->spa_load_info = fnvlist_alloc();
+       spa->spa_load_state = state;
 
-               gethrestime(&spa->spa_loaded_ts);
-               error = spa_load_impl(spa, pool_guid, config, state, type,
-                   mosconfig, &ereport);
-       }
+       gethrestime(&spa->spa_loaded_ts);
+       error = spa_load_impl(spa, type, &ereport);
 
        /*
         * Don't count references from objsets that are already closed
         * and are making their way through the eviction process.
         */
        spa_evicting_os_wait(spa);
-       spa->spa_minref = refcount_count(&spa->spa_refcount);
+       spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
        if (error) {
                if (error != EEXIST) {
                        spa->spa_loaded_ts.tv_sec = 0;
@@ -2562,44 +2617,104 @@ out:
        return (error);
 }
 
-/*
- * Load an existing storage pool, using the pool's builtin spa_config as a
- * source of configuration information.
- */
-__attribute__((always_inline))
-static inline int
-spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
-    spa_load_state_t state, spa_import_type_t type, boolean_t trust_config,
-    char **ereport)
+static int
+spa_verify_host(spa_t *spa, nvlist_t *mos_config)
+{
+       uint64_t hostid;
+       char *hostname;
+       uint64_t myhostid = 0;
+
+       if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
+           ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
+               hostname = fnvlist_lookup_string(mos_config,
+                   ZPOOL_CONFIG_HOSTNAME);
+
+               myhostid = zone_get_hostid(NULL);
+
+               if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
+                       cmn_err(CE_WARN, "pool '%s' could not be "
+                           "loaded as it was last accessed by "
+                           "another system (host: %s hostid: 0x%llx). "
+                           "See: http://illumos.org/msg/ZFS-8000-EY",
+                           spa_name(spa), hostname, (u_longlong_t)hostid);
+                       spa_load_failed(spa, "hostid verification failed: pool "
+                           "last accessed by host: %s (hostid: 0x%llx)",
+                           hostname, (u_longlong_t)hostid);
+                       return (SET_ERROR(EBADF));
+               }
+       }
+
+       return (0);
+}
+
+static int
+spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
 {
        int error = 0;
-       nvlist_t *nvroot = NULL;
-       nvlist_t *label;
-       vdev_t *rvd;
-       uberblock_t *ub = &spa->spa_uberblock;
-       uint64_t children, config_cache_txg = spa->spa_config_txg;
-       int orig_mode = spa->spa_mode;
+       nvlist_t *nvtree, *nvl, *config = spa->spa_config;
        int parse;
-       uint64_t obj;
-       boolean_t missing_feat_write = B_FALSE;
-       boolean_t activity_check = B_FALSE;
+       vdev_t *rvd;
+       uint64_t pool_guid;
+       char *comment;
+
+       /*
+        * Versioning wasn't explicitly added to the label until later, so if
+        * it's not present treat it as the initial version.
+        */
+       if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+           &spa->spa_ubsync.ub_version) != 0)
+               spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+
+       if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
+               spa_load_failed(spa, "invalid config provided: '%s' missing",
+                   ZPOOL_CONFIG_POOL_GUID);
+               return (SET_ERROR(EINVAL));
+       }
 
        /*
-        * If this is an untrusted config, access the pool in read-only mode.
-        * This prevents things like resilvering recently removed devices.
+        * If we are doing an import, ensure that the pool is not already
+        * imported by checking if its pool guid already exists in the
+        * spa namespace.
+        *
+        * The only case in which we allow an already imported pool to be
+        * imported again is when the pool is checkpointed and we want to
+        * look at its checkpointed state from userland tools like zdb.
         */
-       if (!trust_config)
-               spa->spa_mode = FREAD;
+#ifdef _KERNEL
+       if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
+           spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
+           spa_guid_exists(pool_guid, 0)) {
+#else
+       if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
+           spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
+           spa_guid_exists(pool_guid, 0) &&
+           !spa_importing_readonly_checkpoint(spa)) {
+#endif
+               spa_load_failed(spa, "a pool with guid %llu is already open",
+                   (u_longlong_t)pool_guid);
+               return (SET_ERROR(EEXIST));
+       }
 
-       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       spa->spa_config_guid = pool_guid;
 
-       spa->spa_load_state = state;
+       nvlist_free(spa->spa_load_info);
+       spa->spa_load_info = fnvlist_alloc();
 
-       if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
-               return (SET_ERROR(EINVAL));
+       ASSERT(spa->spa_comment == NULL);
+       if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
+               spa->spa_comment = spa_strdup(comment);
 
-       parse = (type == SPA_IMPORT_EXISTING ?
-           VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
+       (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+           &spa->spa_config_txg);
+
+       if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
+               spa->spa_config_splitting = fnvlist_dup(nvl);
+
+       if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
+               spa_load_failed(spa, "invalid config provided: '%s' missing",
+                   ZPOOL_CONFIG_VDEV_TREE);
+               return (SET_ERROR(EINVAL));
+       }
 
        /*
         * Create "The Godfather" zio to hold all async IOs
@@ -2618,11 +2733,16 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
         * configuration requires knowing the version number.
         */
        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-       error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
+       parse = (type == SPA_IMPORT_EXISTING ?
+           VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
+       error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
        spa_config_exit(spa, SCL_ALL, FTAG);
 
-       if (error != 0)
+       if (error != 0) {
+               spa_load_failed(spa, "unable to parse config [error=%d]",
+                   error);
                return (error);
+       }
 
        ASSERT(spa->spa_root_vdev == rvd);
        ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
@@ -2632,59 +2752,171 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                ASSERT(spa_guid(spa) == pool_guid);
        }
 
-       /*
-        * Try to open all vdevs, loading each label in the process.
-        */
-       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-       error = vdev_open(rvd);
-       spa_config_exit(spa, SCL_ALL, FTAG);
-       if (error != 0)
-               return (error);
+       return (0);
+}
+
+/*
+ * Recursively open all vdevs in the vdev tree. This function is called twice:
+ * first with the untrusted config, then with the trusted config.
+ *
+ * Before opening, spa_missing_tvds_allowed is picked from the tunable that
+ * matches the current state (trusted config, cachefile source, scan source,
+ * otherwise 0) and is then raised to at least zfs_max_missing_tvds.
+ *
+ * vdev_open() is called on the root vdev while holding SCL_ALL as writer.
+ * If spa_missing_tvds is nonzero, the config is trusted and spa_mode has
+ * FWRITE set, the result is forced to ENXIO regardless of what vdev_open()
+ * returned. Any error is reported through spa_load_failed(), and the vdev
+ * tree is printed with vdev_dbgmsg_print_tree() whenever top-level vdevs
+ * are missing or an error occurred.
+ *
+ * Returns 0 on success, otherwise the vdev_open() error or ENXIO.
+ */
+static int
+spa_ld_open_vdevs(spa_t *spa)
+{
+       int error = 0;
 
        /*
-        * We need to validate the vdev labels against the configuration that
-        * we have in hand, which is dependent on the setting of mosconfig. If
-        * mosconfig is true then we're validating the vdev labels based on
-        * that config.  Otherwise, we're validating against the cached config
-        * (zpool.cache) that was read when we loaded the zfs module, and then
-        * later we will recursively call spa_load() and validate against
-        * the vdev config.
-        *
-        * If we're assembling a new pool that's been split off from an
-        * existing pool, the labels haven't yet been updated so we skip
-        * validation for now.
+        * spa_missing_tvds_allowed defines how many top-level vdevs can be
+        * missing/unopenable for the root vdev to be still considered openable.
         */
-       if (type != SPA_IMPORT_ASSEMBLE) {
-               spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-               error = vdev_validate(rvd, trust_config);
-               spa_config_exit(spa, SCL_ALL, FTAG);
-
-               if (error != 0)
-                       return (error);
-
-               if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
-                       return (SET_ERROR(ENXIO));
+       if (spa->spa_trust_config) {
+               spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
+       } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
+               spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
+       } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
+               spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
+       } else {
+               spa->spa_missing_tvds_allowed = 0;
        }
 
-       /*
-        * Find the best uberblock.
-        */
-       vdev_uberblock_load(rvd, ub, &label);
+       spa->spa_missing_tvds_allowed =
+           MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
+
+       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+       error = vdev_open(spa->spa_root_vdev);
+       spa_config_exit(spa, SCL_ALL, FTAG);
+
+       if (spa->spa_missing_tvds != 0) {
+               spa_load_note(spa, "vdev tree has %lld missing top-level "
+                   "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
+               if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) {
+                       /*
+                        * Although theoretically we could allow users to open
+                        * incomplete pools in RW mode, we'd need to add a lot
+                        * of extra logic (e.g. adjust pool space to account
+                        * for missing vdevs).
+                        * This limitation also prevents users from accidentally
+                        * opening the pool in RW mode during data recovery and
+                        * damaging it further.
+                        */
+                       spa_load_note(spa, "pools with missing top-level "
+                           "vdevs can only be opened in read-only mode.");
+                       error = SET_ERROR(ENXIO);
+               } else {
+                       spa_load_note(spa, "current settings allow for maximum "
+                           "%lld missing top-level vdevs at this stage.",
+                           (u_longlong_t)spa->spa_missing_tvds_allowed);
+               }
+       }
+       if (error != 0) {
+               spa_load_failed(spa, "unable to open vdev tree [error=%d]",
+                   error);
+       }
+       if (spa->spa_missing_tvds != 0 || error != 0)
+               vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
+
+       return (error);
+}
+
+/*
+ * We need to validate the vdev labels against the configuration that
+ * we have in hand. This function is called twice: first with an untrusted
+ * config, then with a trusted config. The validation is more strict when the
+ * config is trusted.
+ *
+ * vdev_validate() is run on the root vdev while holding SCL_ALL as writer.
+ * Returns the vdev_validate() error if it fails, ENXIO if the root vdev is
+ * left in a state at or below VDEV_STATE_CANT_OPEN afterwards, and 0
+ * otherwise. Both failure paths are reported through spa_load_failed().
+ */
+static int
+spa_ld_validate_vdevs(spa_t *spa)
+{
+       int error = 0;
+       vdev_t *rvd = spa->spa_root_vdev;
+
+       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+       error = vdev_validate(rvd);
+       spa_config_exit(spa, SCL_ALL, FTAG);
+
+       if (error != 0) {
+               spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
+               return (error);
+       }
+
+       if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
+               spa_load_failed(spa, "cannot open vdev tree after invalidating "
+                   "some vdevs");
+               vdev_dbgmsg_print_tree(rvd, 2);
+               return (SET_ERROR(ENXIO));
+       }
+
+       return (0);
+}
+/*
+ * Record the outcome of uberblock selection in the in-core spa state; called
+ * by spa_ld_select_uberblock() on each of its successful return paths.
+ *
+ * Marks the pool POOL_STATE_ACTIVE, copies spa_uberblock into spa_ubsync and
+ * derives the following txg values:
+ *  - spa_verify_min_txg: TXG_INITIAL - 1 when spa_extreme_rewind is set,
+ *    otherwise the last synced txg minus TXG_DEFER_SIZE minus 1;
+ *  - spa_first_txg: spa_last_ubsync_txg when that is nonzero, otherwise the
+ *    last synced txg plus 1;
+ *  - spa_claim_max_txg: the same value as spa_first_txg.
+ * The ub argument supplies ub_software_version, which is saved in
+ * spa_prev_software_version.
+ */
+static void
+spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
+{
+       spa->spa_state = POOL_STATE_ACTIVE;
+       spa->spa_ubsync = spa->spa_uberblock;
+       spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
+           TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
+       spa->spa_first_txg = spa->spa_last_ubsync_txg ?
+           spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
+       spa->spa_claim_max_txg = spa->spa_first_txg;
+       spa->spa_prev_software_version = ub->ub_software_version;
+}
+
+static int
+spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+       nvlist_t *label;
+       uberblock_t *ub = &spa->spa_uberblock;
+       boolean_t activity_check = B_FALSE;
+
+       /*
+        * If we are opening the checkpointed state of the pool by
+        * rewinding to it, at this point we will have written the
+        * checkpointed uberblock to the vdev labels, so searching
+        * the labels will find the right uberblock.  However, if
+        * we are opening the checkpointed state read-only, we have
+        * not modified the labels. Therefore, we must ignore the
+        * labels and continue using the spa_uberblock that was set
+        * by spa_ld_checkpoint_rewind.
+        *
+        * Note that it would be fine to ignore the labels when
+        * rewinding (opening writeable) as well. However, if we
+        * crash just after writing the labels, we will end up
+        * searching the labels. Doing so in the common case means
+        * that this code path gets exercised normally, rather than
+        * just in the edge case.
+        */
+       if (ub->ub_checkpoint_txg != 0 &&
+           spa_importing_readonly_checkpoint(spa)) {
+               spa_ld_select_uberblock_done(spa, ub);
+               return (0);
+       }
+
+       /*
+        * Find the best uberblock.
+        */
+       vdev_uberblock_load(rvd, ub, &label);
 
        /*
         * If we weren't able to find a single valid uberblock, return failure.
         */
        if (ub->ub_txg == 0) {
                nvlist_free(label);
+               spa_load_failed(spa, "no valid uberblock found");
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
        }
 
+       spa_load_note(spa, "using uberblock with txg=%llu",
+           (u_longlong_t)ub->ub_txg);
+
+
        /*
         * For pools which have the multihost property on determine if the
         * pool is truly inactive and can be safely imported.  Prevent
         * hosts which don't have a hostid set from importing the pool.
         */
-       activity_check = spa_activity_check_required(spa, ub, label, config);
+       activity_check = spa_activity_check_required(spa, ub, label,
+           spa->spa_config);
        if (activity_check) {
                if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
                    spa_get_hostid() == 0) {
@@ -2694,7 +2926,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                        return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
                }
 
-               error = spa_activity_check(spa, ub, config);
+               int error = spa_activity_check(spa, ub, spa->spa_config);
                if (error) {
                        nvlist_free(label);
                        return (error);
@@ -2711,6 +2943,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
         */
        if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
                nvlist_free(label);
+               spa_load_failed(spa, "version %llu is not supported",
+                   (u_longlong_t)ub->ub_version);
                return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
        }
 
@@ -2721,9 +2955,17 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                 * If we weren't able to find what's necessary for reading the
                 * MOS in the label, return failure.
                 */
-               if (label == NULL || nvlist_lookup_nvlist(label,
-                   ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
+               if (label == NULL) {
+                       spa_load_failed(spa, "label config unavailable");
+                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+                           ENXIO));
+               }
+
+               if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
+                   &features) != 0) {
                        nvlist_free(label);
+                       spa_load_failed(spa, "invalid label: '%s' missing",
+                           ZPOOL_CONFIG_FEATURES_FOR_READ);
                        return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
                            ENXIO));
                }
@@ -2762,6 +3004,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                        VERIFY(nvlist_add_nvlist(spa->spa_load_info,
                            ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
                        nvlist_free(unsup_feat);
+                       spa_load_failed(spa, "some features are unsupported");
                        return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
                            ENOTSUP));
                }
@@ -2769,21 +3012,9 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                nvlist_free(unsup_feat);
        }
 
-       /*
-        * If the vdev guid sum doesn't match the uberblock, we have an
-        * incomplete configuration.  We first check to see if the pool
-        * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN).
-        * If it is, defer the vdev_guid_sum check till later so we
-        * can handle missing vdevs.
-        */
-       if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
-           &children) != 0 && trust_config && type != SPA_IMPORT_ASSEMBLE &&
-           rvd->vdev_guid_sum != ub->ub_guid_sum)
-               return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
-
        if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
                spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-               spa_try_repair(spa, config);
+               spa_try_repair(spa, spa->spa_config);
                spa_config_exit(spa, SCL_ALL, FTAG);
                nvlist_free(spa->spa_config_splitting);
                spa->spa_config_splitting = NULL;
@@ -2792,81 +3023,245 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
        /*
         * Initialize internal SPA structures.
         */
-       spa->spa_state = POOL_STATE_ACTIVE;
-       spa->spa_ubsync = spa->spa_uberblock;
-       spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
-           TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
-       spa->spa_first_txg = spa->spa_last_ubsync_txg ?
-           spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
-       spa->spa_claim_max_txg = spa->spa_first_txg;
-       spa->spa_prev_software_version = ub->ub_software_version;
+       spa_ld_select_uberblock_done(spa, ub);
 
-       /*
-        * Everything that we read before we do spa_remove_init() must
-        * have been rewritten after the last device removal was initiated.
-        * Otherwise we could be reading from indirect vdevs before
-        * we have loaded their mappings.
-        */
+       return (0);
+}
+/*
+ * Initialize the DSL pool for this spa at spa_first_txg via dsl_pool_init()
+ * and cache its meta objset pointer in spa_meta_objset.
+ *
+ * Returns 0 on success. On failure the dsl_pool_init() error is logged with
+ * spa_load_failed() and the result of
+ * spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO) is returned.
+ */
+static int
+spa_ld_open_rootbp(spa_t *spa)
+{
+       int error = 0;
+       vdev_t *rvd = spa->spa_root_vdev;
 
        error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
-       if (error)
+       if (error != 0) {
+               spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
+                   "[error=%d]", error);
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       }
        spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
 
-       if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
+       return (0);
+}
+/*
+ * Replace the vdev tree built from the provided (untrusted) config with one
+ * built from the config stored in the MOS, then open and validate it.
+ *
+ * type:      import type. For SPA_IMPORT_ASSEMBLE the function returns 0
+ *            right after looking up the MOS config object.
+ * reloading: when true, finding that the provided config misses too many
+ *            top-level vdevs fails the load instead of returning EAGAIN.
+ *
+ * Steps: look up DMU_POOL_CONFIG, load the MOS config nvlist, verify the
+ * host when the load state is SPA_LOAD_OPEN, parse the MOS vdev tree, copy
+ * vdev paths from the old tree into it, install it as spa_root_vdev (the old
+ * tree is closed and freed), regenerate spa_config while carrying over
+ * ZPOOL_LOAD_POLICY, set spa_trust_config, and re-run spa_ld_open_vdevs()
+ * and spa_ld_validate_vdevs(). Finally check for missing logs and compare
+ * the new tree's guid sum with the one in spa_uberblock.
+ *
+ * Returns 0 on success, EAGAIN when the spa must be reloaded using the MOS
+ * config, or another error describing the failure.
+ *
+ * NOTE(review): healthy_tvds_mos - healthy_tvds is computed on uint64_t
+ * values, so it wraps around if the MOS tree reports fewer healthy top-level
+ * vdevs than the provided tree did. Confirm that this cannot happen.
+ */
+static int
+spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
+    boolean_t reloading)
+{
+       vdev_t *mrvd, *rvd = spa->spa_root_vdev;
+       nvlist_t *nv, *mos_config, *policy;
+       int error = 0, copy_error;
+       uint64_t healthy_tvds, healthy_tvds_mos;
+       uint64_t mos_config_txg;
+
+       if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
+           != 0)
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
        /*
-        * Validate the config, using the MOS config to fill in any
-        * information which might be missing.  If we fail to validate
-        * the config then declare the pool unfit for use. If we're
-        * assembling a pool from a split, the log is not transferred
-        * over.
+        * If we're assembling a pool from a split, the config provided is
+        * already trusted so there is nothing to do.
         */
-       if (type != SPA_IMPORT_ASSEMBLE) {
-               nvlist_t *mos_config;
-               if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0)
-                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       if (type == SPA_IMPORT_ASSEMBLE)
+               return (0);
+
+       healthy_tvds = spa_healthy_core_tvds(spa);
+
+       if (load_nvlist(spa, spa->spa_config_object, &mos_config)
+           != 0) {
+               spa_load_failed(spa, "unable to retrieve MOS config");
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       }
 
-               if (!spa_config_valid(spa, mos_config)) {
+       /*
+        * If we are doing an open, pool owner wasn't verified yet, thus do
+        * the verification here.
+        */
+       if (spa->spa_load_state == SPA_LOAD_OPEN) {
+               error = spa_verify_host(spa, mos_config);
+               if (error != 0) {
                        nvlist_free(mos_config);
-                       return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
-                           ENXIO));
+                       return (error);
                }
-               nvlist_free(mos_config);
+       }
+
+       nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
+
+       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+       /*
+        * Build a new vdev tree from the trusted config
+        */
+       VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
+
+       /*
+        * Vdev paths in the MOS may be obsolete. If the untrusted config was
+        * obtained by scanning /dev/dsk, then it will have the right vdev
+        * paths. We update the trusted MOS config with this information.
+        * We first try to copy the paths with vdev_copy_path_strict, which
+        * succeeds only when both configs have exactly the same vdev tree.
+        * If that fails, we fall back to a more flexible method that has a
+        * best effort policy.
+        */
+       copy_error = vdev_copy_path_strict(rvd, mrvd);
+       if (copy_error != 0 || spa_load_print_vdev_tree) {
+               spa_load_note(spa, "provided vdev tree:");
+               vdev_dbgmsg_print_tree(rvd, 2);
+               spa_load_note(spa, "MOS vdev tree:");
+               vdev_dbgmsg_print_tree(mrvd, 2);
+       }
+       if (copy_error != 0) {
+               spa_load_note(spa, "vdev_copy_path_strict failed, falling "
+                   "back to vdev_copy_path_relaxed");
+               vdev_copy_path_relaxed(rvd, mrvd);
+       }
+
+       vdev_close(rvd);
+       vdev_free(rvd);
+       spa->spa_root_vdev = mrvd;
+       rvd = mrvd;
+       spa_config_exit(spa, SCL_ALL, FTAG);
+
+       /*
+        * We will use spa_config if we decide to reload the spa or if spa_load
+        * fails and we rewind. We must thus regenerate the config using the
+        * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
+        * pass settings on how to load the pool and is not stored in the MOS.
+        * We copy it over to our new, trusted config.
+        */
+       mos_config_txg = fnvlist_lookup_uint64(mos_config,
+           ZPOOL_CONFIG_POOL_TXG);
+       nvlist_free(mos_config);
+       mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
+       if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
+           &policy) == 0)
+               fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
+       spa_config_set(spa, mos_config);
+       spa->spa_config_source = SPA_CONFIG_SRC_MOS;
+
+       /*
+        * Now that we got the config from the MOS, we should be more strict
+        * in checking blkptrs and can make assumptions about the consistency
+        * of the vdev tree. spa_trust_config must be set to true before opening
+        * vdevs in order for them to be writeable.
+        */
+       spa->spa_trust_config = B_TRUE;
+
+       /*
+        * Open and validate the new vdev tree
+        */
+       error = spa_ld_open_vdevs(spa);
+       if (error != 0)
+               return (error);
+
+       error = spa_ld_validate_vdevs(spa);
+       if (error != 0)
+               return (error);
 
+       if (copy_error != 0 || spa_load_print_vdev_tree) {
+               spa_load_note(spa, "final vdev tree:");
+               vdev_dbgmsg_print_tree(rvd, 2);
+       }
+
+       if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
+           !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
                /*
-                * Now that we've validated the config, check the state of the
-                * root vdev.  If it can't be opened, it indicates one or
-                * more toplevel vdevs are faulted.
+                * Sanity check to make sure that we are indeed loading the
+                * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
+                * in the config provided and they happened to be the only ones
+                * to have the latest uberblock, we could involuntarily perform
+                * an extreme rewind.
                 */
-               if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
-                       return (SET_ERROR(ENXIO));
+               healthy_tvds_mos = spa_healthy_core_tvds(spa);
+               if (healthy_tvds_mos - healthy_tvds >=
+                   SPA_SYNC_MIN_VDEVS) {
+                       spa_load_note(spa, "config provided misses too many "
+                           "top-level vdevs compared to MOS (%lld vs %lld). ",
+                           (u_longlong_t)healthy_tvds,
+                           (u_longlong_t)healthy_tvds_mos);
+                       spa_load_note(spa, "vdev tree:");
+                       vdev_dbgmsg_print_tree(rvd, 2);
+                       if (reloading) {
+                               spa_load_failed(spa, "config was already "
+                                   "provided from MOS. Aborting.");
+                               return (spa_vdev_err(rvd,
+                                   VDEV_AUX_CORRUPT_DATA, EIO));
+                       }
+                       spa_load_note(spa, "spa must be reloaded using MOS "
+                           "config");
+                       return (SET_ERROR(EAGAIN));
+               }
+       }
+
+       error = spa_check_for_missing_logs(spa);
+       if (error != 0)
+               return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
+
+       if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
+               spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
+                   "guid sum (%llu != %llu)",
+                   (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
+                   (u_longlong_t)rvd->vdev_guid_sum);
+               return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
+                   ENXIO));
        }
 
+       return (0);
+}
+/*
+ * Run spa_remove_init() followed by spa_condense_init(). A failure of either
+ * call is logged with spa_load_failed() and reported through spa_vdev_err()
+ * with VDEV_AUX_CORRUPT_DATA. Note the asymmetry: a spa_remove_init()
+ * failure is reported as EIO, whereas a spa_condense_init() failure hands
+ * its own error code to spa_vdev_err().
+ *
+ * Returns 0 when both calls succeed.
+ */
+static int
+spa_ld_open_indirect_vdev_metadata(spa_t *spa)
+{
+       int error = 0;
+       vdev_t *rvd = spa->spa_root_vdev;
+
        /*
         * Everything that we read before spa_remove_init() must be stored
         * on concreted vdevs.  Therefore we do this as early as possible.
         */
-       if (spa_remove_init(spa) != 0)
+       error = spa_remove_init(spa);
+       if (error != 0) {
+               spa_load_failed(spa, "spa_remove_init failed [error=%d]",
+                   error);
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       }
+
+       /*
+        * Retrieve information needed to condense indirect vdev mappings.
+        */
+       error = spa_condense_init(spa);
+       if (error != 0) {
+               spa_load_failed(spa, "spa_condense_init failed [error=%d]",
+                   error);
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+       }
+
+       return (0);
+}
+
+static int
+spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
+{
+       int error = 0;
+       vdev_t *rvd = spa->spa_root_vdev;
 
        if (spa_version(spa) >= SPA_VERSION_FEATURES) {
                boolean_t missing_feat_read = B_FALSE;
                nvlist_t *unsup_feat, *enabled_feat;
 
                if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
-                   &spa->spa_feat_for_read_obj) != 0) {
+                   &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
                        return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
                }
 
                if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
-                   &spa->spa_feat_for_write_obj) != 0) {
+                   &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
                        return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
                }
 
                if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
-                   &spa->spa_feat_desc_obj) != 0) {
+                   &spa->spa_feat_desc_obj, B_TRUE) != 0) {
                        return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
                }
 
@@ -2877,10 +3272,11 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                    unsup_feat, enabled_feat))
                        missing_feat_read = B_TRUE;
 
-               if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
+               if (spa_writeable(spa) ||
+                   spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
                        if (!spa_features_check(spa, B_TRUE,
                            unsup_feat, enabled_feat)) {
-                               missing_feat_write = B_TRUE;
+                               *missing_feat_writep = B_TRUE;
                        }
                }
 
@@ -2919,8 +3315,9 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                 * userland in order to know whether to display the
                 * abovementioned note.
                 */
-               if (missing_feat_read || (missing_feat_write &&
+               if (missing_feat_read || (*missing_feat_writep &&
                    spa_writeable(spa))) {
+                       spa_load_failed(spa, "pool uses unsupported features");
                        return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
                            ENOTSUP));
                }
@@ -2940,6 +3337,9 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                                spa->spa_feat_refcount_cache[i] =
                                    SPA_FEATURE_DISABLED;
                        } else {
+                               spa_load_failed(spa, "error getting refcount "
+                                   "for feature %s [error=%d]",
+                                   spa_feature_table[i].fi_guid, error);
                                return (spa_vdev_err(rvd,
                                    VDEV_AUX_CORRUPT_DATA, EIO));
                        }
@@ -2948,50 +3348,36 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 
        if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
                if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
-                   &spa->spa_feat_enabled_txg_obj) != 0)
+                   &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
                        return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
        }
 
+       return (0);
+}
+/*
+ * Open the DSL pool with dsl_pool_open(). spa_is_initializing is set to
+ * B_TRUE for the duration of that call and cleared again afterwards.
+ *
+ * Returns 0 on success. On failure the error is logged with
+ * spa_load_failed() and the result of
+ * spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO) is returned.
+ */
+static int
+spa_ld_load_special_directories(spa_t *spa)
+{
+       int error = 0;
+       vdev_t *rvd = spa->spa_root_vdev;
+
        spa->spa_is_initializing = B_TRUE;
        error = dsl_pool_open(spa->spa_dsl_pool);
        spa->spa_is_initializing = B_FALSE;
-       if (error != 0)
+       if (error != 0) {
+               spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error);
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       }
 
-       if (!trust_config) {
-               uint64_t hostid;
-               nvlist_t *policy = NULL;
-               nvlist_t *mos_config;
-
-               if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0)
-                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
-
-               if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
-                   ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
-                       char *hostname;
-                       unsigned long myhostid = 0;
-
-                       VERIFY(nvlist_lookup_string(mos_config,
-                           ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
-
-                       myhostid = spa_get_hostid();
-                       if (hostid && myhostid && hostid != myhostid) {
-                               nvlist_free(mos_config);
-                               return (SET_ERROR(EBADF));
-                       }
-               }
-               if (nvlist_lookup_nvlist(spa->spa_config,
-                   ZPOOL_REWIND_POLICY, &policy) == 0)
-                       VERIFY(nvlist_add_nvlist(mos_config,
-                           ZPOOL_REWIND_POLICY, policy) == 0);
-
-               spa_config_set(spa, mos_config);
-               spa_unload(spa);
-               spa_deactivate(spa);
-               spa_activate(spa, orig_mode);
+       return (0);
+}
 
-               return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
-       }
+static int
+spa_ld_get_props(spa_t *spa)
+{
+       int error = 0;
+       uint64_t obj;
+       vdev_t *rvd = spa->spa_root_vdev;
 
        /* Grab the checksum salt from the MOS. */
        error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
@@ -3003,26 +3389,31 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
                    sizeof (spa->spa_cksum_salt.zcs_bytes));
        } else if (error != 0) {
+               spa_load_failed(spa, "unable to retrieve checksum salt from "
+                   "MOS [error=%d]", error);
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
        }
 
-       if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
+       if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
        error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
-       if (error != 0)
+       if (error != 0) {
+               spa_load_failed(spa, "error opening deferred-frees bpobj "
+                   "[error=%d]", error);
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       }
 
        /*
         * Load the bit that tells us to use the new accounting function
         * (raid-z deflation).  If we have an older pool, this will not
         * be present.
         */
-       error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
+       error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
        if (error != 0 && error != ENOENT)
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
        error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
-           &spa->spa_creation_version);
+           &spa->spa_creation_version, B_FALSE);
        if (error != 0 && error != ENOENT)
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
@@ -3030,12 +3421,13 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
         * Load the persistent error log.  If we have an older pool, this will
         * not be present.
         */
-       error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
+       error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
+           B_FALSE);
        if (error != 0 && error != ENOENT)
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
        error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
-           &spa->spa_errlog_scrub);
+           &spa->spa_errlog_scrub, B_FALSE);
        if (error != 0 && error != ENOENT)
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
@@ -3043,7 +3435,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
         * Load the history object.  If we have an older pool, this
         * will not be present.
         */
-       error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
+       error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
        if (error != 0 && error != ENOENT)
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
@@ -3056,11 +3448,13 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 
        /* The sentinel is only available in the MOS config. */
        nvlist_t *mos_config;
-       if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0)
+       if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
+               spa_load_failed(spa, "unable to retrieve MOS config");
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       }
 
        error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
-           &spa->spa_all_vdev_zaps);
+           &spa->spa_all_vdev_zaps, B_FALSE);
 
        if (error == ENOENT) {
                VERIFY(!nvlist_exists(mos_config,
@@ -3082,199 +3476,721 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                 */
                ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
        }
-       nvlist_free(mos_config);
+       nvlist_free(mos_config);
+
+       spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
+
+       error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
+           B_FALSE);
+       if (error && error != ENOENT)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+       if (error == 0) {
+               uint64_t autoreplace;
+
+               spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
+               spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
+               spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
+               spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
+               spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
+               spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
+               spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
+                   &spa->spa_dedup_ditto);
+
+               spa->spa_autoreplace = (autoreplace != 0);
+       }
+
+       /*
+        * If we are importing a pool with missing top-level vdevs,
+        * we enforce that the pool doesn't panic or get suspended on
+        * error since the likelihood of missing data is extremely high.
+        */
+       if (spa->spa_missing_tvds > 0 &&
+           spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
+           spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
+               spa_load_note(spa, "forcing failmode to 'continue' "
+                   "as some top level vdevs are missing");
+               spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
+       }
+
+       return (0);
+}
+
+/*
+ * Look up the hot spare and L2ARC (cache) device lists in the pool directory
+ * and load them.  A missing entry (ENOENT) is not an error; any other lookup
+ * failure, or a failure to read the stored nvlist, is returned through
+ * spa_vdev_err() on the root vdev as VDEV_AUX_CORRUPT_DATA with EIO.  When
+ * 'type' is SPA_IMPORT_ASSEMBLE the aux devices are not loaded; the
+ * corresponding sav_sync flag is set to B_TRUE instead.  Returns 0 on
+ * success.
+ */
+static int
+spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
+{
+       int error = 0;
+       vdev_t *rvd = spa->spa_root_vdev;
+
+       /*
+        * If we're assembling the pool from the split-off vdevs of
+        * an existing pool, we don't want to attach the spares & cache
+        * devices.
+        */
+
+       /*
+        * Load any hot spares for this pool.
+        */
+       error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
+           B_FALSE);
+       if (error != 0 && error != ENOENT)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
+               ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
+               if (load_nvlist(spa, spa->spa_spares.sav_object,
+                   &spa->spa_spares.sav_config) != 0) {
+                       spa_load_failed(spa, "error loading spares nvlist");
+                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+               }
+
+               spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+               spa_load_spares(spa);
+               spa_config_exit(spa, SCL_ALL, FTAG);
+       } else if (error == 0) {
+               spa->spa_spares.sav_sync = B_TRUE;
+       }
+
+       /*
+        * Load any level 2 ARC devices for this pool.
+        */
+       error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
+           &spa->spa_l2cache.sav_object, B_FALSE);
+       if (error != 0 && error != ENOENT)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
+               ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
+               if (load_nvlist(spa, spa->spa_l2cache.sav_object,
+                   &spa->spa_l2cache.sav_config) != 0) {
+                       spa_load_failed(spa, "error loading l2cache nvlist");
+                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+               }
+
+               spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+               spa_load_l2cache(spa);
+               spa_config_exit(spa, SCL_ALL, FTAG);
+       } else if (error == 0) {
+               spa->spa_l2cache.sav_sync = B_TRUE;
+       }
+
+       return (0);
+}
+
+/*
+ * Load the metadata of the pool's vdev tree via vdev_load() and then
+ * propagate the leaf DTLs up the tree.  Before doing so, fail the load with
+ * EREMOTEIO (VDEV_AUX_ACTIVE) when multihost is enabled but the hostid is
+ * zero, unless ZFS_IMPORT_SKIP_MMP is set, and, when autoreplace is enabled
+ * and this is not a try-import, check for removed devices.  Returns 0 on
+ * success or the result of spa_vdev_err() on failure.
+ */
+static int
+spa_ld_load_vdev_metadata(spa_t *spa)
+{
+       int error = 0;
+       vdev_t *rvd = spa->spa_root_vdev;
+
+       /*
+        * If the 'multihost' property is set, then never allow a pool to
+        * be imported when the system hostid is zero.  The exception to
+        * this rule is zdb which is always allowed to access pools.
+        */
+       if (spa_multihost(spa) && spa_get_hostid() == 0 &&
+           (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) {
+               fnvlist_add_uint64(spa->spa_load_info,
+                   ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
+               return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
+       }
+
+       /*
+        * If the 'autoreplace' property is set, then post a resource notifying
+        * the ZFS DE that it should not issue any faults for unopenable
+        * devices.  We also iterate over the vdevs, and post a sysevent for any
+        * unopenable vdevs so that the normal autoreplace handler can take
+        * over.
+        */
+       if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
+               spa_check_removed(spa->spa_root_vdev);
+               /*
+                * For the import case, this is done in spa_import(), because
+                * at this point we're using the spare definitions from
+                * the MOS config, not necessarily from the userland config.
+                */
+               if (spa->spa_load_state != SPA_LOAD_IMPORT) {
+                       spa_aux_check_removed(&spa->spa_spares);
+                       spa_aux_check_removed(&spa->spa_l2cache);
+               }
+       }
+
+       /*
+        * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc.
+        */
+       error = vdev_load(rvd);
+       if (error != 0) {
+               spa_load_failed(spa, "vdev_load failed [error=%d]", error);
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+       }
+
+       /*
+        * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
+        */
+       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+       vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
+       spa_config_exit(spa, SCL_ALL, FTAG);
+
+       return (0);
+}
+
+/*
+ * Load the dedup tables via ddt_load().  A failure is logged with the
+ * original error code but returned through spa_vdev_err() on the root vdev
+ * as VDEV_AUX_CORRUPT_DATA with EIO.  Returns 0 on success.
+ */
+static int
+spa_ld_load_dedup_tables(spa_t *spa)
+{
+       int error = 0;
+       vdev_t *rvd = spa->spa_root_vdev;
+
+       error = ddt_load(spa);
+       if (error != 0) {
+               spa_load_failed(spa, "ddt_load failed [error=%d]", error);
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       }
+
+       return (0);
+}
+
+/*
+ * Check the pool's logs with spa_check_logs().  The check is skipped when
+ * 'type' is SPA_IMPORT_ASSEMBLE or when the pool is not writeable.  If the
+ * check fails while top-level vdevs are missing (spa_missing_tvds != 0), the
+ * failure is only noted; otherwise *ereport is set to
+ * FM_EREPORT_ZFS_LOG_REPLAY and the load fails with ENXIO
+ * (VDEV_AUX_BAD_LOG).  Returns 0 when the load may continue.
+ */
+static int
+spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+
+       if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
+               boolean_t missing = spa_check_logs(spa);
+               if (missing) {
+                       if (spa->spa_missing_tvds != 0) {
+                               spa_load_note(spa, "spa_check_logs failed "
+                                   "so dropping the logs");
+                       } else {
+                               *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
+                               spa_load_failed(spa, "spa_check_logs failed");
+                               return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
+                                   ENXIO));
+                       }
+               }
+       }
+
+       return (0);
+}
+
+/*
+ * Run spa_load_verify() on the pool unless the load state is
+ * SPA_LOAD_TRYIMPORT.  A verification failure is logged and returned through
+ * spa_vdev_err() as VDEV_AUX_CORRUPT_DATA with the original error code.
+ * Returns 0 on success or when verification is skipped.
+ */
+static int
+spa_ld_verify_pool_data(spa_t *spa)
+{
+       int error = 0;
+       vdev_t *rvd = spa->spa_root_vdev;
+
+       /*
+        * We've successfully opened the pool, verify that we're ready
+        * to start pushing transactions.
+        */
+       if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
+               error = spa_load_verify(spa);
+               if (error != 0) {
+                       spa_load_failed(spa, "spa_load_verify failed "
+                           "[error=%d]", error);
+                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+                           error));
+               }
+       }
+
+       return (0);
+}
+
+/*
+ * Claim ZIL blocks by running zil_claim() through dmu_objset_find_dp(),
+ * starting at the pool's root directory object with DS_FIND_CHILDREN, all
+ * within a single tx assigned to spa_first_txg().  spa_claiming is set for
+ * the duration of the traversal and the log state is set to SPA_LOG_GOOD
+ * afterwards.  The return value of the traversal is deliberately ignored.
+ */
+static void
+spa_ld_claim_log_blocks(spa_t *spa)
+{
+       dmu_tx_t *tx;
+       dsl_pool_t *dp = spa_get_dsl(spa);
+
+       /*
+        * Claim log blocks that haven't been committed yet.
+        * This must all happen in a single txg.
+        * Note: spa_claim_max_txg is updated by spa_claim_notify(),
+        * invoked from zil_claim_log_block()'s i/o done callback.
+        * Price of rollback is that we abandon the log.
+        */
+       spa->spa_claiming = B_TRUE;
+
+       tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
+       (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+           zil_claim, tx, DS_FIND_CHILDREN);
+       dmu_tx_commit(tx);
+
+       spa->spa_claiming = B_FALSE;
+
+       spa_set_log_state(spa, SPA_LOG_GOOD);
+}
+
+/*
+ * Request an asynchronous config update (SPA_ASYNC_CONFIG_UPDATE) if the
+ * caller asked for one via 'update_config_cache', if 'config_cache_txg'
+ * differs from spa_config_txg, if the load state is SPA_LOAD_IMPORT or
+ * SPA_LOAD_RECOVER, if ZFS_IMPORT_VERBATIM is set, or if any child of the
+ * root vdev has vdev_ms_array == 0.
+ */
+static void
+spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
+    boolean_t update_config_cache)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+       int need_update = B_FALSE;
+
+       /*
+        * If the config cache is stale, or we have uninitialized
+        * metaslabs (see spa_vdev_add()), then update the config.
+        *
+        * If this is a verbatim import, trust the current
+        * in-core spa_config and update the disk labels.
+        */
+       if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
+           spa->spa_load_state == SPA_LOAD_IMPORT ||
+           spa->spa_load_state == SPA_LOAD_RECOVER ||
+           (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
+               need_update = B_TRUE;
+
+       for (int c = 0; c < rvd->vdev_children; c++)
+               if (rvd->vdev_child[c]->vdev_ms_array == 0)
+                       need_update = B_TRUE;
+
+       /*
+        * Update the config cache asynchronously in case we're the
+        * root pool, in which case the config cache isn't writable yet.
+        */
+       if (need_update)
+               spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+}
+
+/*
+ * Unload and deactivate the spa, then activate it again with the same mode
+ * so that the load process can be repeated.  The value of
+ * spa_async_suspended is preserved across the cycle.
+ */
+static void
+spa_ld_prepare_for_reload(spa_t *spa)
+{
+       int mode = spa->spa_mode;
+       int async_suspended = spa->spa_async_suspended;
+
+       spa_unload(spa);
+       spa_deactivate(spa);
+       spa_activate(spa, mode);
+
+       /*
+        * We save the value of spa_async_suspended as it gets reset to 0 by
+        * spa_unload(). We want to restore it back to the original value before
+        * returning as we might be calling spa_async_resume() later.
+        */
+       spa->spa_async_suspended = async_suspended;
+}
+
+/*
+ * Look up the checkpointed uberblock (DMU_POOL_ZPOOL_CHECKPOINT) in the pool
+ * directory and, if one exists, record its txg in spa_checkpoint_txg and its
+ * timestamp in spa_checkpoint_info.  The absence of a checkpoint (ENOENT) is
+ * not an error.  Returns 0 on success or the zap_lookup() error otherwise.
+ * The caller must hold spa_namespace_lock, and spa_checkpoint_txg is
+ * expected to be zero on entry.
+ */
+static int
+spa_ld_read_checkpoint_txg(spa_t *spa)
+{
+       uberblock_t checkpoint;
+       int error = 0;
+
+       ASSERT0(spa->spa_checkpoint_txg);
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+       error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+           sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+       if (error == ENOENT)
+               return (0);
+
+       if (error != 0)
+               return (error);
+
+       ASSERT3U(checkpoint.ub_txg, !=, 0);
+       ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
+       ASSERT3U(checkpoint.ub_timestamp, !=, 0);
+       spa->spa_checkpoint_txg = checkpoint.ub_txg;
+       spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
+
+       return (0);
+}
+
+/*
+ * First phase of loading a pool: parse the provided config into a vdev tree,
+ * open the vdevs, validate them (skipped for SPA_IMPORT_ASSEMBLE), select an
+ * uberblock and open the root block pointer so that the MOS can be read.
+ * Unless 'type' is SPA_IMPORT_ASSEMBLE, spa_trust_config is cleared first.
+ * Returns 0 on success or the error of the first step that fails.  The
+ * caller must hold spa_namespace_lock.
+ */
+static int
+spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
+{
+       int error = 0;
+
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
+
+       /*
+        * Never trust the config that is provided unless we are assembling
+        * a pool following a split.
+        * This means don't trust blkptrs and the vdev tree in general. This
+        * also effectively puts the spa in read-only mode since
+        * spa_writeable() checks for spa_trust_config to be true.
+        * We will later load a trusted config from the MOS.
+        */
+       if (type != SPA_IMPORT_ASSEMBLE)
+               spa->spa_trust_config = B_FALSE;
+
+       /*
+        * Parse the config provided to create a vdev tree.
+        */
+       error = spa_ld_parse_config(spa, type);
+       if (error != 0)
+               return (error);
+
+       /*
+        * Now that we have the vdev tree, try to open each vdev. This involves
+        * opening the underlying physical device, retrieving its geometry and
+        * probing the vdev with a dummy I/O. The state of each vdev will be set
+        * based on the success of those operations. After this we'll be ready
+        * to read from the vdevs.
+        */
+       error = spa_ld_open_vdevs(spa);
+       if (error != 0)
+               return (error);
+
+       /*
+        * Read the label of each vdev and make sure that the GUIDs stored
+        * there match the GUIDs in the config provided.
+        * If we're assembling a new pool that's been split off from an
+        * existing pool, the labels haven't yet been updated so we skip
+        * validation for now.
+        */
+       if (type != SPA_IMPORT_ASSEMBLE) {
+               error = spa_ld_validate_vdevs(spa);
+               if (error != 0)
+                       return (error);
+       }
+
+       /*
+        * Read all vdev labels to find the best uberblock (i.e. latest,
+        * unless spa_load_max_txg is set) and store it in spa_uberblock. We
+        * get the list of features required to read blkptrs in the MOS from
+        * the vdev label with the best uberblock and verify that our version
+        * of zfs supports them all.
+        */
+       error = spa_ld_select_uberblock(spa, type);
+       if (error != 0)
+               return (error);
+
+       /*
+        * Pass that uberblock to the dsl_pool layer which will open the root
+        * blkptr. This blkptr points to the latest version of the MOS and will
+        * allow us to read its contents.
+        */
+       error = spa_ld_open_rootbp(spa);
+       if (error != 0)
+               return (error);
+
+       return (0);
+}
+
+/*
+ * Replace the pool's current uberblock with the checkpointed uberblock
+ * stored in the pool directory.  The checkpointed uberblock's txg is set to
+ * one past the current uberblock's txg and its timestamp is refreshed.  If
+ * the pool is writeable, the updated uberblock is also written out through
+ * vdev_config_sync() using up to SPA_SYNC_MIN_VDEVS children of the root
+ * vdev, picked starting at a random child and skipping log, non-concrete
+ * and metaslab-less vdevs.  Returns ZFS_ERR_NO_CHECKPOINT if no checkpoint
+ * exists, another error on lookup or sync failure, and 0 on success.  The
+ * caller must hold spa_namespace_lock and ZFS_IMPORT_CHECKPOINT must be set.
+ */
+static int
+spa_ld_checkpoint_rewind(spa_t *spa)
+{
+       uberblock_t checkpoint;
+       int error = 0;
+
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+
+       error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+           sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+       if (error != 0) {
+               spa_load_failed(spa, "unable to retrieve checkpointed "
+                   "uberblock from the MOS config [error=%d]", error);
+
+               if (error == ENOENT)
+                       error = ZFS_ERR_NO_CHECKPOINT;
+
+               return (error);
+       }
+
+       ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
+       ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
+
+       /*
+        * We need to update the txg and timestamp of the checkpointed
+        * uberblock to be higher than the latest one. This ensures that
+        * the checkpointed uberblock is selected if we were to close and
+        * reopen the pool right after we've written it in the vdev labels.
+        * (also see block comment in vdev_uberblock_compare)
+        */
+       checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
+       checkpoint.ub_timestamp = gethrestime_sec();
+
+       /*
+        * Set current uberblock to be the checkpointed uberblock.
+        */
+       spa->spa_uberblock = checkpoint;
+
+       /*
+        * If we are doing a normal rewind, then the pool is open for
+        * writing and we sync the "updated" checkpointed uberblock to
+        * disk. Once this is done, we've basically rewound the whole
+        * pool and there is no way back.
+        *
+        * There are cases when we don't want to attempt and sync the
+        * checkpointed uberblock to disk because we are opening a
+        * pool as read-only. Specifically, verifying the checkpointed
+        * state with zdb, and importing the checkpointed state to get
+        * a "preview" of its content.
+        */
+       if (spa_writeable(spa)) {
+               vdev_t *rvd = spa->spa_root_vdev;
+
+               spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+               vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
+               int svdcount = 0;
+               int children = rvd->vdev_children;
+               int c0 = spa_get_random(children);
+
+               for (int c = 0; c < children; c++) {
+                       vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
+
+                       /* Stop when revisiting the first vdev */
+                       if (c > 0 && svd[0] == vd)
+                               break;
+
+                       if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
+                           !vdev_is_concrete(vd))
+                               continue;
+
+                       svd[svdcount++] = vd;
+                       if (svdcount == SPA_SYNC_MIN_VDEVS)
+                               break;
+               }
+               error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
+               if (error == 0)
+                       spa->spa_last_synced_guid = rvd->vdev_guid;
+               spa_config_exit(spa, SCL_ALL, FTAG);
+
+               if (error != 0) {
+                       spa_load_failed(spa, "failed to write checkpointed "
+                           "uberblock to the vdev labels [error=%d]", error);
+                       return (error);
+               }
+       }
+
+       return (0);
+}
+
+/*
+ * Open the MOS using the provided config (spa_ld_mos_init()) and then load
+ * the trusted config stored in the MOS (spa_ld_trusted_config()).  If the
+ * latter returns EAGAIN, the spa is reset with spa_ld_prepare_for_reload()
+ * and both steps are repeated once; in that case *update_config_cache, when
+ * the pointer is non-NULL, is set to B_TRUE.  Returns 0 on success or the
+ * first error encountered.
+ */
+static int
+spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
+    boolean_t *update_config_cache)
+{
+       int error;
 
        /*
-        * If we're assembling the pool from the split-off vdevs of
-        * an existing pool, we don't want to attach the spares & cache
-        * devices.
+        * Parse the config for pool, open and validate vdevs,
+        * select an uberblock, and use that uberblock to open
+        * the MOS.
         */
+       error = spa_ld_mos_init(spa, type);
+       if (error != 0)
+               return (error);
 
        /*
-        * Load any hot spares for this pool.
+        * Retrieve the trusted config stored in the MOS and use it to create
+        * a new, exact version of the vdev tree, then reopen all vdevs.
         */
-       error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
-       if (error != 0 && error != ENOENT)
-               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
-       if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
-               ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
-               if (load_nvlist(spa, spa->spa_spares.sav_object,
-                   &spa->spa_spares.sav_config) != 0)
-                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       error = spa_ld_trusted_config(spa, type, B_FALSE);
+       if (error == EAGAIN) {
+               if (update_config_cache != NULL)
+                       *update_config_cache = B_TRUE;
 
-               spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-               spa_load_spares(spa);
-               spa_config_exit(spa, SCL_ALL, FTAG);
-       } else if (error == 0) {
-               spa->spa_spares.sav_sync = B_TRUE;
-       }
+               /*
+                * Redo the loading process with the trusted config if it is
+                * too different from the untrusted config.
+                */
+               spa_ld_prepare_for_reload(spa);
+               spa_load_note(spa, "RELOADING");
+               error = spa_ld_mos_init(spa, type);
+               if (error != 0)
+                       return (error);
 
-       /*
-        * Load any level 2 ARC devices for this pool.
-        */
-       error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
-           &spa->spa_l2cache.sav_object);
-       if (error != 0 && error != ENOENT)
-               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
-       if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
-               ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
-               if (load_nvlist(spa, spa->spa_l2cache.sav_object,
-                   &spa->spa_l2cache.sav_config) != 0)
-                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+               error = spa_ld_trusted_config(spa, type, B_TRUE);
+               if (error != 0)
+                       return (error);
 
-               spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-               spa_load_l2cache(spa);
-               spa_config_exit(spa, SCL_ALL, FTAG);
-       } else if (error == 0) {
-               spa->spa_l2cache.sav_sync = B_TRUE;
+       } else if (error != 0) {
+               return (error);
        }
 
-       spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
+       return (0);
+}
 
-       error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
-       if (error && error != ENOENT)
-               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+/*
+ * Load an existing storage pool, using the config provided. This config
+ * describes which vdevs are part of the pool and is later validated against
+ * partial configs present in each vdev's label and an entire copy of the
+ * config stored in the MOS.
+ */
+static int
+spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
+{
+       int error = 0;
+       boolean_t missing_feat_write = B_FALSE;
+       boolean_t checkpoint_rewind =
+           (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+       boolean_t update_config_cache = B_FALSE;
 
-       if (error == 0) {
-               uint64_t autoreplace = 0;
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
 
-               spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
-               spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
-               spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
-               spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
-               spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
-               spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
-               spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
-                   &spa->spa_dedup_ditto);
+       spa_load_note(spa, "LOADING");
 
-               spa->spa_autoreplace = (autoreplace != 0);
-       }
+       error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
+       if (error != 0)
+               return (error);
 
        /*
-        * If the 'multihost' property is set, then never allow a pool to
-        * be imported when the system hostid is zero.  The exception to
-        * this rule is zdb which is always allowed to access pools.
+        * If we are rewinding to the checkpoint then we need to repeat
+        * everything we've done so far in this function but this time
+        * selecting the checkpointed uberblock and using that to open
+        * the MOS.
         */
-       if (spa_multihost(spa) && spa_get_hostid() == 0 &&
-           (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) {
-               fnvlist_add_uint64(spa->spa_load_info,
-                   ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
-               return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
-       }
+       if (checkpoint_rewind) {
+               /*
+                * If we are rewinding to the checkpoint update config cache
+                * anyway.
+                */
+               update_config_cache = B_TRUE;
 
-       /*
-        * If the 'autoreplace' property is set, then post a resource notifying
-        * the ZFS DE that it should not issue any faults for unopenable
-        * devices.  We also iterate over the vdevs, and post a sysevent for any
-        * unopenable vdevs so that the normal autoreplace handler can take
-        * over.
-        */
-       if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
-               spa_check_removed(spa->spa_root_vdev);
                /*
-                * For the import case, this is done in spa_import(), because
-                * at this point we're using the spare definitions from
-                * the MOS config, not necessarily from the userland config.
+                * Extract the checkpointed uberblock from the current MOS
+                * and use this as the pool's uberblock from now on. If the
+                * pool is imported as writeable we also write the checkpoint
+                * uberblock to the labels, making the rewind permanent.
                 */
-               if (state != SPA_LOAD_IMPORT) {
-                       spa_aux_check_removed(&spa->spa_spares);
-                       spa_aux_check_removed(&spa->spa_l2cache);
-               }
+               error = spa_ld_checkpoint_rewind(spa);
+               if (error != 0)
+                       return (error);
+
+               /*
+                * Redo the loading process again with the
+                * checkpointed uberblock.
+                */
+               spa_ld_prepare_for_reload(spa);
+               spa_load_note(spa, "LOADING checkpointed uberblock");
+               error = spa_ld_mos_with_trusted_config(spa, type, NULL);
+               if (error != 0)
+                       return (error);
        }
 
        /*
-        * Load the vdev state for all toplevel vdevs.
+        * Retrieve the checkpoint txg if the pool has a checkpoint.
         */
-       error = vdev_load(rvd);
-       if (error != 0) {
-               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
-       }
+       error = spa_ld_read_checkpoint_txg(spa);
+       if (error != 0)
+               return (error);
 
-       error = spa_condense_init(spa);
-       if (error != 0) {
-               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
-       }
+       /*
+        * Retrieve the mapping of indirect vdevs. Those vdevs were removed
+        * from the pool and their contents were re-mapped to other vdevs. Note
+        * that everything that we read before this step must have been
+        * rewritten on concrete vdevs after the last device removal was
+        * initiated. Otherwise we could be reading from indirect vdevs before
+        * we have loaded their mappings.
+        */
+       error = spa_ld_open_indirect_vdev_metadata(spa);
+       if (error != 0)
+               return (error);
 
        /*
-        * Propagate the leaf DTLs we just loaded all the way up the tree.
+        * Retrieve the full list of active features from the MOS and check if
+        * they are all supported.
         */
-       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-       vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
-       spa_config_exit(spa, SCL_ALL, FTAG);
+       error = spa_ld_check_features(spa, &missing_feat_write);
+       if (error != 0)
+               return (error);
 
        /*
-        * Load the DDTs (dedup tables).
+        * Load several special directories from the MOS needed by the dsl_pool
+        * layer.
         */
-       error = ddt_load(spa);
+       error = spa_ld_load_special_directories(spa);
        if (error != 0)
-               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+               return (error);
 
-       spa_update_dspace(spa);
+       /*
+        * Retrieve pool properties from the MOS.
+        */
+       error = spa_ld_get_props(spa);
+       if (error != 0)
+               return (error);
 
-       if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa) &&
-           spa_check_logs(spa)) {
-               *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
-               return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
-       }
+       /*
+        * Retrieve the list of auxiliary devices - cache devices and spares -
+        * and open them.
+        */
+       error = spa_ld_open_aux_vdevs(spa, type);
+       if (error != 0)
+               return (error);
+
+       /*
+        * Load the metadata for all vdevs. Also check if unopenable devices
+        * should be autoreplaced.
+        */
+       error = spa_ld_load_vdev_metadata(spa);
+       if (error != 0)
+               return (error);
+
+       error = spa_ld_load_dedup_tables(spa);
+       if (error != 0)
+               return (error);
+
+       /*
+        * Verify the logs now to make sure we don't have any unexpected errors
+        * when we claim log blocks later.
+        */
+       error = spa_ld_verify_logs(spa, type, ereport);
+       if (error != 0)
+               return (error);
 
        if (missing_feat_write) {
-               ASSERT(state == SPA_LOAD_TRYIMPORT);
+               ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
 
                /*
                 * At this point, we know that we can open the pool in
                 * read-only mode but not read-write mode. We now have enough
                 * information and can return to userland.
                 */
-               return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
+               return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
+                   ENOTSUP));
        }
 
        /*
-        * We've successfully opened the pool, verify that we're ready
-        * to start pushing transactions.
+        * Traverse the last txgs to make sure the pool was left off in a safe
+        * state. When performing an extreme rewind, we verify the whole pool,
+        * which can take a very long time.
         */
-       if (state != SPA_LOAD_TRYIMPORT) {
-               if ((error = spa_load_verify(spa)))
-                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
-                           error));
-       }
+       error = spa_ld_verify_pool_data(spa);
+       if (error != 0)
+               return (error);
 
-       if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
+       /*
+        * Calculate the deflated space for the pool. This must be done before
+        * we write anything to the pool because we'd need to update the space
+        * accounting using the deflated sizes.
+        */
+       spa_update_dspace(spa);
+
+       /*
+        * We have now retrieved all the information we needed to open the
+        * pool. If we are importing the pool in read-write mode, a few
+        * additional steps must be performed to finish the import.
+        */
+       if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
            spa->spa_load_max_txg == UINT64_MAX)) {
-               dmu_tx_t *tx;
-               int need_update = B_FALSE;
-               dsl_pool_t *dp = spa_get_dsl(spa);
+               uint64_t config_cache_txg = spa->spa_config_txg;
+
+               ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
 
                /*
-                * We must check this before we start the sync thread, because
-                * we only want to start a condense thread for condense
-                * operations that were in progress when the pool was
-                * imported.  Once we start syncing, spa_sync() could
-                * initiate a condense (and start a thread for it).  In
-                * that case it would be wrong to start a second
-                * condense thread.
+                * In case of a checkpoint rewind, log the original txg
+                * of the checkpointed uberblock.
                 */
-               boolean_t condense_in_progress =
-                   (spa->spa_condensing_indirect != NULL);
-
-               ASSERT(state != SPA_LOAD_TRYIMPORT);
+               if (checkpoint_rewind) {
+                       spa_history_log_internal(spa, "checkpoint rewind",
+                           NULL, "rewound state to txg=%llu",
+                           (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
+               }
 
                /*
-                * Claim log blocks that haven't been committed yet.
-                * This must all happen in a single txg.
-                * Note: spa_claim_max_txg is updated by spa_claim_notify(),
-                * invoked from zil_claim_log_block()'s i/o done callback.
-                * Price of rollback is that we abandon the log.
+                * Traverse the ZIL and claim all blocks.
                 */
-               spa->spa_claiming = B_TRUE;
-
-               tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
-               (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
-                   zil_claim, tx, DS_FIND_CHILDREN);
-               dmu_tx_commit(tx);
-
-               spa->spa_claiming = B_FALSE;
+               spa_ld_claim_log_blocks(spa);
 
-               spa_set_log_state(spa, SPA_LOG_GOOD);
+               /*
+                * Kick-off the syncing thread.
+                */
                spa->spa_sync_on = B_TRUE;
                txg_sync_start(spa->spa_dsl_pool);
                mmp_thread_start(spa);
@@ -3283,40 +4199,24 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                 * Wait for all claims to sync.  We sync up to the highest
                 * claimed log block birth time so that claimed log blocks
                 * don't appear to be from the future.  spa_claim_max_txg
-                * will have been set for us by either zil_check_log_chain()
-                * (invoked from spa_check_logs()) or zil_claim() above.
+                * will have been set for us by ZIL traversal operations
+                * performed above.
                 */
                txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
 
                /*
-                * If the config cache is stale, or we have uninitialized
-                * metaslabs (see spa_vdev_add()), then update the config.
-                *
-                * If this is a verbatim import, trust the current
-                * in-core spa_config and update the disk labels.
-                */
-               if (config_cache_txg != spa->spa_config_txg ||
-                   state == SPA_LOAD_IMPORT ||
-                   state == SPA_LOAD_RECOVER ||
-                   (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
-                       need_update = B_TRUE;
-
-               for (int c = 0; c < rvd->vdev_children; c++)
-                       if (rvd->vdev_child[c]->vdev_ms_array == 0)
-                               need_update = B_TRUE;
-
-               /*
-                * Update the config cache asychronously in case we're the
-                * root pool, in which case the config cache isn't writable yet.
+                * Check if we need to request an update of the config. On the
+                * next sync, we would update the config stored in vdev labels
+                * and the cachefile (by default /etc/zfs/zpool.cache).
                 */
-               if (need_update)
-                       spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+               spa_ld_check_for_config_update(spa, config_cache_txg,
+                   update_config_cache);
 
                /*
                 * Check all DTLs to see if anything needs resilvering.
                 */
                if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
-                   vdev_resilver_needed(rvd, NULL, NULL))
+                   vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
                        spa_async_request(spa, SPA_ASYNC_RESILVER);
 
                /*
@@ -3336,22 +4236,18 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                 */
                dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
 
-               /*
-                * Note: unlike condensing, we don't need an analogous
-                * "removal_in_progress" dance because no other thread
-                * can start a removal while we hold the spa_namespace_lock.
-                */
                spa_restart_removal(spa);
 
-               if (condense_in_progress)
-                       spa_condense_indirect_restart(spa);
+               spa_spawn_aux_threads(spa);
        }
 
+       spa_load_note(spa, "LOADED");
+
        return (0);
 }
 
 static int
-spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
+spa_load_retry(spa_t *spa, spa_load_state_t state)
 {
        int mode = spa->spa_mode;
 
@@ -3363,7 +4259,10 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
        spa_activate(spa, mode);
        spa_async_suspend(spa);
 
-       return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
+       spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
+           (u_longlong_t)spa->spa_load_max_txg);
+
+       return (spa_load(spa, state, SPA_IMPORT_EXISTING));
 }
 
 /*
@@ -3374,8 +4273,8 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
  * spa_load().
  */
 static int
-spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
-    uint64_t max_request, int rewind_flags)
+spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
+    int rewind_flags)
 {
        nvlist_t *loadinfo = NULL;
        nvlist_t *config = NULL;
@@ -3392,10 +4291,18 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
                        spa->spa_extreme_rewind = B_TRUE;
        }
 
-       load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
-           mosconfig);
+       load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
        if (load_error == 0)
                return (0);
+       if (load_error == ZFS_ERR_NO_CHECKPOINT) {
+               /*
+                * When attempting checkpoint-rewind on a pool with no
+                * checkpoint, we should not attempt to load uberblocks
+                * from previous txgs when spa_load fails.
+                */
+               ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+               return (load_error);
+       }
 
        if (spa->spa_root_vdev != NULL)
                config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
@@ -3434,7 +4341,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
            spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
                if (spa->spa_load_max_txg < safe_rewind_txg)
                        spa->spa_extreme_rewind = B_TRUE;
-               rewind_error = spa_load_retry(spa, state, mosconfig);
+               rewind_error = spa_load_retry(spa, state);
        }
 
        spa->spa_extreme_rewind = B_FALSE;
@@ -3503,22 +4410,24 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
        }
 
        if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
-               zpool_rewind_policy_t policy;
+               zpool_load_policy_t policy;
 
                firstopen = B_TRUE;
 
-               zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
+               zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
                    &policy);
-               if (policy.zrp_request & ZPOOL_DO_REWIND)
+               if (policy.zlp_rewind & ZPOOL_DO_REWIND)
                        state = SPA_LOAD_RECOVER;
 
                spa_activate(spa, spa_mode_global);
 
                if (state != SPA_LOAD_RECOVER)
                        spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+               spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
 
-               error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
-                   policy.zrp_request);
+               zfs_dbgmsg("spa_open_common: opening %s", pool);
+               error = spa_load_best(spa, state, policy.zlp_txg,
+                   policy.zlp_rewind);
 
                if (error == EBADF) {
                        /*
@@ -4066,7 +4975,7 @@ spa_create_check_encryption_params(dsl_crypto_params_t *dcp,
            !has_encryption)
                return (SET_ERROR(ENOTSUP));
 
-       return (dmu_objset_create_crypt_check(NULL, dcp));
+       return (dmu_objset_create_crypt_check(NULL, dcp, NULL));
 }
 
 /*
@@ -4093,7 +5002,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
        char *poolname;
        nvlist_t *nvl;
 
-       if (nvlist_lookup_string(props, "tname", &poolname) != 0)
+       if (props == NULL ||
+           nvlist_lookup_string(props, "tname", &poolname) != 0)
                poolname = (char *)pool;
 
        /*
@@ -4197,9 +5107,15 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
            (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
            (error = spa_validate_aux(spa, nvroot, txg,
            VDEV_ALLOC_ADD)) == 0) {
-               for (int c = 0; c < rvd->vdev_children; c++) {
-                       vdev_metaslab_set_size(rvd->vdev_child[c]);
-                       vdev_expand(rvd->vdev_child[c], txg);
+               /*
+                * Instantiate the metaslab groups (this will dirty the vdevs);
+                * we can no longer error exit past this point.
+                */
+               for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
+                       vdev_t *vd = rvd->vdev_child[c];
+
+                       vdev_metaslab_set_size(vd);
+                       vdev_expand(vd, txg);
                }
        }
 
@@ -4353,6 +5269,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
        if (dp->dp_root_dir->dd_crypto_obj != 0)
                VERIFY0(spa_keystore_remove_mapping(spa, root_dsobj, FTAG));
 
+       spa_spawn_aux_threads(spa);
+
        spa_write_cachefile(spa, B_FALSE, B_TRUE);
 
        /*
@@ -4360,7 +5278,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
         * and are making their way through the eviction process.
         */
        spa_evicting_os_wait(spa);
-       spa->spa_minref = refcount_count(&spa->spa_refcount);
+       spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
        spa->spa_load_state = SPA_LOAD_NONE;
 
        mutex_exit(&spa_namespace_lock);
@@ -4377,7 +5295,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
        spa_t *spa;
        char *altroot = NULL;
        spa_load_state_t state = SPA_LOAD_IMPORT;
-       zpool_rewind_policy_t policy;
+       zpool_load_policy_t policy;
        uint64_t mode = spa_mode_global;
        uint64_t readonly = B_FALSE;
        int error;
@@ -4416,7 +5334,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
 
                spa_write_cachefile(spa, B_FALSE, B_TRUE);
                spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
-
+               zfs_dbgmsg("spa_import: verbatim import of %s", pool);
                mutex_exit(&spa_namespace_lock);
                return (0);
        }
@@ -4428,20 +5346,20 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
         */
        spa_async_suspend(spa);
 
-       zpool_get_rewind_policy(config, &policy);
-       if (policy.zrp_request & ZPOOL_DO_REWIND)
+       zpool_get_load_policy(config, &policy);
+       if (policy.zlp_rewind & ZPOOL_DO_REWIND)
                state = SPA_LOAD_RECOVER;
 
-       /*
-        * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
-        * because the user-supplied config is actually the one to trust when
-        * doing an import.
-        */
-       if (state != SPA_LOAD_RECOVER)
-               spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+       spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
 
-       error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
-           policy.zrp_request);
+       if (state != SPA_LOAD_RECOVER) {
+               spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+               zfs_dbgmsg("spa_import: importing %s", pool);
+       } else {
+               zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
+                   "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg);
+       }
+       error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind);
 
        /*
         * Propagate anything learned while loading the pool and pass it
@@ -4555,10 +5473,11 @@ nvlist_t *
 spa_tryimport(nvlist_t *tryconfig)
 {
        nvlist_t *config = NULL;
-       char *poolname;
+       char *poolname, *cachefile;
        spa_t *spa;
        uint64_t state;
        int error;
+       zpool_load_policy_t policy;
 
        if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
                return (NULL);
@@ -4574,11 +5493,27 @@ spa_tryimport(nvlist_t *tryconfig)
        spa_activate(spa, FREAD);
 
        /*
-        * Pass off the heavy lifting to spa_load().
-        * Pass TRUE for mosconfig because the user-supplied config
-        * is actually the one to trust when doing an import.
+        * Rewind pool if a max txg was provided.
         */
-       error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
+       zpool_get_load_policy(spa->spa_config, &policy);
+       if (policy.zlp_txg != UINT64_MAX) {
+               spa->spa_load_max_txg = policy.zlp_txg;
+               spa->spa_extreme_rewind = B_TRUE;
+               zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
+                   poolname, (longlong_t)policy.zlp_txg);
+       } else {
+               zfs_dbgmsg("spa_tryimport: importing %s", poolname);
+       }
+
+       if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
+           == 0) {
+               zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
+               spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
+       } else {
+               spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
+       }
+
+       error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
 
        /*
         * If 'tryconfig' was at least parsable, return the current config.
@@ -4860,8 +5795,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
                for (int c = 0; c < vd->vdev_children; c++) {
                        tvd = vd->vdev_child[c];
                        if (spa->spa_vdev_removal != NULL &&
-                           tvd->vdev_ashift !=
-                           spa->spa_vdev_removal->svr_vdev->vdev_ashift) {
+                           tvd->vdev_ashift != spa->spa_max_ashift) {
                                return (spa_vdev_exit(spa, vd, txg, EINVAL));
                        }
                        /* Fail if top level vdev is raidz */
@@ -4970,11 +5904,16 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
 
        oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
-       if (spa->spa_vdev_removal != NULL ||
-           spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
-               return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+               error = (spa_has_checkpoint(spa)) ?
+                   ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+               return (spa_vdev_exit(spa, NULL, txg, error));
        }
 
+       if (spa->spa_vdev_removal != NULL)
+               return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
        if (oldvd == NULL)
                return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
@@ -5184,6 +6123,27 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 
        vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
+       /*
+        * Besides being called directly from the userland through the
+        * ioctl interface, spa_vdev_detach() can be potentially called
+        * at the end of spa_vdev_resilver_done().
+        *
+        * In the regular case, when we have a checkpoint this shouldn't
+        * happen as we never empty the DTLs of a vdev during the scrub
+        * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
+        * should never get here when we have a checkpoint.
+        *
+        * That said, even in a case when we checkpoint the pool exactly
+        * as spa_vdev_resilver_done() calls this function everything
+        * should be fine as the resilver will return right away.
+        */
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+               error = (spa_has_checkpoint(spa)) ?
+                   ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+               return (spa_vdev_exit(spa, NULL, txg, error));
+       }
+
        if (vd == NULL)
                return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
@@ -5422,6 +6382,13 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
 
        txg = spa_vdev_enter(spa);
 
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+               error = (spa_has_checkpoint(spa)) ?
+                   ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+               return (spa_vdev_exit(spa, NULL, txg, error));
+       }
+
        /* clear the log and flush everything up to now */
        activate_slog = spa_passivate_log(spa);
        (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
@@ -5517,7 +6484,8 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
                        break;
                }
 
-               if (vdev_dtl_required(vml[c])) {
+               if (vdev_dtl_required(vml[c]) ||
+                   vdev_resilver_needed(vml[c], NULL, NULL)) {
                        error = SET_ERROR(EBUSY);
                        break;
                }
@@ -5601,8 +6569,10 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
        spa_activate(newspa, spa_mode_global);
        spa_async_suspend(newspa);
 
+       newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
+
        /* create the new pool from the disks of the original pool */
-       error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
+       error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
        if (error)
                goto out;
 
@@ -5992,8 +6962,14 @@ spa_async_thread(void *arg)
 
                mutex_enter(&spa_namespace_lock);
                old_space = metaslab_class_get_space(spa_normal_class(spa));
+               old_space += metaslab_class_get_space(spa_special_class(spa));
+               old_space += metaslab_class_get_space(spa_dedup_class(spa));
+
                spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+
                new_space = metaslab_class_get_space(spa_normal_class(spa));
+               new_space += metaslab_class_get_space(spa_special_class(spa));
+               new_space += metaslab_class_get_space(spa_dedup_class(spa));
                mutex_exit(&spa_namespace_lock);
 
                /*
@@ -6062,12 +7038,19 @@ spa_async_suspend(spa_t *spa)
 {
        mutex_enter(&spa->spa_async_lock);
        spa->spa_async_suspended++;
-       while (spa->spa_async_thread != NULL ||
-           spa->spa_condense_thread != NULL)
+       while (spa->spa_async_thread != NULL)
                cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
        mutex_exit(&spa->spa_async_lock);
 
        spa_vdev_remove_suspend(spa);
+
+       zthr_t *condense_thread = spa->spa_condense_zthr;
+       if (condense_thread != NULL && zthr_isrunning(condense_thread))
+               VERIFY0(zthr_cancel(condense_thread));
+
+       zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
+       if (discard_thread != NULL && zthr_isrunning(discard_thread))
+               VERIFY0(zthr_cancel(discard_thread));
 }
 
 void
@@ -6078,6 +7061,14 @@ spa_async_resume(spa_t *spa)
        spa->spa_async_suspended--;
        mutex_exit(&spa->spa_async_lock);
        spa_restart_removal(spa);
+
+       zthr_t *condense_thread = spa->spa_condense_zthr;
+       if (condense_thread != NULL && !zthr_isrunning(condense_thread))
+               zthr_resume(condense_thread);
+
+       zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
+       if (discard_thread != NULL && !zthr_isrunning(discard_thread))
+               zthr_resume(discard_thread);
 }
 
 static boolean_t
@@ -6667,6 +7658,9 @@ spa_sync(spa_t *spa, uint64_t txg)
        dsl_pool_t *dp = spa->spa_dsl_pool;
        objset_t *mos = spa->spa_meta_objset;
        bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
+       metaslab_class_t *normal = spa_normal_class(spa);
+       metaslab_class_t *special = spa_special_class(spa);
+       metaslab_class_t *dedup = spa_dedup_class(spa);
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *vd;
        dmu_tx_t *tx;
@@ -6691,9 +7685,11 @@ spa_sync(spa_t *spa, uint64_t txg)
        spa->spa_syncing_txg = txg;
        spa->spa_sync_pass = 0;
 
-       mutex_enter(&spa->spa_alloc_lock);
-       VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
-       mutex_exit(&spa->spa_alloc_lock);
+       for (int i = 0; i < spa->spa_alloc_count; i++) {
+               mutex_enter(&spa->spa_alloc_locks[i]);
+               VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+               mutex_exit(&spa->spa_alloc_locks[i]);
+       }
 
        /*
         * If there are any pending vdev state changes, convert them
@@ -6754,13 +7750,17 @@ spa_sync(spa_t *spa, uint64_t txg)
         * The max queue depth will not change in the middle of syncing
         * out this txg.
         */
-       uint64_t queue_depth_total = 0;
+       uint64_t slots_per_allocator = 0;
        for (int c = 0; c < rvd->vdev_children; c++) {
                vdev_t *tvd = rvd->vdev_child[c];
                metaslab_group_t *mg = tvd->vdev_mg;
+               metaslab_class_t *mc;
+
+               if (mg == NULL || !metaslab_group_initialized(mg))
+                       continue;
 
-               if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
-                   !metaslab_group_initialized(mg))
+               mc = mg->mg_class;
+               if (mc != normal && mc != special && mc != dedup)
                        continue;
 
                /*
@@ -6768,17 +7768,29 @@ spa_sync(spa_t *spa, uint64_t txg)
                 * allocations look at mg_max_alloc_queue_depth, and async
                 * allocations all happen from spa_sync().
                 */
-               ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
+               for (int i = 0; i < spa->spa_alloc_count; i++)
+                       ASSERT0(zfs_refcount_count(
+                           &(mg->mg_alloc_queue_depth[i])));
                mg->mg_max_alloc_queue_depth = max_queue_depth;
-               queue_depth_total += mg->mg_max_alloc_queue_depth;
+
+               for (int i = 0; i < spa->spa_alloc_count; i++) {
+                       mg->mg_cur_max_alloc_queue_depth[i] =
+                           zfs_vdev_def_queue_depth;
+               }
+               slots_per_allocator += zfs_vdev_def_queue_depth;
        }
-       metaslab_class_t *mc = spa_normal_class(spa);
-       ASSERT0(refcount_count(&mc->mc_alloc_slots));
-       mc->mc_alloc_max_slots = queue_depth_total;
-       mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
 
-       ASSERT3U(mc->mc_alloc_max_slots, <=,
-           max_queue_depth * rvd->vdev_children);
+       for (int i = 0; i < spa->spa_alloc_count; i++) {
+               ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i]));
+               ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i]));
+               ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i]));
+               normal->mc_alloc_max_slots[i] = slots_per_allocator;
+               special->mc_alloc_max_slots[i] = slots_per_allocator;
+               dedup->mc_alloc_max_slots[i] = slots_per_allocator;
+       }
+       normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+       special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+       dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
 
        for (int c = 0; c < rvd->vdev_children; c++) {
                vdev_t *vd = rvd->vdev_child[c];
@@ -6853,6 +7865,8 @@ spa_sync(spa_t *spa, uint64_t txg)
                                    txg));
                                ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
                                ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
+                               ASSERT(txg_list_empty(&dp->dp_early_sync_tasks,
+                                   txg));
                                break;
                        }
                        spa_sync_deferred_frees(spa, tx);
@@ -6898,18 +7912,24 @@ spa_sync(spa_t *spa, uint64_t txg)
                spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
                if (list_is_empty(&spa->spa_config_dirty_list)) {
-                       vdev_t *svd[SPA_DVAS_PER_BP];
+                       vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
                        int svdcount = 0;
                        int children = rvd->vdev_children;
                        int c0 = spa_get_random(children);
 
                        for (int c = 0; c < children; c++) {
                                vd = rvd->vdev_child[(c0 + c) % children];
+
+                               /* Stop when revisiting the first vdev */
+                               if (c > 0 && svd[0] == vd)
+                                       break;
+
                                if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
                                    !vdev_is_concrete(vd))
                                        continue;
+
                                svd[svdcount++] = vd;
-                               if (svdcount == SPA_DVAS_PER_BP)
+                               if (svdcount == SPA_SYNC_MIN_VDEVS)
                                        break;
                        }
                        error = vdev_config_sync(svd, svdcount, txg);
@@ -6951,9 +7971,11 @@ spa_sync(spa_t *spa, uint64_t txg)
 
        dsl_pool_sync_done(dp, txg);
 
-       mutex_enter(&spa->spa_alloc_lock);
-       VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
-       mutex_exit(&spa->spa_alloc_lock);
+       for (int i = 0; i < spa->spa_alloc_count; i++) {
+               mutex_enter(&spa->spa_alloc_locks[i]);
+               VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+               mutex_exit(&spa->spa_alloc_locks[i]);
+       }
 
        /*
         * Update usable space statistics.
@@ -6971,6 +7993,9 @@ spa_sync(spa_t *spa, uint64_t txg)
        ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
        ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
 
+       while (zfs_pause_spa_sync)
+               delay(1);
+
        spa->spa_sync_pass = 0;
 
        /*
@@ -7183,7 +8208,7 @@ spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
        spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
 }
 
-#if defined(_KERNEL) && defined(HAVE_SPL)
+#if defined(_KERNEL)
 /* state manipulation functions */
 EXPORT_SYMBOL(spa_open);
 EXPORT_SYMBOL(spa_open_rewind);
@@ -7240,7 +8265,7 @@ EXPORT_SYMBOL(spa_prop_clear_bootfs);
 EXPORT_SYMBOL(spa_event_notify);
 #endif
 
-#if defined(_KERNEL) && defined(HAVE_SPL)
+#if defined(_KERNEL)
 module_param(spa_load_verify_maxinflight, int, 0644);
 MODULE_PARM_DESC(spa_load_verify_maxinflight,
        "Max concurrent traversal I/Os while verifying pool during import -X");
@@ -7253,9 +8278,20 @@ module_param(spa_load_verify_data, int, 0644);
 MODULE_PARM_DESC(spa_load_verify_data,
        "Set to traverse data on pool import");
 
+module_param(spa_load_print_vdev_tree, int, 0644);
+MODULE_PARM_DESC(spa_load_print_vdev_tree,
+       "Print vdev tree to zfs_dbgmsg during pool import");
+
 /* CSTYLED */
 module_param(zio_taskq_batch_pct, uint, 0444);
 MODULE_PARM_DESC(zio_taskq_batch_pct,
        "Percentage of CPUs to run an IO worker thread");
 
+/* BEGIN CSTYLED */
+module_param(zfs_max_missing_tvds, ulong, 0644);
+MODULE_PARM_DESC(zfs_max_missing_tvds,
+       "Allow importing pool with up to this number of missing top-level vdevs"
+       " (in read-only mode)");
+/* END CSTYLED */
+
 #endif