Get rid of space_map_update() for ms_synced_length
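This blobdiff spans the space_map_update() removal plus related zdb changes
visible in the hunks below (MOS leak checking, the -Y option, and a switch
from libzfs to libzutil for pool import). For orientation, here is a minimal
sketch of the calling convention the diff migrates zdb to; the helper itself
is hypothetical and not part of the patch, but the APIs are the ones the
hunks switch to: space_map_iterate() now takes an explicit length, and
accessors such as vdev_obsolete_sm_object() return an error code and report
their result through an out-parameter.

    /*
     * Illustrative sketch only.  space_map_update() is gone: the synced
     * length comes from space_map_length() and is passed to
     * space_map_iterate() explicitly.
     */
    static void
    example_dump_obsolete_sm(vdev_t *vd, sm_cb_t cb, void *arg)
    {
            objset_t *mos = vd->vdev_spa->spa_meta_objset;
            uint64_t obsolete_sm_object;
            space_map_t *sm = NULL;

            /* The accessor can now fail; callers must check it. */
            VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
            if (obsolete_sm_object == 0)
                    return;

            VERIFY0(space_map_open(&sm, mos, obsolete_sm_object, 0,
                vd->vdev_asize, 0));
            /* No space_map_update() step before iterating. */
            VERIFY0(space_map_iterate(sm, space_map_length(sm), cb, arg));
            space_map_close(sm);
    }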
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 21113da2f03cf666a12601ea529714c2d21abeda..3d175dacafb29acb04807b2122e6c14b10717a99 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
 #include <sys/abd.h>
 #include <sys/blkptr.h>
 #include <sys/dsl_crypt.h>
+#include <sys/dsl_scan.h>
 #include <zfs_comutil.h>
-#include <libzfs.h>
+
+#include <libnvpair.h>
+#include <libzutil.h>
 
 #include "zdb.h"
 
@@ -97,6 +100,7 @@ extern int zfs_recover;
 extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
 extern int zfs_vdev_async_read_max_active;
 extern boolean_t spa_load_verify_dryrun;
+extern int zfs_reconstruct_indirect_combinations_max;
 
 static const char cmdname[] = "zdb";
 uint8_t dump_opt[256];
@@ -105,11 +109,13 @@ typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
 
 uint64_t *zopt_object = NULL;
 static unsigned zopt_objects = 0;
-libzfs_handle_t *g_zfs;
 uint64_t max_inflight = 1000;
 static int leaked_objects = 0;
+static range_tree_t *mos_refd_objs;
 
 static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *);
+static void mos_obj_refd(uint64_t);
+static void mos_obj_refd_multiple(uint64_t);
 
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
@@ -210,6 +216,8 @@ usage(void)
            "dump all read blocks into specified directory\n");
        (void) fprintf(stderr, "        -X attempt extreme rewind (does not "
            "work with dataset)\n");
+       (void) fprintf(stderr, "        -Y attempt all reconstruction "
+           "combinations for split blocks\n");
        (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
            "to make only that option verbose\n");
        (void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
@@ -221,6 +229,7 @@ dump_debug_buffer(void)
 {
        if (dump_opt['G']) {
                (void) printf("\n");
+               (void) fflush(stdout);
                zfs_dbgmsg_print("zdb");
        }
 }
@@ -696,19 +705,20 @@ get_metaslab_refcount(vdev_t *vd)
 static int
 get_obsolete_refcount(vdev_t *vd)
 {
+       uint64_t obsolete_sm_object;
        int refcount = 0;
 
-       uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
-       if (vd->vdev_top == vd && obsolete_sm_obj != 0) {
+       VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+       if (vd->vdev_top == vd && obsolete_sm_object != 0) {
                dmu_object_info_t doi;
                VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
-                   obsolete_sm_obj, &doi));
+                   obsolete_sm_object, &doi));
                if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
                        refcount++;
                }
        } else {
                ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
-               ASSERT3U(obsolete_sm_obj, ==, 0);
+               ASSERT3U(obsolete_sm_object, ==, 0);
        }
        for (unsigned c = 0; c < vd->vdev_children; c++) {
                refcount += get_obsolete_refcount(vd->vdev_child[c]);
@@ -783,18 +793,21 @@ dump_spacemap(objset_t *os, space_map_t *sm)
                return;
 
        (void) printf("space map object %llu:\n",
-           (longlong_t)sm->sm_phys->smp_object);
-       (void) printf("  smp_objsize = 0x%llx\n",
-           (longlong_t)sm->sm_phys->smp_objsize);
+           (longlong_t)sm->sm_object);
+       (void) printf("  smp_length = 0x%llx\n",
+           (longlong_t)sm->sm_phys->smp_length);
        (void) printf("  smp_alloc = 0x%llx\n",
            (longlong_t)sm->sm_phys->smp_alloc);
 
+       if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
+               return;
+
        /*
         * Print out the freelist entries in both encoded and decoded form.
         */
        uint8_t mapshift = sm->sm_shift;
        int64_t alloc = 0;
-       uint64_t word;
+       uint64_t word, entry_id = 0;
        for (uint64_t offset = 0; offset < space_map_length(sm);
            offset += sizeof (word)) {
 
@@ -802,11 +815,12 @@ dump_spacemap(objset_t *os, space_map_t *sm)
                    sizeof (word), &word, DMU_READ_PREFETCH));
 
                if (sm_entry_is_debug(word)) {
-                       (void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
-                           (u_longlong_t)(offset / sizeof (word)),
+                       (void) printf("\t    [%6llu] %s: txg %llu pass %llu\n",
+                           (u_longlong_t)entry_id,
                            ddata[SM_DEBUG_ACTION_DECODE(word)],
                            (u_longlong_t)SM_DEBUG_TXG_DECODE(word),
                            (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
+                       entry_id++;
                        continue;
                }
 
@@ -844,7 +858,7 @@ dump_spacemap(objset_t *os, space_map_t *sm)
 
                (void) printf("\t    [%6llu]    %c  range:"
                    " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
-                   (u_longlong_t)(offset / sizeof (word)),
+                   (u_longlong_t)entry_id,
                    entry_type, (u_longlong_t)entry_off,
                    (u_longlong_t)(entry_off + entry_run),
                    (u_longlong_t)entry_run,
@@ -854,6 +868,7 @@ dump_spacemap(objset_t *os, space_map_t *sm)
                        alloc += entry_run;
                else
                        alloc -= entry_run;
+               entry_id++;
        }
        if ((uint64_t)alloc != space_map_allocated(sm)) {
                (void) printf("space_map_object alloc (%lld) INCONSISTENT "
@@ -900,11 +915,8 @@ dump_metaslab(metaslab_t *msp)
 
        if (dump_opt['m'] > 2 && !dump_opt['L']) {
                mutex_enter(&msp->ms_lock);
-               metaslab_load_wait(msp);
-               if (!msp->ms_loaded) {
-                       VERIFY0(metaslab_load(msp));
-                       range_tree_stat_verify(msp->ms_allocatable);
-               }
+               VERIFY0(metaslab_load(msp));
+               range_tree_stat_verify(msp->ms_allocatable);
                dump_metaslab_stats(msp);
                metaslab_unload(msp);
                mutex_exit(&msp->ms_lock);
@@ -922,11 +934,8 @@ dump_metaslab(metaslab_t *msp)
                    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
        }
 
-       if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
-               ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
-
-               dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
-       }
+       ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
+       dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
 }
 
 static void
@@ -1051,7 +1060,8 @@ print_vdev_indirect(vdev_t *vd)
        }
        (void) printf("\n");
 
-       uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
+       uint64_t obsolete_sm_object;
+       VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
        if (obsolete_sm_object != 0) {
                objset_t *mos = vd->vdev_spa->spa_meta_objset;
                (void) printf("obsolete space map object %llu:\n",
@@ -1590,6 +1600,8 @@ dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
        DO(CHILD_RSRV);
        DO(REFRSRV);
 #undef DO
+       (void) printf("\t\tclones = %llu\n",
+           (u_longlong_t)dd->dd_clones);
 }
 
 /*ARGSUSED*/
@@ -1772,6 +1784,33 @@ dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
        }
 }
 
+static void
+bpobj_count_refd(bpobj_t *bpo)
+{
+       mos_obj_refd(bpo->bpo_object);
+
+       if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
+               mos_obj_refd(bpo->bpo_phys->bpo_subobjs);
+               for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
+                       uint64_t subobj;
+                       bpobj_t subbpo;
+                       int error;
+                       VERIFY0(dmu_read(bpo->bpo_os,
+                           bpo->bpo_phys->bpo_subobjs,
+                           i * sizeof (subobj), sizeof (subobj), &subobj, 0));
+                       error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
+                       if (error != 0) {
+                               (void) printf("ERROR %u while trying to open "
+                                   "subobj id %llu\n",
+                                   error, (u_longlong_t)subobj);
+                               continue;
+                       }
+                       bpobj_count_refd(&subbpo);
+                       bpobj_close(&subbpo);
+               }
+       }
+}
+
 static void
 dump_deadlist(dsl_deadlist_t *dl)
 {
@@ -1780,6 +1819,23 @@ dump_deadlist(dsl_deadlist_t *dl)
        char bytes[32];
        char comp[32];
        char uncomp[32];
+       uint64_t empty_bpobj =
+           dmu_objset_spa(dl->dl_os)->spa_dsl_pool->dp_empty_bpobj;
+
+       /* force the tree to be loaded */
+       dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused);
+
+       if (dl->dl_oldfmt) {
+               if (dl->dl_bpobj.bpo_object != empty_bpobj)
+                       bpobj_count_refd(&dl->dl_bpobj);
+       } else {
+               mos_obj_refd(dl->dl_object);
+               for (dle = avl_first(&dl->dl_tree); dle;
+                   dle = AVL_NEXT(&dl->dl_tree, dle)) {
+                       if (dle->dle_bpobj.bpo_object != empty_bpobj)
+                               bpobj_count_refd(&dle->dle_bpobj);
+               }
+       }
 
        /* make sure nicenum has enough space */
        CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
@@ -1805,9 +1861,6 @@ dump_deadlist(dsl_deadlist_t *dl)
 
        (void) printf("\n");
 
-       /* force the tree to be loaded */
-       dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused);
-
        for (dle = avl_first(&dl->dl_tree); dle;
            dle = AVL_NEXT(&dl->dl_tree, dle)) {
                if (dump_opt['d'] >= 5) {
@@ -1822,7 +1875,6 @@ dump_deadlist(dsl_deadlist_t *dl)
                        (void) printf("mintxg %llu -> obj %llu\n",
                            (longlong_t)dle->dle_mintxg,
                            (longlong_t)dle->dle_bpobj.bpo_object);
-
                }
        }
 }
@@ -2320,6 +2372,36 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header,
                dnode_rele(dn, FTAG);
 }
 
+static void
+count_dir_mos_objects(dsl_dir_t *dd)
+{
+       mos_obj_refd(dd->dd_object);
+       mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj);
+       mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj);
+       mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj);
+       mos_obj_refd(dsl_dir_phys(dd)->dd_clones);
+
+       /*
+        * The dd_crypto_obj can be referenced by multiple dsl_dir's.
+        * Ignore the references after the first one.
+        */
+       mos_obj_refd_multiple(dd->dd_crypto_obj);
+}
+
+static void
+count_ds_mos_objects(dsl_dataset_t *ds)
+{
+       mos_obj_refd(ds->ds_object);
+       mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj);
+       mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj);
+       mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj);
+       mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj);
+
+       if (!dsl_dataset_is_snapshot(ds)) {
+               count_dir_mos_objects(ds->ds_dir);
+       }
+}
+
 static const char *objset_types[DMU_OST_NUMTYPES] = {
        "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
 
@@ -2375,10 +2457,11 @@ dump_dir(objset_t *os)
        dmu_objset_name(os, osname);
 
        (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
-           "%s, %llu objects%s\n",
+           "%s, %llu objects%s%s\n",
            osname, type, (u_longlong_t)dmu_objset_id(os),
            (u_longlong_t)dds.dds_creation_txg,
-           numbuf, (u_longlong_t)usedobjs, blkbuf);
+           numbuf, (u_longlong_t)usedobjs, blkbuf,
+           (dds.dds_inconsistent) ? " (inconsistent)" : "");
 
        if (zopt_objects != 0) {
                for (i = 0; i < zopt_objects; i++)
@@ -2399,6 +2482,7 @@ dump_dir(objset_t *os)
                        (void) printf("ds_remap_deadlist:\n");
                        dump_deadlist(&ds->ds_remap_deadlist);
                }
+               count_ds_mos_objects(ds);
        }
 
        if (verbosity < 2)
@@ -2440,15 +2524,15 @@ dump_dir(objset_t *os)
        (void) printf("\tPercent empty: %10lf\n",
            (double)(max_slot_used - total_slots_used)*100 /
            (double)max_slot_used);
-
-       ASSERT3U(object_count, ==, usedobjs);
-
        (void) printf("\n");
 
        if (error != ESRCH) {
                (void) fprintf(stderr, "dmu_object_next() = %d\n", error);
                abort();
        }
+
+       ASSERT3U(object_count, ==, usedobjs);
+
        if (leaked_objects != 0) {
                (void) printf("%d potentially leaked objects detected\n",
                    leaked_objects);
@@ -3140,7 +3224,7 @@ dump_one_dir(const char *dsname, void *arg)
                return (0);
 
        for (f = 0; f < SPA_FEATURES; f++) {
-               if (!dmu_objset_ds(os)->ds_feature_inuse[f])
+               if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f))
                        continue;
                ASSERT(spa_feature_table[f].fi_flags &
                    ZFEATURE_FLAG_PER_DATASET);
@@ -3517,6 +3601,9 @@ claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
 static void
 zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
 {
+       if (dump_opt['L'])
+               return;
+
        if (spa->spa_vdev_removal == NULL)
                return;
 
@@ -3595,9 +3682,11 @@ zdb_load_obsolete_counts(vdev_t *vd)
        spa_t *spa = vd->vdev_spa;
        spa_condensing_indirect_phys_t *scip =
            &spa->spa_condensing_indirect_phys;
+       uint64_t obsolete_sm_object;
        uint32_t *counts;
 
-       EQUIV(vdev_obsolete_sm_object(vd) != 0, vd->vdev_obsolete_sm != NULL);
+       VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+       EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL);
        counts = vdev_indirect_mapping_load_obsolete_counts(vim);
        if (vd->vdev_obsolete_sm != NULL) {
                vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
@@ -3608,7 +3697,6 @@ zdb_load_obsolete_counts(vdev_t *vd)
                space_map_t *prev_obsolete_sm = NULL;
                VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
                    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
-               space_map_update(prev_obsolete_sm);
                vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
                    prev_obsolete_sm);
                space_map_close(prev_obsolete_sm);
@@ -3624,6 +3712,8 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
        int error;
        int p;
 
+       ASSERT(!dump_opt['L']);
+
        bzero(&ddb, sizeof (ddb));
        while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
                blkptr_t blk;
@@ -3647,12 +3737,10 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
                                zcb->zcb_dedup_blocks++;
                        }
                }
-               if (!dump_opt['L']) {
-                       ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
-                       ddt_enter(ddt);
-                       VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
-                       ddt_exit(ddt);
-               }
+               ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
+               ddt_enter(ddt);
+               VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
+               ddt_exit(ddt);
        }
 
        ASSERT(error == ENOENT);
@@ -3744,9 +3832,9 @@ zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
 
        VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
            checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
-       space_map_update(checkpoint_sm);
 
        VERIFY0(space_map_iterate(checkpoint_sm,
+           space_map_length(checkpoint_sm),
            checkpoint_sm_exclude_entry_cb, &cseea));
        space_map_close(checkpoint_sm);
 
@@ -3756,6 +3844,8 @@ zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
 static void
 zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
 {
+       ASSERT(!dump_opt['L']);
+
        vdev_t *rvd = spa->spa_root_vdev;
        for (uint64_t c = 0; c < rvd->vdev_children; c++) {
                ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
@@ -3852,6 +3942,8 @@ load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
 static void
 zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
 {
+       ASSERT(!dump_opt['L']);
+
        vdev_t *rvd = spa->spa_root_vdev;
        for (uint64_t c = 0; c < rvd->vdev_children; c++) {
                vdev_t *vd = rvd->vdev_child[c];
@@ -3898,67 +3990,63 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {
        zcb->zcb_spa = spa;
 
-       if (!dump_opt['L']) {
-               dsl_pool_t *dp = spa->spa_dsl_pool;
-               vdev_t *rvd = spa->spa_root_vdev;
+       if (dump_opt['L'])
+               return;
+
+       dsl_pool_t *dp = spa->spa_dsl_pool;
+       vdev_t *rvd = spa->spa_root_vdev;
 
-               /*
-                * We are going to be changing the meaning of the metaslab's
-                * ms_allocatable.  Ensure that the allocator doesn't try to
-                * use the tree.
-                */
-               spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
-               spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
+       /*
+        * We are going to be changing the meaning of the metaslab's
+        * ms_allocatable.  Ensure that the allocator doesn't try to
+        * use the tree.
+        */
+       spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
+       spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
 
-               zcb->zcb_vd_obsolete_counts =
-                   umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
-                   UMEM_NOFAIL);
+       zcb->zcb_vd_obsolete_counts =
+           umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
+           UMEM_NOFAIL);
 
-               /*
-                * For leak detection, we overload the ms_allocatable trees
-                * to contain allocated segments instead of free segments.
-                * As a result, we can't use the normal metaslab_load/unload
-                * interfaces.
-                */
-               zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
-               load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
+       /*
+        * For leak detection, we overload the ms_allocatable trees
+        * to contain allocated segments instead of free segments.
+        * As a result, we can't use the normal metaslab_load/unload
+        * interfaces.
+        */
+       zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
+       load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
 
-               /*
-                * On load_concrete_ms_allocatable_trees() we loaded all the
-                * allocated entries from the ms_sm to the ms_allocatable for
-                * each metaslab. If the pool has a checkpoint or is in the
-                * middle of discarding a checkpoint, some of these blocks
-                * may have been freed but their ms_sm may not have been
-                * updated because they are referenced by the checkpoint. In
-                * order to avoid false-positives during leak-detection, we
-                * go through the vdev's checkpoint space map and exclude all
-                * its entries from their relevant ms_allocatable.
-                *
-                * We also aggregate the space held by the checkpoint and add
-                * it to zcb_checkpoint_size.
-                *
-                * Note that at this point we are also verifying that all the
-                * entries on the checkpoint_sm are marked as allocated in
-                * the ms_sm of their relevant metaslab.
-                * [see comment in checkpoint_sm_exclude_entry_cb()]
-                */
-               zdb_leak_init_exclude_checkpoint(spa, zcb);
+       /*
+        * On load_concrete_ms_allocatable_trees() we loaded all the
+        * allocated entries from the ms_sm to the ms_allocatable for
+        * each metaslab. If the pool has a checkpoint or is in the
+        * middle of discarding a checkpoint, some of these blocks
+        * may have been freed but their ms_sm may not have been
+        * updated because they are referenced by the checkpoint. In
+        * order to avoid false-positives during leak-detection, we
+        * go through the vdev's checkpoint space map and exclude all
+        * its entries from their relevant ms_allocatable.
+        *
+        * We also aggregate the space held by the checkpoint and add
+        * it to zcb_checkpoint_size.
+        *
+        * Note that at this point we are also verifying that all the
+        * entries on the checkpoint_sm are marked as allocated in
+        * the ms_sm of their relevant metaslab.
+        * [see comment in checkpoint_sm_exclude_entry_cb()]
+        */
+       zdb_leak_init_exclude_checkpoint(spa, zcb);
+       ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
 
-               /* for cleaner progress output */
-               (void) fprintf(stderr, "\n");
+       /* for cleaner progress output */
+       (void) fprintf(stderr, "\n");
 
-               if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
-                       ASSERT(spa_feature_is_enabled(spa,
-                           SPA_FEATURE_DEVICE_REMOVAL));
-                       (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
-                           increment_indirect_mapping_cb, zcb, NULL);
-               }
-       } else {
-               /*
-                * If leak tracing is disabled, we still need to consider
-                * any checkpointed space in our space verification.
-                */
-               zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa);
+       if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
+               ASSERT(spa_feature_is_enabled(spa,
+                   SPA_FEATURE_DEVICE_REMOVAL));
+               (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
+                   increment_indirect_mapping_cb, zcb, NULL);
        }
 
        spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
@@ -3972,6 +4060,7 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
        boolean_t leaks = B_FALSE;
        vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
        uint64_t total_leaked = 0;
+       boolean_t are_precise = B_FALSE;
 
        ASSERT(vim != NULL);
 
@@ -3999,9 +4088,9 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
                    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];
                ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,
                    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);
-               if (bytes_leaked != 0 &&
-                   (vdev_obsolete_counts_are_precise(vd) ||
-                   dump_opt['d'] >= 5)) {
+
+               VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+               if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) {
                        (void) printf("obsolete indirect mapping count "
                            "mismatch on %llu:%llx:%llx : %llx bytes leaked\n",
                            (u_longlong_t)vd->vdev_id,
@@ -4012,7 +4101,8 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
                total_leaked += ABS(bytes_leaked);
        }
 
-       if (!vdev_obsolete_counts_are_precise(vd) && total_leaked > 0) {
+       VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+       if (!are_precise && total_leaked > 0) {
                int pct_leaked = total_leaked * 100 /
                    vdev_indirect_mapping_bytes_mapped(vim);
                (void) printf("cannot verify obsolete indirect mapping "
@@ -4039,51 +4129,54 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
 static boolean_t
 zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
 {
+       if (dump_opt['L'])
+               return (B_FALSE);
+
        boolean_t leaks = B_FALSE;
-       if (!dump_opt['L']) {
-               vdev_t *rvd = spa->spa_root_vdev;
-               for (unsigned c = 0; c < rvd->vdev_children; c++) {
-                       vdev_t *vd = rvd->vdev_child[c];
-                       ASSERTV(metaslab_group_t *mg = vd->vdev_mg);
-
-                       if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
-                               leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
-                       }
+       vdev_t *rvd = spa->spa_root_vdev;
+       for (unsigned c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *vd = rvd->vdev_child[c];
+               ASSERTV(metaslab_group_t *mg = vd->vdev_mg);
+
+               if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
+                       leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
+               }
 
-                       for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
-                               metaslab_t *msp = vd->vdev_ms[m];
-                               ASSERT3P(mg, ==, msp->ms_group);
+               for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+                       metaslab_t *msp = vd->vdev_ms[m];
+                       ASSERT3P(mg, ==, msp->ms_group);
 
-                               /*
-                                * ms_allocatable has been overloaded
-                                * to contain allocated segments. Now that
-                                * we finished traversing all blocks, any
-                                * block that remains in the ms_allocatable
-                                * represents an allocated block that we
-                                * did not claim during the traversal.
-                                * Claimed blocks would have been removed
-                                * from the ms_allocatable.  For indirect
-                                * vdevs, space remaining in the tree
-                                * represents parts of the mapping that are
-                                * not referenced, which is not a bug.
-                                */
-                               if (vd->vdev_ops == &vdev_indirect_ops) {
-                                       range_tree_vacate(msp->ms_allocatable,
-                                           NULL, NULL);
-                               } else {
-                                       range_tree_vacate(msp->ms_allocatable,
-                                           zdb_leak, vd);
-                               }
+                       /*
+                        * ms_allocatable has been overloaded
+                        * to contain allocated segments. Now that
+                        * we finished traversing all blocks, any
+                        * block that remains in the ms_allocatable
+                        * represents an allocated block that we
+                        * did not claim during the traversal.
+                        * Claimed blocks would have been removed
+                        * from the ms_allocatable.  For indirect
+                        * vdevs, space remaining in the tree
+                        * represents parts of the mapping that are
+                        * not referenced, which is not a bug.
+                        */
+                       if (vd->vdev_ops == &vdev_indirect_ops) {
+                               range_tree_vacate(msp->ms_allocatable,
+                                   NULL, NULL);
+                       } else {
+                               range_tree_vacate(msp->ms_allocatable,
+                                   zdb_leak, vd);
+                       }
 
-                               if (msp->ms_loaded)
-                                       msp->ms_loaded = B_FALSE;
+                       if (msp->ms_loaded) {
+                               msp->ms_loaded = B_FALSE;
                        }
                }
-
-               umem_free(zcb->zcb_vd_obsolete_counts,
-                   rvd->vdev_children * sizeof (uint32_t *));
-               zcb->zcb_vd_obsolete_counts = NULL;
        }
+
+       umem_free(zcb->zcb_vd_obsolete_counts,
+           rvd->vdev_children * sizeof (uint32_t *));
+       zcb->zcb_vd_obsolete_counts = NULL;
+
        return (leaks);
 }
 
@@ -4124,12 +4217,16 @@ dump_block_stats(spa_t *spa)
            !dump_opt['L'] ? "nothing leaked " : "");
 
        /*
-        * Load all space maps as SM_ALLOC maps, then traverse the pool
-        * claiming each block we discover.  If the pool is perfectly
-        * consistent, the space maps will be empty when we're done.
-        * Anything left over is a leak; any block we can't claim (because
-        * it's not part of any space map) is a double allocation,
-        * reference to a freed block, or an unclaimed log block.
+        * When leak detection is enabled we load all space maps as SM_ALLOC
+        * maps, then traverse the pool claiming each block we discover. If
+        * the pool is perfectly consistent, the segment trees will be empty
+        * when we're done. Anything left over is a leak; any block we can't
+        * claim (because it's not part of any space map) is a double
+        * allocation, reference to a freed block, or an unclaimed log block.
+        *
+        * When leak detection is disabled (-L option) we still traverse the
+        * pool claiming each block we discover, but we skip opening any space
+        * maps.
         */
        bzero(&zcb, sizeof (zdb_cb_t));
        zdb_leak_init(spa, &zcb);
@@ -4210,11 +4307,10 @@ dump_block_stats(spa_t *spa)
        total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
            zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
 
-       if (total_found == total_alloc) {
-               if (!dump_opt['L'])
-                       (void) printf("\n\tNo leaks (block sum matches space"
-                           " maps exactly)\n");
-       } else {
+       if (total_found == total_alloc && !dump_opt['L']) {
+               (void) printf("\n\tNo leaks (block sum matches space"
+                   " maps exactly)\n");
+       } else if (!dump_opt['L']) {
                (void) printf("block traversal size %llu != alloc %llu "
                    "(%s %lld)\n",
                    (u_longlong_t)total_found,
@@ -4554,7 +4650,6 @@ verify_device_removal_feature_counts(spa_t *spa)
                            spa->spa_meta_objset,
                            scip->scip_prev_obsolete_sm_object,
                            0, vd->vdev_asize, 0));
-                       space_map_update(prev_obsolete_sm);
                        dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
                        (void) printf("\n");
                        space_map_close(prev_obsolete_sm);
@@ -4576,11 +4671,17 @@ verify_device_removal_feature_counts(spa_t *spa)
                                obsolete_counts_count++;
                        }
                }
-               if (vdev_obsolete_counts_are_precise(vd)) {
+
+               boolean_t are_precise;
+               VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+               if (are_precise) {
                        ASSERT(vic->vic_mapping_object != 0);
                        precise_vdev_count++;
                }
-               if (vdev_obsolete_sm_object(vd) != 0) {
+
+               uint64_t obsolete_sm_object;
+               VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+               if (obsolete_sm_object != 0) {
                        ASSERT(vic->vic_mapping_object != 0);
                        obsolete_sm_count++;
                }
@@ -4766,7 +4867,8 @@ verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
         * their respective ms_allocatable trees should not contain them.
         */
        mutex_enter(&ms->ms_lock);
-       range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
+       range_tree_verify_not_present(ms->ms_allocatable,
+           sme->sme_offset, sme->sme_run);
        mutex_exit(&ms->ms_lock);
 
        return (0);
@@ -4829,7 +4931,6 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
                VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
                    checkpoint_sm_obj, 0, current_vd->vdev_asize,
                    current_vd->vdev_ashift));
-               space_map_update(checkpoint_sm);
 
                verify_checkpoint_sm_entry_cb_arg_t vcsec;
                vcsec.vcsec_vd = ckpoint_vd;
@@ -4837,6 +4938,7 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
                vcsec.vcsec_num_entries =
                    space_map_length(checkpoint_sm) / sizeof (uint64_t);
                VERIFY0(space_map_iterate(checkpoint_sm,
+                   space_map_length(checkpoint_sm),
                    verify_checkpoint_sm_entry_cb, &vcsec));
                if (dump_opt['m'] > 3)
                        dump_spacemap(current->spa_meta_objset, checkpoint_sm);
@@ -4917,7 +5019,7 @@ verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
                         * are part of the checkpoint were freed by mistake.
                         */
                        range_tree_walk(ckpoint_msp->ms_allocatable,
-                           (range_tree_func_t *)range_tree_verify,
+                           (range_tree_func_t *)range_tree_verify_not_present,
                            current_msp->ms_allocatable);
                }
        }
@@ -4929,6 +5031,8 @@ verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
 static void
 verify_checkpoint_blocks(spa_t *spa)
 {
+       ASSERT(!dump_opt['L']);
+
        spa_t *checkpoint_spa;
        char *checkpoint_pool;
        nvlist_t *config = NULL;
@@ -4994,7 +5098,6 @@ dump_leftover_checkpoint_blocks(spa_t *spa)
 
                VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
                    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
-               space_map_update(checkpoint_sm);
                dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
                space_map_close(checkpoint_sm);
        }
@@ -5043,6 +5146,170 @@ verify_checkpoint(spa_t *spa)
        return (error);
 }
 
+/* ARGSUSED */
+static void
+mos_leaks_cb(void *arg, uint64_t start, uint64_t size)
+{
+       for (uint64_t i = start; i < start + size; i++) {
+               (void) printf("MOS object %llu referenced but not allocated\n",
+                   (u_longlong_t)i);
+       }
+}
+
+static void
+mos_obj_refd(uint64_t obj)
+{
+       if (obj != 0 && mos_refd_objs != NULL)
+               range_tree_add(mos_refd_objs, obj, 1);
+}
+
+/*
+ * Call on a MOS object that may already have been referenced.
+ */
+static void
+mos_obj_refd_multiple(uint64_t obj)
+{
+       if (obj != 0 && mos_refd_objs != NULL &&
+           !range_tree_contains(mos_refd_objs, obj, 1))
+               range_tree_add(mos_refd_objs, obj, 1);
+}
+
+static void
+mos_leak_vdev(vdev_t *vd)
+{
+       mos_obj_refd(vd->vdev_dtl_object);
+       mos_obj_refd(vd->vdev_ms_array);
+       mos_obj_refd(vd->vdev_top_zap);
+       mos_obj_refd(vd->vdev_indirect_config.vic_births_object);
+       mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object);
+       mos_obj_refd(vd->vdev_leaf_zap);
+       if (vd->vdev_checkpoint_sm != NULL)
+               mos_obj_refd(vd->vdev_checkpoint_sm->sm_object);
+       if (vd->vdev_indirect_mapping != NULL) {
+               mos_obj_refd(vd->vdev_indirect_mapping->
+                   vim_phys->vimp_counts_object);
+       }
+       if (vd->vdev_obsolete_sm != NULL)
+               mos_obj_refd(vd->vdev_obsolete_sm->sm_object);
+
+       for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+               metaslab_t *ms = vd->vdev_ms[m];
+               mos_obj_refd(space_map_object(ms->ms_sm));
+       }
+
+       for (uint64_t c = 0; c < vd->vdev_children; c++) {
+               mos_leak_vdev(vd->vdev_child[c]);
+       }
+}
+
+static int
+dump_mos_leaks(spa_t *spa)
+{
+       int rv = 0;
+       objset_t *mos = spa->spa_meta_objset;
+       dsl_pool_t *dp = spa->spa_dsl_pool;
+
+       /* Visit and mark all referenced objects in the MOS */
+
+       mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT);
+       mos_obj_refd(spa->spa_pool_props_object);
+       mos_obj_refd(spa->spa_config_object);
+       mos_obj_refd(spa->spa_ddt_stat_object);
+       mos_obj_refd(spa->spa_feat_desc_obj);
+       mos_obj_refd(spa->spa_feat_enabled_txg_obj);
+       mos_obj_refd(spa->spa_feat_for_read_obj);
+       mos_obj_refd(spa->spa_feat_for_write_obj);
+       mos_obj_refd(spa->spa_history);
+       mos_obj_refd(spa->spa_errlog_last);
+       mos_obj_refd(spa->spa_errlog_scrub);
+       mos_obj_refd(spa->spa_all_vdev_zaps);
+       mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj);
+       mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj);
+       mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj);
+       bpobj_count_refd(&spa->spa_deferred_bpobj);
+       mos_obj_refd(dp->dp_empty_bpobj);
+       bpobj_count_refd(&dp->dp_obsolete_bpobj);
+       bpobj_count_refd(&dp->dp_free_bpobj);
+       mos_obj_refd(spa->spa_l2cache.sav_object);
+       mos_obj_refd(spa->spa_spares.sav_object);
+
+       mos_obj_refd(spa->spa_condensing_indirect_phys.
+           scip_next_mapping_object);
+       mos_obj_refd(spa->spa_condensing_indirect_phys.
+           scip_prev_obsolete_sm_object);
+       if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) {
+               vdev_indirect_mapping_t *vim =
+                   vdev_indirect_mapping_open(mos,
+                   spa->spa_condensing_indirect_phys.scip_next_mapping_object);
+               mos_obj_refd(vim->vim_phys->vimp_counts_object);
+               vdev_indirect_mapping_close(vim);
+       }
+
+       if (dp->dp_origin_snap != NULL) {
+               dsl_dataset_t *ds;
+
+               dsl_pool_config_enter(dp, FTAG);
+               VERIFY0(dsl_dataset_hold_obj(dp,
+                   dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj,
+                   FTAG, &ds));
+               count_ds_mos_objects(ds);
+               dump_deadlist(&ds->ds_deadlist);
+               dsl_dataset_rele(ds, FTAG);
+               dsl_pool_config_exit(dp, FTAG);
+
+               count_ds_mos_objects(dp->dp_origin_snap);
+               dump_deadlist(&dp->dp_origin_snap->ds_deadlist);
+       }
+       count_dir_mos_objects(dp->dp_mos_dir);
+       if (dp->dp_free_dir != NULL)
+               count_dir_mos_objects(dp->dp_free_dir);
+       if (dp->dp_leak_dir != NULL)
+               count_dir_mos_objects(dp->dp_leak_dir);
+
+       mos_leak_vdev(spa->spa_root_vdev);
+
+       for (uint64_t class = 0; class < DDT_CLASSES; class++) {
+               for (uint64_t type = 0; type < DDT_TYPES; type++) {
+                       for (uint64_t cksum = 0;
+                           cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
+                               ddt_t *ddt = spa->spa_ddt[cksum];
+                               mos_obj_refd(ddt->ddt_object[type][class]);
+                       }
+               }
+       }
+
+       /*
+        * Visit all allocated objects and make sure they are referenced.
+        */
+       uint64_t object = 0;
+       while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) {
+               if (range_tree_contains(mos_refd_objs, object, 1)) {
+                       range_tree_remove(mos_refd_objs, object, 1);
+               } else {
+                       dmu_object_info_t doi;
+                       const char *name;
+                       dmu_object_info(mos, object, &doi);
+                       if (doi.doi_type & DMU_OT_NEWTYPE) {
+                               dmu_object_byteswap_t bswap =
+                                   DMU_OT_BYTESWAP(doi.doi_type);
+                               name = dmu_ot_byteswap[bswap].ob_name;
+                       } else {
+                               name = dmu_ot[doi.doi_type].ot_name;
+                       }
+
+                       (void) printf("MOS object %llu (%s) leaked\n",
+                           (u_longlong_t)object, name);
+                       rv = 2;
+               }
+       }
+       (void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL);
+       if (!range_tree_is_empty(mos_refd_objs))
+               rv = 2;
+       range_tree_vacate(mos_refd_objs, NULL, NULL);
+       range_tree_destroy(mos_refd_objs);
+       return (rv);
+}
+
 static void
 dump_zpool(spa_t *spa)
 {
@@ -5075,8 +5342,9 @@ dump_zpool(spa_t *spa)
 
        if (dump_opt['d'] || dump_opt['i']) {
                spa_feature_t f;
-
+               mos_refd_objs = range_tree_create(NULL, NULL);
                dump_dir(dp->dp_meta_objset);
+
                if (dump_opt['d'] >= 3) {
                        dsl_pool_t *dp = spa->spa_dsl_pool;
                        dump_full_bpobj(&spa->spa_deferred_bpobj,
@@ -5103,6 +5371,9 @@ dump_zpool(spa_t *spa)
                (void) dmu_objset_find(spa_name(spa), dump_one_dir,
                    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
 
+               if (rc == 0 && !dump_opt['L'])
+                       rc = dump_mos_leaks(spa);
+
                for (f = 0; f < SPA_FEATURES; f++) {
                        uint64_t refcount;
 
@@ -5134,6 +5405,7 @@ dump_zpool(spa_t *spa)
                        rc = verify_device_removal_feature_counts(spa);
                }
        }
+
        if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
                rc = dump_block_stats(spa);
 
@@ -5610,7 +5882,7 @@ main(int argc, char **argv)
                spa_config_path = spa_config_path_env;
 
        while ((c = getopt(argc, argv,
-           "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
+           "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XY")) != -1) {
                switch (c) {
                case 'b':
                case 'c':
@@ -5642,6 +5914,10 @@ main(int argc, char **argv)
                case 'X':
                        dump_opt[c]++;
                        break;
+               case 'Y':
+                       zfs_reconstruct_indirect_combinations_max = INT_MAX;
+                       zfs_deadman_enabled = 0;
+                       break;
                /* NB: Sort single match options below. */
                case 'I':
                        max_inflight = strtoull(optarg, NULL, 0);
@@ -5736,10 +6012,6 @@ main(int argc, char **argv)
        spa_load_verify_dryrun = B_TRUE;
 
        kernel_init(FREAD);
-       if ((g_zfs = libzfs_init()) == NULL) {
-               (void) fprintf(stderr, "%s", libzfs_error_init(errno));
-               return (1);
-       }
 
        if (dump_all)
                verbose = MAX(verbose, 1);
@@ -5797,17 +6069,6 @@ main(int argc, char **argv)
        error = 0;
        target = argv[0];
 
-       char *checkpoint_pool = NULL;
-       char *checkpoint_target = NULL;
-       if (dump_opt['k']) {
-               checkpoint_pool = import_checkpointed_state(target, cfg,
-                   &checkpoint_target);
-
-               if (checkpoint_target != NULL)
-                       target = checkpoint_target;
-
-       }
-
        if (strpbrk(target, "/@") != NULL) {
                size_t targetlen;
 
@@ -5829,7 +6090,8 @@ main(int argc, char **argv)
                args.path = searchdirs;
                args.can_be_active = B_TRUE;
 
-               error = zpool_tryimport(g_zfs, target_pool, &cfg, &args);
+               error = zpool_find_config(NULL, target_pool, &cfg, &args,
+                   &libzpool_config_ops);
 
                if (error == 0) {
 
@@ -5853,6 +6115,24 @@ main(int argc, char **argv)
                }
        }
 
+       /*
+        * import_checkpointed_state makes the assumption that the
+        * target pool that we pass it is already part of the spa
+        * namespace. Because of that we need to make sure to always
+        * call it after the -e option has been processed, which
+        * imports the pool to the namespace if it's not in the
+        * cachefile.
+        */
+       char *checkpoint_pool = NULL;
+       char *checkpoint_target = NULL;
+       if (dump_opt['k']) {
+               checkpoint_pool = import_checkpointed_state(target, cfg,
+                   &checkpoint_target);
+
+               if (checkpoint_target != NULL)
+                       target = checkpoint_target;
+       }
+
        if (target_pool != target)
                free(target_pool);
 
@@ -5961,7 +6241,6 @@ main(int argc, char **argv)
 
        dump_debug_buffer();
 
-       libzfs_fini(g_zfs);
        kernel_fini();
 
        return (error);