+ return (0);
+}
+
+typedef struct zdb_ddt_entry { /* per-key totals for the simulated DDT */
+ ddt_key_t zdde_key; /* search key, set via ddt_key_fill() */
+ uint64_t zdde_ref_blocks; /* count of bps seen with this key */
+ uint64_t zdde_ref_lsize; /* running sum of BP_GET_LSIZE() */
+ uint64_t zdde_ref_psize; /* running sum of BP_GET_PSIZE() */
+ uint64_t zdde_ref_dsize; /* running sum of bp_get_dsize_sync() */
+ avl_node_t zdde_node; /* AVL linkage (offset given to avl_create) */
+} zdb_ddt_entry_t;
+
+/*
+ * Block-pointer callback handed to traverse_pool() by dump_simulated_ddt().
+ *
+ * "arg" is the AVL tree of zdb_ddt_entry_t being built.  Each block pointer
+ * that survives the filters below is looked up by its ddt key; a missing
+ * entry is allocated and inserted, and the entry's reference totals are
+ * then bumped by this block's sizes.  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	avl_tree_t *tree = arg;
+	zdb_ddt_entry_t lookup;
+	zdb_ddt_entry_t *entry;
+	avl_index_t ipoint;
+
+	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+		return (0);
+
+	/* Progress report, emitted once per root-level bookmark. */
+	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
+		(void) printf("traversing objset %llu, %llu objects, "
+		    "%lu blocks so far\n",
+		    (u_longlong_t)zb->zb_objset,
+		    (u_longlong_t)BP_GET_FILL(bp),
+		    avl_numnodes(tree));
+	}
+
+	/*
+	 * Holes were already rejected by the first guard, so only the
+	 * remaining exclusions need to be tested here.
+	 */
+	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF)
+		return (0);
+	if (BP_GET_LEVEL(bp) > 0)
+		return (0);
+	if (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
+		return (0);
+
+	ddt_key_fill(&lookup.zdde_key, bp);
+
+	entry = avl_find(tree, &lookup, &ipoint);
+	if (entry == NULL) {
+		entry = umem_zalloc(sizeof (zdb_ddt_entry_t), UMEM_NOFAIL);
+		entry->zdde_key = lookup.zdde_key;
+		avl_insert(tree, entry, ipoint);
+	}
+
+	entry->zdde_ref_blocks++;
+	entry->zdde_ref_lsize += BP_GET_LSIZE(bp);
+	entry->zdde_ref_psize += BP_GET_PSIZE(bp);
+	entry->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
+
+	return (0);
+}
+
+/*
+ * Build and print a simulated DDT histogram for the pool.
+ *
+ * The pool is traversed with zdb_ddt_add_cb(), which groups qualifying
+ * blocks by ddt key in a private AVL tree.  The tree is then torn down
+ * entry by entry; each entry is converted to a ddt_stat_t and added to the
+ * histogram bucket selected by highbit64() of its reference count.
+ * Finally the totals, the histogram and the dedup ratio are printed.
+ */
+static void
+dump_simulated_ddt(spa_t *spa)
+{
+	avl_tree_t tree;
+	void *cookie = NULL;
+	zdb_ddt_entry_t *entry;
+	ddt_histogram_t ddh_total;
+	ddt_stat_t dds_total;
+
+	avl_create(&tree, ddt_entry_compare,
+	    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
+	bzero(&ddh_total, sizeof (ddh_total));
+	bzero(&dds_total, sizeof (dds_total));
+
+	/* The config lock is held as reader across the whole traversal. */
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
+	    TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &tree);
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+	for (entry = avl_destroy_nodes(&tree, &cookie); entry != NULL;
+	    entry = avl_destroy_nodes(&tree, &cookie)) {
+		uint64_t refcnt = entry->zdde_ref_blocks;
+		ddt_stat_t dds;
+
+		ASSERT(refcnt != 0);
+
+		/* Referenced totals are copied through unchanged ... */
+		dds.dds_ref_blocks = entry->zdde_ref_blocks;
+		dds.dds_ref_lsize = entry->zdde_ref_lsize;
+		dds.dds_ref_psize = entry->zdde_ref_psize;
+		dds.dds_ref_dsize = entry->zdde_ref_dsize;
+
+		/* ... and the per-copy figures are the totals / refcnt. */
+		dds.dds_blocks = entry->zdde_ref_blocks / refcnt;
+		dds.dds_lsize = entry->zdde_ref_lsize / refcnt;
+		dds.dds_psize = entry->zdde_ref_psize / refcnt;
+		dds.dds_dsize = entry->zdde_ref_dsize / refcnt;
+
+		ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
+		    &dds, 0);
+
+		umem_free(entry, sizeof (*entry));
+	}
+
+	avl_destroy(&tree);
+
+	ddt_histogram_stat(&dds_total, &ddh_total);
+
+	(void) printf("Simulated DDT histogram:\n");
+	zpool_dump_ddt(&dds_total, &ddh_total);
+	dump_dedup_ratio(&dds_total);
+}
+
+/*
+ * Compare the on-disk refcounts of the device_removal and obsolete_counts
+ * features with the numbers derived from the pool's in-core state.
+ *
+ * The device_removal refcount is compared with the number of top-level
+ * vdevs that have a mapping object.  The obsolete_counts refcount is
+ * compared with the sum of: vdevs with precise obsolete counts, vdevs with
+ * an obsolete space map object, mappings that have counts, 2 if an indirect
+ * vdev is being condensed, 1 if the pool directory contains the obsolete
+ * bpobj entry, and remap_deadlist_count.
+ *
+ * A line is printed for each feature saying whether it verified.  Returns
+ * 0 if both refcounts match and 1 otherwise.
+ */
+static int
+verify_device_removal_feature_counts(spa_t *spa)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+	spa_condensing_indirect_phys_t *scip =
+	    &spa->spa_condensing_indirect_phys;
+	uint64_t dr_feature_refcount = 0;
+	uint64_t oc_feature_refcount = 0;
+	uint64_t indirect_vdev_count = 0;
+	uint64_t precise_vdev_count = 0;
+	uint64_t obsolete_counts_object_count;
+	uint64_t obsolete_sm_count = 0;
+	uint64_t obsolete_counts_count = 0;
+	uint64_t scip_count = 0;
+	uint64_t obsolete_bpobj_count = 0;
+	int ret = 0;
+
+	/* An in-progress condense contributes two references. */
+	if (scip->scip_next_mapping_object != 0) {
+		vdev_t *vd = rvd->vdev_child[scip->scip_vdev];
+
+		ASSERT(scip->scip_prev_obsolete_sm_object != 0);
+		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+
+		(void) printf("Condensing indirect vdev %llu: new mapping "
+		    "object %llu, prev obsolete sm %llu\n",
+		    (u_longlong_t)scip->scip_vdev,
+		    (u_longlong_t)scip->scip_next_mapping_object,
+		    (u_longlong_t)scip->scip_prev_obsolete_sm_object);
+
+		if (scip->scip_prev_obsolete_sm_object != 0) {
+			space_map_t *prev_obsolete_sm = NULL;
+
+			VERIFY0(space_map_open(&prev_obsolete_sm,
+			    spa->spa_meta_objset,
+			    scip->scip_prev_obsolete_sm_object,
+			    0, vd->vdev_asize, 0));
+			dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
+			(void) printf("\n");
+			space_map_close(prev_obsolete_sm);
+		}
+
+		scip_count += 2;
+	}
+
+	/* Tally the per-vdev contributions. */
+	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *vd = rvd->vdev_child[c];
+		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+		boolean_t are_precise;
+		uint64_t obsolete_sm_object;
+
+		if (vic->vic_mapping_object != 0) {
+			ASSERT(vd->vdev_ops == &vdev_indirect_ops ||
+			    vd->vdev_removing);
+			indirect_vdev_count++;
+
+			if (vd->vdev_indirect_mapping->vim_havecounts)
+				obsolete_counts_count++;
+		}
+
+		VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+		if (are_precise) {
+			ASSERT(vic->vic_mapping_object != 0);
+			precise_vdev_count++;
+		}
+
+		VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+		if (obsolete_sm_object != 0) {
+			ASSERT(vic->vic_mapping_object != 0);
+			obsolete_sm_count++;
+		}
+	}
+
+	/*
+	 * The return values are deliberately ignored; on failure the
+	 * refcounts keep their initial value of 0.
+	 */
+	(void) feature_get_refcount(spa,
+	    &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL],
+	    &dr_feature_refcount);
+	(void) feature_get_refcount(spa,
+	    &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS],
+	    &oc_feature_refcount);
+
+	if (dr_feature_refcount == indirect_vdev_count) {
+		(void) printf("Verified device_removal feature refcount "
+		    "of %llu is correct\n",
+		    (u_longlong_t)dr_feature_refcount);
+	} else {
+		ret = 1;
+		(void) printf("Number of indirect vdevs (%llu) "
+		    "does not match feature count (%llu)\n",
+		    (u_longlong_t)indirect_vdev_count,
+		    (u_longlong_t)dr_feature_refcount);
+	}
+
+	if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_OBSOLETE_BPOBJ) == 0)
+		obsolete_bpobj_count++;
+
+	obsolete_counts_object_count = precise_vdev_count +
+	    obsolete_sm_count + obsolete_counts_count + scip_count +
+	    obsolete_bpobj_count + remap_deadlist_count;
+
+	if (oc_feature_refcount == obsolete_counts_object_count) {
+		(void) printf("Verified indirect_refcount feature refcount "
+		    "of %llu is correct\n",
+		    (u_longlong_t)oc_feature_refcount);
+	} else {
+		ret = 1;
+		(void) printf("Number of obsolete counts objects (%llu) "
+		    "does not match feature count (%llu)\n",
+		    (u_longlong_t)obsolete_counts_object_count,
+		    (u_longlong_t)oc_feature_refcount);
+		/* Break the expected total down by contributor. */
+		(void) printf("pv:%llu os:%llu oc:%llu sc:%llu "
+		    "ob:%llu rd:%llu\n",
+		    (u_longlong_t)precise_vdev_count,
+		    (u_longlong_t)obsolete_sm_count,
+		    (u_longlong_t)obsolete_counts_count,
+		    (u_longlong_t)scip_count,
+		    (u_longlong_t)obsolete_bpobj_count,
+		    (u_longlong_t)remap_deadlist_count);
+	}
+
+	return (ret);
+}
+
+/*
+ * Set ZFS_IMPORT_SKIP_MMP in the import flags of the pool named "target",
+ * if spa_lookup() finds such a pool, so that the activity check is
+ * disabled and active pools can be examined.  The lookup and the update
+ * are both done under spa_namespace_lock.
+ */
+static void
+zdb_set_skip_mmp(char *target)
+{
+	spa_t *spa;
+
+	mutex_enter(&spa_namespace_lock);
+	spa = spa_lookup(target);
+	if (spa != NULL)
+		spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP;
+	mutex_exit(&spa_namespace_lock);
+}
+
+#define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
+/*
+ * Import the checkpointed state of the pool specified by the target
+ * parameter as readonly. The function also accepts a pool config
+ * as an optional parameter, else it attempts to infer the config by
+ * the name of the target pool.
+ *
+ * Note that the checkpointed state's pool name will be the name of
+ * the original pool with the above suffix appended to it. In addition,
+ * if the target is not a pool name (e.g. a path to a dataset) then
+ * the new_path parameter is populated with the updated path to
+ * reflect the fact that we are looking into the checkpointed state.
+ *
+ * The function returns a newly-allocated copy of the name of the
+ * pool containing the checkpointed state. When this copy is no
+ * longer needed it should be freed with free(3C). Same thing
+ * applies to the new_path parameter if allocated.
+ *
+ * NULL is returned if one of the name strings cannot be allocated;
+ * in that case nothing allocated by this function is left behind.
+ * Failure to read the config or to import the pool is fatal.
+ *
+ * A config passed in by the caller is modified (its pool name is
+ * replaced) but remains owned by the caller. A config fetched here
+ * with spa_get_stats() is released before returning.
+ */
+static char *
+import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
+{
+	int error = 0;
+	char *poolname, *bogus_name = NULL;
+	boolean_t free_cfg = B_FALSE;
+
+	/* If the target is not a pool, then extract the pool name */
+	char *path_start = strchr(target, '/');
+	if (path_start != NULL) {
+		size_t poolname_len = path_start - target;
+		poolname = strndup(target, poolname_len);
+		if (poolname == NULL)
+			return (NULL);
+	} else {
+		poolname = target;
+	}
+
+	if (cfg == NULL) {
+		zdb_set_skip_mmp(poolname);
+		error = spa_get_stats(poolname, &cfg, NULL, 0);
+		if (error != 0) {
+			fatal("Tried to read config of pool \"%s\" but "
+			    "spa_get_stats() failed with error %d\n",
+			    poolname, error);
+		}
+		/* We own this config, so we must release it below. */
+		free_cfg = B_TRUE;
+	}
+
+	if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) {
+		/* The pointer's value is unspecified after a failure. */
+		bogus_name = NULL;
+		goto out;
+	}
+	fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
+
+	error = spa_import(bogus_name, cfg, NULL,
+	    ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT |
+	    ZFS_IMPORT_SKIP_MMP);
+	if (error != 0) {
+		fatal("Tried to import pool \"%s\" but spa_import() failed "
+		    "with error %d\n", bogus_name, error);
+	}
+
+	if (new_path != NULL && path_start != NULL) {
+		if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
+			/* Don't hand back a name without its path. */
+			free(bogus_name);
+			bogus_name = NULL;
+		}
+	}
+
+out:
+	if (free_cfg)
+		nvlist_free(cfg);
+	/* poolname is a private copy only when it was split off target. */
+	if (target != poolname)
+		free(poolname);
+
+	return (bogus_name);
+}
+
+typedef struct verify_checkpoint_sm_entry_cb_arg { /* callback state */
+ vdev_t *vcsec_vd; /* vdev whose vdev_ms[] the callback indexes */
+
+ /* the following fields are only used for printing progress */
+ uint64_t vcsec_entryid; /* incremented on every callback call */
+ uint64_t vcsec_num_entries; /* total shown in the progress line */
+} verify_checkpoint_sm_entry_cb_arg_t;
+
+#define ENTRIES_PER_PROGRESS_UPDATE 10000
+
+/*
+ * space_map_iterate() callback used by verify_checkpoint_vdev_spacemaps().
+ *
+ * "arg" is a verify_checkpoint_sm_entry_cb_arg_t.  For the given space map
+ * entry, check that it lies entirely within one metaslab of vcsec_vd and
+ * that its range is absent from that metaslab's ms_allocatable tree.
+ * A progress line is written to stderr every ENTRIES_PER_PROGRESS_UPDATE
+ * entries.  Always returns 0; any violation trips a VERIFY.
+ */
+static int
+verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
+{
+	verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
+	vdev_t *vd = vcsec->vcsec_vd;
+	uint64_t ms_idx = sme->sme_offset >> vd->vdev_ms_shift;
+	uint64_t end = sme->sme_offset + sme->sme_run;
+	metaslab_t *ms;
+
+	ASSERT(sme->sme_type == SM_FREE);
+
+	/*
+	 * The offset comes from an on-disk space map, so make sure it
+	 * selects an existing metaslab before indexing vdev_ms[].
+	 */
+	VERIFY3U(ms_idx, <, vd->vdev_ms_count);
+	ms = vd->vdev_ms[ms_idx];
+
+	if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
+		/* %llu takes an unsigned argument, hence u_longlong_t. */
+		(void) fprintf(stderr,
+		    "\rverifying vdev %llu, space map entry %llu of %llu ...",
+		    (u_longlong_t)vd->vdev_id,
+		    (u_longlong_t)vcsec->vcsec_entryid,
+		    (u_longlong_t)vcsec->vcsec_num_entries);
+	}
+	vcsec->vcsec_entryid++;
+
+	/*
+	 * See comment in checkpoint_sm_exclude_entry_cb()
+	 */
+	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
+	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+	/*
+	 * The entries in the vdev_checkpoint_sm should be marked as
+	 * allocated in the checkpointed state of the pool, therefore
+	 * their respective ms_allocatable trees should not contain them.
+	 */
+	mutex_enter(&ms->ms_lock);
+	range_tree_verify_not_present(ms->ms_allocatable,
+	    sme->sme_offset, sme->sme_run);
+	mutex_exit(&ms->ms_lock);
+
+	return (0);
+}
+
+/*
+ * Verify that every segment recorded in a vdev's checkpoint space map
+ * (vdev_checkpoint_sm) is allocated according to the checkpointed
+ * state's metaslab space maps, i.e. that it does not appear in the
+ * checkpoint's ms_allocatable trees.
+ *
+ * The checkpoint space maps are read from the current state of the pool
+ * and each of their entries is checked against the metaslabs of the
+ * matching vdev in the checkpointed state.
+ *
+ * Note that this reloads the ms_allocatable trees of the "checkpoint"
+ * spa with the SM_FREE entries of their ms_sm space maps (via
+ * load_concrete_ms_allocatable_trees()).
+ */
+static void
+verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+	vdev_t *current_rvd = current->spa_root_vdev;
+
+	load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);
+
+	for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {
+		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];
+		vdev_t *current_vd = current_rvd->vdev_child[c];
+		uint64_t checkpoint_sm_obj;
+		space_map_t *checkpoint_sm = NULL;
+
+		/*
+		 * Since we don't allow device removal in a pool that has
+		 * a checkpoint, any removed vdev is expected to have been
+		 * removed before the checkpoint was taken.
+		 */
+		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+			continue;
+		}
+
+		/*
+		 * No checkpoint space map means nothing on this vdev is
+		 * checkpointed, so there is nothing to verify.
+		 */
+		if (current_vd->vdev_top_zap == 0)
+			continue;
+		if (zap_contains(spa_meta_objset(current),
+		    current_vd->vdev_top_zap,
+		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+			continue;
+
+		VERIFY0(zap_lookup(spa_meta_objset(current),
+		    current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+		    sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+		VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
+		    checkpoint_sm_obj, 0, current_vd->vdev_asize,
+		    current_vd->vdev_ashift));
+
+		/* Entries are checked against the checkpoint's metaslabs. */
+		verify_checkpoint_sm_entry_cb_arg_t vcsec = {
+			.vcsec_vd = ckpoint_vd,
+			.vcsec_entryid = 0,
+			.vcsec_num_entries =
+			    space_map_length(checkpoint_sm) / sizeof (uint64_t)
+		};
+		VERIFY0(space_map_iterate(checkpoint_sm,
+		    space_map_length(checkpoint_sm),
+		    verify_checkpoint_sm_entry_cb, &vcsec));
+
+		if (dump_opt['m'] > 3)
+			dump_spacemap(current->spa_meta_objset, checkpoint_sm);
+		space_map_close(checkpoint_sm);
+	}
+
+	/*
+	 * If we've added vdevs since we took the checkpoint, ensure
+	 * that their checkpoint space maps are empty.  The loop simply
+	 * does not run when no vdevs were added.
+	 */
+	for (uint64_t c = ckpoint_rvd->vdev_children;
+	    c < current_rvd->vdev_children; c++) {
+		vdev_t *current_vd = current_rvd->vdev_child[c];
+		ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL);
+	}
+
+	/* for cleaner progress output */
+	(void) fprintf(stderr, "\n");
+}
+
+/*
+ * Verifies that all space that's allocated in the checkpoint is
+ * still allocated in the current version, by checking that everything
+ * in checkpoint's ms_allocatable (which is actually allocated, not
+ * allocatable/free) is not present in current's ms_allocatable.
+ *
+ * Note that the function changes the state of the ms_allocatable
+ * trees of both spas when called. The entries of all ms_allocatable
+ * trees are cleared out and then repopulated from their respective
+ * ms_sm space maps. In the checkpointed state we load the allocated
+ * entries, and in the current state we load the free entries.
+ *
+ * Progress is reported on stderr, one line rewritten per metaslab.
+ */
+static void
+verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+	vdev_t *current_rvd = current->spa_root_vdev;
+
+	load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
+	load_concrete_ms_allocatable_trees(current, SM_FREE);
+
+	for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
+		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
+		vdev_t *current_vd = current_rvd->vdev_child[i];
+
+		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+			/*
+			 * See comment in verify_checkpoint_vdev_spacemaps()
+			 */
+			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+			continue;
+		}
+
+		for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
+			metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
+			metaslab_t *current_msp = current_vd->vdev_ms[m];
+
+			/*
+			 * The values are unsigned and printed with %llu,
+			 * so cast them to u_longlong_t.
+			 */
+			(void) fprintf(stderr,
+			    "\rverifying vdev %llu of %llu, "
+			    "metaslab %llu of %llu ...",
+			    (u_longlong_t)current_vd->vdev_id,
+			    (u_longlong_t)current_rvd->vdev_children,
+			    (u_longlong_t)current_msp->ms_id,
+			    (u_longlong_t)current_vd->vdev_ms_count);
+
+			/*
+			 * We walk through the ms_allocatable trees that
+			 * are loaded with the allocated blocks from the
+			 * ms_sm spacemaps of the checkpoint. For each
+			 * one of these ranges we ensure that none of them
+			 * exists in the ms_allocatable trees of the
+			 * current state which are loaded with the ranges
+			 * that are currently free.
+			 *
+			 * This way we ensure that none of the blocks that
+			 * are part of the checkpoint were freed by mistake.
+			 */
+			range_tree_walk(ckpoint_msp->ms_allocatable,
+			    (range_tree_func_t *)range_tree_verify_not_present,
+			    current_msp->ms_allocatable);
+		}
+	}
+
+	/* for cleaner progress output */
+	(void) fprintf(stderr, "\n");
+}
+
+static void
+verify_checkpoint_blocks(spa_t *spa)
+{
+ ASSERT(!dump_opt['L']);
+
+ spa_t *checkpoint_spa;
+ char *checkpoint_pool;
+ nvlist_t *config = NULL;
+ int error = 0;
+
+ /*
+ * We import the checkpointed state of the pool (under a different
+ * name) so we can do verification on it against the current state
+ * of the pool.
+ */
+ checkpoint_pool = import_checkpointed_state(spa->spa_name, config,
+ NULL);
+ ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
+
+ error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
+ if (error != 0) {
+ fatal("Tried to open pool \"%s\" but spa_open() failed with "
+ "error %d\n", checkpoint_pool, error);
+ }
+
+ /*
+ * Ensure that ranges in the checkpoint space maps of each vdev
+ * are allocated according to the checkpointed state's metaslab
+ * space maps.
+ */
+ verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
+
+ /*
+ * Ensure that allocated ranges in the checkpoint's metaslab
+ * space maps remain allocated in the metaslab space maps of
+ * the current state.
+ */
+ verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);