+static void
+zdb_set_skip_mmp(char *target)
+{
+ spa_t *spa;
+
+ /*
+ * Disable the activity check to allow examination of
+ * active pools.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(target)) != NULL) {
+ spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP;
+ }
+ mutex_exit(&spa_namespace_lock);
+}
+
+#define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
+/*
+ * Import the checkpointed state of the pool specified by the target
+ * parameter as readonly. The function also accepts a pool config
+ * as an optional parameter; otherwise it attempts to infer the
+ * config from the name of the target pool.
+ *
+ * Note that the checkpointed state's pool name will be the name of
+ * the original pool with the above suffix appended to it. In addition,
+ * if the target is not a pool name (e.g. a path to a dataset) then
+ * the new_path parameter is populated with the updated path to
+ * reflect the fact that we are looking into the checkpointed state.
+ *
+ * The function returns a newly-allocated copy of the name of the
+ * pool containing the checkpointed state. When this copy is no
+ * longer needed it should be freed with free(3C). The same applies
+ * to the new_path parameter, if allocated.
+ */
+static char *
+import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
+{
+ int error = 0;
+ char *poolname, *bogus_name = NULL;
+
+ /*
+ * If the target is not a pool name, extract the pool name from it.
+ * Note that if no '/' is present, poolname simply aliases target
+ * and must not be freed.
+ */
+ char *path_start = strchr(target, '/');
+ if (path_start != NULL) {
+ size_t poolname_len = path_start - target;
+ poolname = strndup(target, poolname_len);
+ } else {
+ poolname = target;
+ }
+
+ if (cfg == NULL) {
+ zdb_set_skip_mmp(poolname);
+ error = spa_get_stats(poolname, &cfg, NULL, 0);
+ if (error != 0) {
+ fatal("Tried to read config of pool \"%s\" but "
+ "spa_get_stats() failed with error %d\n",
+ poolname, error);
+ }
+ }
+
+ if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) {
+ if (target != poolname)
+ free(poolname);
+ return (NULL);
+ }
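+ /*
+ * Rename the pool in its config so that the checkpointed state is
+ * imported under the bogus name rather than on top of the original
+ * pool.
+ */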
+ fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
+
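+ /*
+ * Import the checkpointed state of the pool. ZFS_IMPORT_CHECKPOINT
+ * directs the import to rewind to the checkpointed uberblock, while
+ * ZFS_IMPORT_MISSING_LOG and ZFS_IMPORT_SKIP_MMP allow it to proceed
+ * without log devices and without the multihost activity check.
+ */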
+ error = spa_import(bogus_name, cfg, NULL,
+ ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT |
+ ZFS_IMPORT_SKIP_MMP);
+ if (error != 0) {
+ fatal("Tried to import pool \"%s\" but spa_import() failed "
+ "with error %d\n", bogus_name, error);
+ }
+
+ if (new_path != NULL && path_start != NULL) {
+ if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
+ free(bogus_name);
+ if (target != poolname)
+ free(poolname);
+ return (NULL);
+ }
+ }
+
+ if (target != poolname)
+ free(poolname);
+
+ return (bogus_name);
+}
+
+typedef struct verify_checkpoint_sm_entry_cb_arg {
+ vdev_t *vcsec_vd;
+
+ /* the following fields are only used for printing progress */
+ uint64_t vcsec_entryid;
+ uint64_t vcsec_num_entries;
+} verify_checkpoint_sm_entry_cb_arg_t;
+
+#define ENTRIES_PER_PROGRESS_UPDATE 10000
+
+static int
+verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
+{
+ verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
+ vdev_t *vd = vcsec->vcsec_vd;
+ metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+ uint64_t end = sme->sme_offset + sme->sme_run;
+
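+ /*
+ * A vdev's checkpoint space map records the space that was freed
+ * after the checkpoint was taken, so every entry we iterate over
+ * should be an SM_FREE entry.
+ */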
+ ASSERT(sme->sme_type == SM_FREE);
+
+ if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
+ (void) fprintf(stderr,
+ "\rverifying vdev %llu, space map entry %llu of %llu ...",
+ (longlong_t)vd->vdev_id,
+ (longlong_t)vcsec->vcsec_entryid,
+ (longlong_t)vcsec->vcsec_num_entries);
+ }
+ vcsec->vcsec_entryid++;
+
+ /*
+ * See comment in checkpoint_sm_exclude_entry_cb()
+ */
+ VERIFY3U(sme->sme_offset, >=, ms->ms_start);
+ VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+ /*
+ * The entries in the vdev_checkpoint_sm should be marked as
+ * allocated in the checkpointed state of the pool, therefore
+ * their respective ms_allocatable trees should not contain them.
+ */
+ mutex_enter(&ms->ms_lock);
+ range_tree_verify_not_present(ms->ms_allocatable,
+ sme->sme_offset, sme->sme_run);
+ mutex_exit(&ms->ms_lock);
+
+ return (0);
+}
+
+/*
+ * Verify that all segments in the vdev_checkpoint_sm are allocated
+ * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
+ * ms_allocatable).
+ *
+ * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
+ * each vdev in the current state of the pool to the metaslab space maps
+ * (ms_sm) of the checkpointed state of the pool.
+ *
+ * Note that the function changes the state of the ms_allocatable
+ * trees of the current spa_t. The entries of these ms_allocatable
+ * trees are cleared out and then repopulated with the free
+ * entries of their respective ms_sm space maps.
+ */
+static void
+verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+ vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+ vdev_t *current_rvd = current->spa_root_vdev;
+
+ load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);
+
+ for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {
+ vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];
+ vdev_t *current_vd = current_rvd->vdev_child[c];
+
+ space_map_t *checkpoint_sm = NULL;
+ uint64_t checkpoint_sm_obj;
+
+ if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+ /*
+ * Since we don't allow device removal in a pool
+ * that has a checkpoint, we expect that all removed
+ * vdevs were removed from the pool before the
+ * checkpoint.
+ */
+ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+ continue;
+ }
+
+ /*
+ * If the checkpoint space map doesn't exist, then nothing
+ * here is checkpointed so there's nothing to verify.
+ */
+ if (current_vd->vdev_top_zap == 0 ||
+ zap_contains(spa_meta_objset(current),
+ current_vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+ continue;
+
+ VERIFY0(zap_lookup(spa_meta_objset(current),
+ current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+ sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+ VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
+ checkpoint_sm_obj, 0, current_vd->vdev_asize,
+ current_vd->vdev_ashift));
+
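+ /*
+ * Note that vcsec_num_entries assumes one-word space map
+ * entries; it is only used for progress reporting.
+ */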
+ verify_checkpoint_sm_entry_cb_arg_t vcsec;
+ vcsec.vcsec_vd = ckpoint_vd;
+ vcsec.vcsec_entryid = 0;
+ vcsec.vcsec_num_entries =
+ space_map_length(checkpoint_sm) / sizeof (uint64_t);
+ VERIFY0(space_map_iterate(checkpoint_sm,
+ space_map_length(checkpoint_sm),
+ verify_checkpoint_sm_entry_cb, &vcsec));
+ if (dump_opt['m'] > 3)
+ dump_spacemap(current->spa_meta_objset, checkpoint_sm);
+ space_map_close(checkpoint_sm);
+ }
+
+ /*
+ * If we've added vdevs since we took the checkpoint, ensure
+ * that their checkpoint space maps are empty.
+ */
+ if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
+ for (uint64_t c = ckpoint_rvd->vdev_children;
+ c < current_rvd->vdev_children; c++) {
+ vdev_t *current_vd = current_rvd->vdev_child[c];
+ ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL);
+ }
+ }
+
+ /* for cleaner progress output */
+ (void) fprintf(stderr, "\n");
+}
+
+/*
+ * Verifies that all space that's allocated in the checkpoint is
+ * still allocated in the current version, by checking that everything
+ * in the checkpoint's ms_allocatable (which, despite its name, is
+ * loaded with the allocated ranges at that point) is not present in
+ * the current state's ms_allocatable.
+ *
+ * Note that the function changes the state of the ms_allocatable
+ * trees of both spas when called. The entries of all ms_allocatable
+ * trees are cleared out and then repopulated from their respective
+ * ms_sm space maps. In the checkpointed state we load the allocated
+ * entries, and in the current state we load the free entries.
+ */
+static void
+verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+ vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+ vdev_t *current_rvd = current->spa_root_vdev;
+
+ load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
+ load_concrete_ms_allocatable_trees(current, SM_FREE);
+
+ for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
+ vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
+ vdev_t *current_vd = current_rvd->vdev_child[i];
+
+ if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+ /*
+ * See comment in verify_checkpoint_vdev_spacemaps()
+ */
+ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+ continue;
+ }
+
+ for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
+ metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
+ metaslab_t *current_msp = current_vd->vdev_ms[m];
+
+ (void) fprintf(stderr,
+ "\rverifying vdev %llu of %llu, "
+ "metaslab %llu of %llu ...",
+ (longlong_t)current_vd->vdev_id,
+ (longlong_t)current_rvd->vdev_children,
+ (longlong_t)current_msp->ms_id,
+ (longlong_t)current_vd->vdev_ms_count);
+
+ /*
+ * We walk through the ms_allocatable trees that
+ * are loaded with the allocated blocks from the
+ * ms_sm spacemaps of the checkpoint. For each
+ * one of these ranges we ensure that none of them
+ * exists in the ms_allocatable trees of the
+ * current state which are loaded with the ranges
+ * that are currently free.
+ *
+ * This way we ensure that none of the blocks that
+ * are part of the checkpoint were freed by mistake.
+ */
+ range_tree_walk(ckpoint_msp->ms_allocatable,
+ (range_tree_func_t *)range_tree_verify_not_present,
+ current_msp->ms_allocatable);
+ }
+ }
+
+ /* for cleaner progress output */
+ (void) fprintf(stderr, "\n");
+}
+
+static void
+verify_checkpoint_blocks(spa_t *spa)
+{
+ ASSERT(!dump_opt['L']);
+
+ spa_t *checkpoint_spa;
+ char *checkpoint_pool;
+ nvlist_t *config = NULL;
+ int error = 0;
+
+ /*
+ * We import the checkpointed state of the pool (under a different
+ * name) so we can do verification on it against the current state
+ * of the pool.
+ */
+ checkpoint_pool = import_checkpointed_state(spa->spa_name, config,
+ NULL);
+ ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
+
+ error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
+ if (error != 0) {
+ fatal("Tried to open pool \"%s\" but spa_open() failed with "
+ "error %d\n", checkpoint_pool, error);
+ }
+
+ /*
+ * Ensure that ranges in the checkpoint space maps of each vdev
+ * are allocated according to the checkpointed state's metaslab
+ * space maps.
+ */
+ verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
+
+ /*
+ * Ensure that allocated ranges in the checkpoint's metaslab
+ * space maps remain allocated in the metaslab space maps of
+ * the current state.
+ */
+ verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);
+
+ /*
+ * Once we are done, we get rid of the checkpointed state.
+ */
+ spa_close(checkpoint_spa, FTAG);
+ free(checkpoint_pool);
+}
+
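+/*
+ * Dump the leftover entries of the checkpoint space maps of every vdev.
+ * This is used when a pool is found with a partially discarded
+ * checkpoint.
+ */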
+static void
+dump_leftover_checkpoint_blocks(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *vd = rvd->vdev_child[i];
+
+ space_map_t *checkpoint_sm = NULL;
+ uint64_t checkpoint_sm_obj;
+
+ if (vd->vdev_top_zap == 0)
+ continue;
+
+ if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+ continue;
+
+ VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+ sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+ VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
+ checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
+ dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
+ space_map_close(checkpoint_sm);
+ }
+}
+
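+/*
+ * Top-level checkpoint verification: look up the checkpointed uberblock
+ * in the MOS and, unless space map loading is disabled (dump_opt['L']),
+ * verify the checkpointed blocks against the current state of the pool.
+ */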
+static int
+verify_checkpoint(spa_t *spa)
+{
+ uberblock_t checkpoint;
+ int error;
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (0);
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+ sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+ if (error == ENOENT && !dump_opt['L']) {
+ /*
+ * If the feature is active but the uberblock is missing
+ * then we must be in the middle of discarding the
+ * checkpoint.
+ */
+ (void) printf("\nPartially discarded checkpoint "
+ "state found:\n");
+ if (dump_opt['m'] > 3)
+ dump_leftover_checkpoint_blocks(spa);
+ return (0);
+ } else if (error != 0) {
+ (void) printf("lookup error %d when looking for "
+ "checkpointed uberblock in MOS\n", error);
+ return (error);
+ }
+ dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");
+
+ if (checkpoint.ub_checkpoint_txg == 0) {
+ (void) printf("\nub_checkpoint_txg not set in checkpointed "
+ "uberblock\n");
+ error = 3;
+ }
+
+ if (error == 0 && !dump_opt['L'])
+ verify_checkpoint_blocks(spa);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static void
+mos_leaks_cb(void *arg, uint64_t start, uint64_t size)
+{
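+ /*
+ * Invoked by range_tree_walk() with the start and length of each
+ * remaining segment; every object number in the range was
+ * referenced but never allocated.
+ */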
+ for (uint64_t i = start; i < start + size; i++) {
+ (void) printf("MOS object %llu referenced but not allocated\n",
+ (u_longlong_t)i);
+ }
+}
+
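+/*
+ * Mark a MOS object as referenced. Object numbers are tracked as
+ * length-1 segments in the mos_refd_objs range tree.
+ */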
+static void
+mos_obj_refd(uint64_t obj)
+{
+ if (obj != 0 && mos_refd_objs != NULL)
+ range_tree_add(mos_refd_objs, obj, 1);
+}
+
+/*
+ * Call on a MOS object that may already have been referenced.
+ */
+static void
+mos_obj_refd_multiple(uint64_t obj)
+{
+ if (obj != 0 && mos_refd_objs != NULL &&
+ !range_tree_contains(mos_refd_objs, obj, 1))
+ range_tree_add(mos_refd_objs, obj, 1);
+}
+
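+/*
+ * Recursively mark every MOS object referenced by this vdev subtree:
+ * DTLs, metaslab arrays, ZAPs, indirect mapping metadata, and space
+ * maps.
+ */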
+static void
+mos_leak_vdev(vdev_t *vd)
+{
+ mos_obj_refd(vd->vdev_dtl_object);
+ mos_obj_refd(vd->vdev_ms_array);
+ mos_obj_refd(vd->vdev_top_zap);
+ mos_obj_refd(vd->vdev_indirect_config.vic_births_object);
+ mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object);
+ mos_obj_refd(vd->vdev_leaf_zap);
+ if (vd->vdev_checkpoint_sm != NULL)
+ mos_obj_refd(vd->vdev_checkpoint_sm->sm_object);
+ if (vd->vdev_indirect_mapping != NULL) {
+ mos_obj_refd(vd->vdev_indirect_mapping->
+ vim_phys->vimp_counts_object);
+ }
+ if (vd->vdev_obsolete_sm != NULL)
+ mos_obj_refd(vd->vdev_obsolete_sm->sm_object);
+
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *ms = vd->vdev_ms[m];
+ mos_obj_refd(space_map_object(ms->ms_sm));
+ }
+
+ for (uint64_t c = 0; c < vd->vdev_children; c++) {
+ mos_leak_vdev(vd->vdev_child[c]);
+ }
+}
+
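+/*
+ * Cross-check the MOS objects referenced by the pool against the set
+ * of allocated MOS objects, reporting both leaked objects and objects
+ * that are referenced but not allocated.
+ */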
+static int
+dump_mos_leaks(spa_t *spa)
+{
+ int rv = 0;
+ objset_t *mos = spa->spa_meta_objset;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+
+ /* Visit and mark all referenced objects in the MOS */
+
+ mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT);
+ mos_obj_refd(spa->spa_pool_props_object);
+ mos_obj_refd(spa->spa_config_object);
+ mos_obj_refd(spa->spa_ddt_stat_object);
+ mos_obj_refd(spa->spa_feat_desc_obj);
+ mos_obj_refd(spa->spa_feat_enabled_txg_obj);
+ mos_obj_refd(spa->spa_feat_for_read_obj);
+ mos_obj_refd(spa->spa_feat_for_write_obj);
+ mos_obj_refd(spa->spa_history);
+ mos_obj_refd(spa->spa_errlog_last);
+ mos_obj_refd(spa->spa_errlog_scrub);
+ mos_obj_refd(spa->spa_all_vdev_zaps);
+ mos_obj_refd(dp->dp_bptree_obj);
+ mos_obj_refd(dp->dp_tmp_userrefs_obj);
+ mos_obj_refd(dp->dp_scan->scn_phys.scn_queue_obj);
+ bpobj_count_refd(&spa->spa_deferred_bpobj);
+ mos_obj_refd(dp->dp_empty_bpobj);
+ bpobj_count_refd(&dp->dp_obsolete_bpobj);
+ bpobj_count_refd(&dp->dp_free_bpobj);
+ mos_obj_refd(spa->spa_l2cache.sav_object);
+ mos_obj_refd(spa->spa_spares.sav_object);
+
+ mos_obj_refd(spa->spa_condensing_indirect_phys.
+ scip_next_mapping_object);
+ mos_obj_refd(spa->spa_condensing_indirect_phys.
+ scip_prev_obsolete_sm_object);
+ if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) {
+ vdev_indirect_mapping_t *vim =
+ vdev_indirect_mapping_open(mos,
+ spa->spa_condensing_indirect_phys.scip_next_mapping_object);
+ mos_obj_refd(vim->vim_phys->vimp_counts_object);
+ vdev_indirect_mapping_close(vim);
+ }
+
+ if (dp->dp_origin_snap != NULL) {
+ dsl_dataset_t *ds;
+
+ dsl_pool_config_enter(dp, FTAG);
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj,
+ FTAG, &ds));
+ count_ds_mos_objects(ds);
+ dump_deadlist(&ds->ds_deadlist);
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+
+ count_ds_mos_objects(dp->dp_origin_snap);
+ dump_deadlist(&dp->dp_origin_snap->ds_deadlist);
+ }
+ count_dir_mos_objects(dp->dp_mos_dir);
+ if (dp->dp_free_dir != NULL)
+ count_dir_mos_objects(dp->dp_free_dir);
+ if (dp->dp_leak_dir != NULL)
+ count_dir_mos_objects(dp->dp_leak_dir);
+
+ mos_leak_vdev(spa->spa_root_vdev);
+
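+ /*
+ * Each combination of checksum function, DDT type, and DDT class
+ * may have its own object in the MOS.
+ */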
+ for (uint64_t class = 0; class < DDT_CLASSES; class++) {
+ for (uint64_t type = 0; type < DDT_TYPES; type++) {
+ for (uint64_t cksum = 0;
+ cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
+ ddt_t *ddt = spa->spa_ddt[cksum];
+ mos_obj_refd(ddt->ddt_object[type][class]);
+ }
+ }
+ }
+
+ /*
+ * Visit all allocated objects and make sure they are referenced.
+ */
+ uint64_t object = 0;
+ while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) {
+ if (range_tree_contains(mos_refd_objs, object, 1)) {
+ range_tree_remove(mos_refd_objs, object, 1);
+ } else {
+ dmu_object_info_t doi;
+ const char *name;
+ dmu_object_info(mos, object, &doi);
+ if (doi.doi_type & DMU_OT_NEWTYPE) {
+ dmu_object_byteswap_t bswap =
+ DMU_OT_BYTESWAP(doi.doi_type);
+ name = dmu_ot_byteswap[bswap].ob_name;
+ } else {
+ name = dmu_ot[doi.doi_type].ot_name;
+ }
+
+ (void) printf("MOS object %llu (%s) leaked\n",
+ (u_longlong_t)object, name);
+ rv = 2;
+ }
+ }
+ (void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL);
+ if (!range_tree_is_empty(mos_refd_objs))
+ rv = 2;
+ range_tree_vacate(mos_refd_objs, NULL, NULL);
+ range_tree_destroy(mos_refd_objs);
+ return (rv);
+}
+