OpenZFS 9166 - zfs storage pool checkpoint
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index faac43c792aa05b38f24fce422df16fea40c5c46..d1e77cce7f58b043657c15fe12f84542e30943ab 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -131,7 +131,7 @@ static void
 usage(void)
 {
        (void) fprintf(stderr,
-           "Usage:\t%s [-AbcdDFGhiLMPsvX] [-e [-V] [-p <path> ...]] "
+           "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p <path> ...]] "
            "[-I <inflight I/Os>]\n"
            "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
            "\t\t[<poolname> [<object> ...]]\n"
@@ -168,6 +168,8 @@ usage(void)
        (void) fprintf(stderr, "        -h pool history\n");
        (void) fprintf(stderr, "        -i intent logs\n");
        (void) fprintf(stderr, "        -l read label contents\n");
+       (void) fprintf(stderr, "        -k examine the checkpointed state "
+           "of the pool\n");
        (void) fprintf(stderr, "        -L disable leak tracking (do not "
            "load spacemaps)\n");
        (void) fprintf(stderr, "        -m metaslabs\n");
@@ -730,6 +732,22 @@ get_prev_obsolete_spacemap_refcount(spa_t *spa)
        return (0);
 }
 
+static int
+get_checkpoint_refcount(vdev_t *vd)
+{
+       int refcount = 0;
+
+       if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&
+           zap_contains(spa_meta_objset(vd->vdev_spa),
+           vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
+               refcount++;
+
+       for (uint64_t c = 0; c < vd->vdev_children; c++)
+               refcount += get_checkpoint_refcount(vd->vdev_child[c]);
+
+       return (refcount);
+}
+
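Editor's note: get_checkpoint_refcount() contributes one reference per top-level vdev whose top-level ZAP carries VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, summing over the vdev tree recursively so that verify_spacemap_refcounts() can balance the feature refcount against the space maps that actually exist. A minimal stand-alone sketch of the same walk, using a hypothetical toy_vdev_t in place of the real vdev_t:

/* Hypothetical, simplified model of the vdev tree; not the real vdev_t. */
#include <stdio.h>

typedef struct toy_vdev {
        int has_checkpoint_sm;          /* stands in for the top-ZAP entry */
        size_t nchildren;
        struct toy_vdev **children;
} toy_vdev_t;

/* One count per vdev that carries a checkpoint space map, summed recursively. */
static int
toy_checkpoint_refcount(const toy_vdev_t *vd)
{
        int refcount = vd->has_checkpoint_sm ? 1 : 0;

        for (size_t c = 0; c < vd->nchildren; c++)
                refcount += toy_checkpoint_refcount(vd->children[c]);

        return (refcount);
}

int
main(void)
{
        toy_vdev_t top0 = { 1, 0, NULL };
        toy_vdev_t top1 = { 0, 0, NULL };
        toy_vdev_t *kids[] = { &top0, &top1 };
        toy_vdev_t root = { 0, 2, kids };

        printf("checkpoint refcount = %d\n", toy_checkpoint_refcount(&root));
        return (0);
}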
 static int
 verify_spacemap_refcounts(spa_t *spa)
 {
@@ -743,6 +761,7 @@ verify_spacemap_refcounts(spa_t *spa)
        actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
        actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
        actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
+       actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);
 
        if (expected_refcount != actual_refcount) {
                (void) printf("space map refcount mismatch: expected %lld != "
@@ -816,8 +835,8 @@ static void
 dump_metaslab_stats(metaslab_t *msp)
 {
        char maxbuf[32];
-       range_tree_t *rt = msp->ms_tree;
-       avl_tree_t *t = &msp->ms_size_tree;
+       range_tree_t *rt = msp->ms_allocatable;
+       avl_tree_t *t = &msp->ms_allocatable_by_size;
        int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 
        /* make sure nicenum has enough space */
@@ -853,7 +872,7 @@ dump_metaslab(metaslab_t *msp)
                metaslab_load_wait(msp);
                if (!msp->ms_loaded) {
                        VERIFY0(metaslab_load(msp));
-                       range_tree_stat_verify(msp->ms_tree);
+                       range_tree_stat_verify(msp->ms_allocatable);
                }
                dump_metaslab_stats(msp);
                metaslab_unload(msp);
@@ -2420,6 +2439,8 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
                snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
                (void) printf("\trootbp = %s\n", blkbuf);
        }
+       (void) printf("\tcheckpoint_txg = %llu\n",
+           (u_longlong_t)ub->ub_checkpoint_txg);
        (void) printf("%s", footer ? footer : "");
 }
 
@@ -3129,6 +3150,7 @@ static const char *zdb_ot_extname[] = {
 typedef struct zdb_cb {
        zdb_blkstats_t  zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
        uint64_t        zcb_removing_size;
+       uint64_t        zcb_checkpoint_size;
        uint64_t        zcb_dedup_asize;
        uint64_t        zcb_dedup_blocks;
        uint64_t        zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
@@ -3229,7 +3251,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
        }
 
        VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
-           refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
+           refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
            bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
 }
 
@@ -3388,7 +3410,7 @@ claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
        ASSERT(vdev_is_concrete(vd));
 
        VERIFY0(metaslab_claim_impl(vd, offset, size,
-           spa_first_txg(vd->vdev_spa)));
+           spa_min_claim_txg(vd->vdev_spa)));
 }
 
 static void
@@ -3453,70 +3475,6 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
        spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
-/*
- * vm_idxp is an in-out parameter which (for indirect vdevs) is the
- * index in vim_entries that has the first entry in this metaslab.  On
- * return, it will be set to the first entry after this metaslab.
- */
-static void
-zdb_leak_init_ms(metaslab_t *msp, uint64_t *vim_idxp)
-{
-       metaslab_group_t *mg = msp->ms_group;
-       vdev_t *vd = mg->mg_vd;
-       vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
-
-       mutex_enter(&msp->ms_lock);
-       metaslab_unload(msp);
-
-       /*
-        * We don't want to spend the CPU manipulating the size-ordered
-        * tree, so clear the range_tree ops.
-        */
-       msp->ms_tree->rt_ops = NULL;
-
-       (void) fprintf(stderr,
-           "\rloading vdev %llu of %llu, metaslab %llu of %llu ...",
-           (longlong_t)vd->vdev_id,
-           (longlong_t)rvd->vdev_children,
-           (longlong_t)msp->ms_id,
-           (longlong_t)vd->vdev_ms_count);
-
-       /*
-        * For leak detection, we overload the metaslab ms_tree to
-        * contain allocated segments instead of free segments. As a
-        * result, we can't use the normal metaslab_load/unload
-        * interfaces.
-        */
-       if (vd->vdev_ops == &vdev_indirect_ops) {
-               vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
-               for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
-                   (*vim_idxp)++) {
-                       vdev_indirect_mapping_entry_phys_t *vimep =
-                           &vim->vim_entries[*vim_idxp];
-                       uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
-                       uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
-                       ASSERT3U(ent_offset, >=, msp->ms_start);
-                       if (ent_offset >= msp->ms_start + msp->ms_size)
-                               break;
-
-                       /*
-                        * Mappings do not cross metaslab boundaries,
-                        * because we create them by walking the metaslabs.
-                        */
-                       ASSERT3U(ent_offset + ent_len, <=,
-                           msp->ms_start + msp->ms_size);
-                       range_tree_add(msp->ms_tree, ent_offset, ent_len);
-               }
-       } else if (msp->ms_sm != NULL) {
-               VERIFY0(space_map_load(msp->ms_sm, msp->ms_tree, SM_ALLOC));
-       }
-
-       if (!msp->ms_loaded) {
-               msp->ms_loaded = B_TRUE;
-       }
-       mutex_exit(&msp->ms_lock);
-}
-
 /* ARGSUSED */
 static int
 increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
@@ -3615,11 +3573,246 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
        ASSERT(error == ENOENT);
 }
 
+typedef struct checkpoint_sm_exclude_entry_arg {
+       vdev_t *cseea_vd;
+       uint64_t cseea_checkpoint_size;
+} checkpoint_sm_exclude_entry_arg_t;
+
+static int
+checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
+    void *arg)
+{
+       checkpoint_sm_exclude_entry_arg_t *cseea = arg;
+       vdev_t *vd = cseea->cseea_vd;
+       metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+       uint64_t end = offset + size;
+
+       ASSERT(type == SM_FREE);
+
+       /*
+        * Since the vdev_checkpoint_sm exists in the vdev level
+        * and the ms_sm space maps exist in the metaslab level,
+        * an entry in the checkpoint space map could theoretically
+        * cross the boundaries of the metaslab it belongs to.
+        *
+        * In reality, because of the way that we populate and
+        * manipulate the checkpoint's space maps currently,
+        * there shouldn't be any entries that cross metaslabs.
+        * Hence the assertion below.
+        *
+        * That said, there is no fundamental requirement that
+        * the checkpoint's space map entries should not cross
+        * metaslab boundaries. So if needed we could add code
+        * that handles metaslab-crossing segments in the future.
+        */
+       VERIFY3U(offset, >=, ms->ms_start);
+       VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+       /*
+        * By removing the entry from the allocated segments we
+        * also verify that the entry is there to begin with.
+        */
+       mutex_enter(&ms->ms_lock);
+       range_tree_remove(ms->ms_allocatable, offset, size);
+       mutex_exit(&ms->ms_lock);
+
+       cseea->cseea_checkpoint_size += size;
+       return (0);
+}
+
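Editor's note: the callback above verifies each checkpoint space map entry implicitly, since range_tree_remove() fails if the range is not fully present in the tree; excluding the entry therefore also proves it was marked as allocated. A simplified, hypothetical illustration of that remove-and-verify idea over a plain array of segments (only removals that touch a segment boundary are handled in this sketch):

/* Hypothetical stand-in for range_tree_remove() semantics: removing a */
/* segment asserts it is fully contained, so removal doubles as a      */
/* presence check.                                                      */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct seg {
        uint64_t start;
        uint64_t end;           /* exclusive */
} seg_t;

static void
toy_remove(seg_t *allocated, size_t nsegs, uint64_t offset, uint64_t size)
{
        uint64_t end = offset + size;

        for (size_t i = 0; i < nsegs; i++) {
                if (offset >= allocated[i].start && end <= allocated[i].end) {
                        /* NB: interior removals would need a split; this   */
                        /* sketch only handles removals at a boundary.      */
                        if (offset == allocated[i].start)
                                allocated[i].start = end;
                        else
                                allocated[i].end = offset;
                        return;
                }
        }
        /* Like range_tree_remove(), treat a missing segment as fatal. */
        assert(!"segment not found in allocated set");
}

int
main(void)
{
        seg_t allocated[] = { { 0, 1024 }, { 4096, 8192 } };

        toy_remove(allocated, 2, 4096, 512);    /* ok: fully contained */
        printf("remaining second segment: [%llu, %llu)\n",
            (unsigned long long)allocated[1].start,
            (unsigned long long)allocated[1].end);
        return (0);
}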
+static void
+zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
+{
+       spa_t *spa = vd->vdev_spa;
+       space_map_t *checkpoint_sm = NULL;
+       uint64_t checkpoint_sm_obj;
+
+       /*
+        * If there is no vdev_top_zap, we are in a pool whose
+        * version predates the pool checkpoint feature.
+        */
+       if (vd->vdev_top_zap == 0)
+               return;
+
+       /*
+        * If there is no reference to the vdev_checkpoint_sm in
+        * the vdev_top_zap, then one of the following scenarios
+        * is true:
+        *
+        * 1] There is no checkpoint
+        * 2] There is a checkpoint, but no checkpointed blocks
+        *    have been freed yet
+        * 3] The current vdev is indirect
+        *
+        * In these cases we return immediately.
+        */
+       if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
+           VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+               return;
+
+       VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
+           VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
+           &checkpoint_sm_obj));
+
+       checkpoint_sm_exclude_entry_arg_t cseea;
+       cseea.cseea_vd = vd;
+       cseea.cseea_checkpoint_size = 0;
+
+       VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
+           checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
+       space_map_update(checkpoint_sm);
+
+       VERIFY0(space_map_iterate(checkpoint_sm,
+           checkpoint_sm_exclude_entry_cb, &cseea));
+       space_map_close(checkpoint_sm);
+
+       zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
+}
+
+static void
+zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+       for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+               ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
+               zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
+       }
+}
+
+static void
+load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+       for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+               vdev_t *vd = rvd->vdev_child[i];
+
+               ASSERT3U(i, ==, vd->vdev_id);
+
+               if (vd->vdev_ops == &vdev_indirect_ops)
+                       continue;
+
+               for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+                       metaslab_t *msp = vd->vdev_ms[m];
+
+                       (void) fprintf(stderr,
+                           "\rloading concrete vdev %llu, "
+                           "metaslab %llu of %llu ...",
+                           (longlong_t)vd->vdev_id,
+                           (longlong_t)msp->ms_id,
+                           (longlong_t)vd->vdev_ms_count);
+
+                       mutex_enter(&msp->ms_lock);
+                       metaslab_unload(msp);
+
+                       /*
+                        * We don't want to spend the CPU manipulating the
+                        * size-ordered tree, so clear the range_tree ops.
+                        */
+                       msp->ms_allocatable->rt_ops = NULL;
+
+                       if (msp->ms_sm != NULL) {
+                               VERIFY0(space_map_load(msp->ms_sm,
+                                   msp->ms_allocatable, maptype));
+                       }
+                       if (!msp->ms_loaded)
+                               msp->ms_loaded = B_TRUE;
+                       mutex_exit(&msp->ms_lock);
+               }
+       }
+}
+
+/*
+ * vim_idxp is an in-out parameter which (for indirect vdevs) is the
+ * index in vim_entries that has the first entry in this metaslab.
+ * On return, it will be set to the first entry after this metaslab.
+ */
+static void
+load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
+    uint64_t *vim_idxp)
+{
+       vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+       mutex_enter(&msp->ms_lock);
+       metaslab_unload(msp);
+
+       /*
+        * We don't want to spend the CPU manipulating the
+        * size-ordered tree, so clear the range_tree ops.
+        */
+       msp->ms_allocatable->rt_ops = NULL;
+
+       for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
+           (*vim_idxp)++) {
+               vdev_indirect_mapping_entry_phys_t *vimep =
+                   &vim->vim_entries[*vim_idxp];
+               uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
+               uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
+               ASSERT3U(ent_offset, >=, msp->ms_start);
+               if (ent_offset >= msp->ms_start + msp->ms_size)
+                       break;
+
+               /*
+                * Mappings do not cross metaslab boundaries,
+                * because we create them by walking the metaslabs.
+                */
+               ASSERT3U(ent_offset + ent_len, <=,
+                   msp->ms_start + msp->ms_size);
+               range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
+       }
+
+       if (!msp->ms_loaded)
+               msp->ms_loaded = B_TRUE;
+       mutex_exit(&msp->ms_lock);
+}
+
+static void
+zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+       for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *vd = rvd->vdev_child[c];
+
+               ASSERT3U(c, ==, vd->vdev_id);
+
+               if (vd->vdev_ops != &vdev_indirect_ops)
+                       continue;
+
+               /*
+                * Note: we don't check for mapping leaks on
+                * removing vdevs because their ms_allocatable trees
+                * are used to look for leaks in allocated space.
+                */
+               zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);
+
+               /*
+                * Normally, indirect vdevs don't have any
+                * metaslabs.  We want to set them up for
+                * zio_claim().
+                */
+               VERIFY0(vdev_metaslab_init(vd, 0));
+
+               vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+               uint64_t vim_idx = 0;
+               for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+
+                       (void) fprintf(stderr,
+                           "\rloading indirect vdev %llu, "
+                           "metaslab %llu of %llu ...",
+                           (longlong_t)vd->vdev_id,
+                           (longlong_t)vd->vdev_ms[m]->ms_id,
+                           (longlong_t)vd->vdev_ms_count);
+
+                       load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
+                           &vim_idx);
+               }
+               ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
+       }
+}
+
 static void
 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {
        zcb->zcb_spa = spa;
-       uint64_t c;
 
        if (!dump_opt['L']) {
                dsl_pool_t *dp = spa->spa_dsl_pool;
@@ -3627,7 +3820,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 
                /*
                 * We are going to be changing the meaning of the metaslab's
-                * ms_tree.  Ensure that the allocator doesn't try to
+                * ms_allocatable.  Ensure that the allocator doesn't try to
                 * use the tree.
                 */
                spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
@@ -3637,38 +3830,37 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
                    umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
                    UMEM_NOFAIL);
 
-               for (c = 0; c < rvd->vdev_children; c++) {
-                       vdev_t *vd = rvd->vdev_child[c];
-                       uint64_t vim_idx = 0;
-
-                       ASSERT3U(c, ==, vd->vdev_id);
-
-                       /*
-                        * Note: we don't check for mapping leaks on
-                        * removing vdevs because their ms_tree's are
-                        * used to look for leaks in allocated space.
-                        */
-                       if (vd->vdev_ops == &vdev_indirect_ops) {
-                               zcb->zcb_vd_obsolete_counts[c] =
-                                   zdb_load_obsolete_counts(vd);
+               /*
+                * For leak detection, we overload the ms_allocatable trees
+                * to contain allocated segments instead of free segments.
+                * As a result, we can't use the normal metaslab_load/unload
+                * interfaces.
+                */
+               zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
+               load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
 
-                               /*
-                                * Normally, indirect vdevs don't have any
-                                * metaslabs.  We want to set them up for
-                                * zio_claim().
-                                */
-                               VERIFY0(vdev_metaslab_init(vd, 0));
-                       }
+               /*
+                * In load_concrete_ms_allocatable_trees() we loaded all the
+                * allocated entries from the ms_sm to the ms_allocatable for
+                * each metaslab. If the pool has a checkpoint or is in the
+                * middle of discarding a checkpoint, some of these blocks
+                * may have been freed but their ms_sm may not have been
+                * updated because they are referenced by the checkpoint. In
+                * order to avoid false-positives during leak-detection, we
+                * go through the vdev's checkpoint space map and exclude all
+                * its entries from their relevant ms_allocatable.
+                *
+                * We also aggregate the space held by the checkpoint and add
+                * it to zcb_checkpoint_size.
+                *
+                * Note that at this point we are also verifying that all the
+                * entries on the checkpoint_sm are marked as allocated in
+                * the ms_sm of their relevant metaslab.
+                * [see comment in checkpoint_sm_exclude_entry_cb()]
+                */
+               zdb_leak_init_exclude_checkpoint(spa, zcb);
 
-                       for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
-                               zdb_leak_init_ms(vd->vdev_ms[m], &vim_idx);
-                       }
-                       if (vd->vdev_ops == &vdev_indirect_ops) {
-                               ASSERT3U(vim_idx, ==,
-                                   vdev_indirect_mapping_num_entries(
-                                   vd->vdev_indirect_mapping));
-                       }
-               }
+               /* for cleaner progress output */
                (void) fprintf(stderr, "\n");
 
                if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
@@ -3677,12 +3869,16 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
                        (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
                            increment_indirect_mapping_cb, zcb, NULL);
                }
+       } else {
+               /*
+                * If leak tracing is disabled, we still need to consider
+                * any checkpointed space in our space verification.
+                */
+               zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa);
        }
 
        spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-
        zdb_ddt_leak_init(spa, zcb);
-
        spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
@@ -3709,7 +3905,7 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
                for (uint64_t inner_offset = 0;
                    inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
                    inner_offset += 1 << vd->vdev_ashift) {
-                       if (range_tree_contains(msp->ms_tree,
+                       if (range_tree_contains(msp->ms_allocatable,
                            offset + inner_offset, 1 << vd->vdev_ashift)) {
                                obsolete_bytes += 1 << vd->vdev_ashift;
                        }
@@ -3775,23 +3971,23 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
                                ASSERT3P(mg, ==, msp->ms_group);
 
                                /*
-                                * The ms_tree has been overloaded to
-                                * contain allocated segments. Now that we
-                                * finished traversing all blocks, any
-                                * block that remains in the ms_tree
+                                * ms_allocatable has been overloaded
+                                * to contain allocated segments. Now that
+                                * we finished traversing all blocks, any
+                                * block that remains in the ms_allocatable
                                 * represents an allocated block that we
                                 * did not claim during the traversal.
                                 * Claimed blocks would have been removed
-                                * from the ms_tree.  For indirect vdevs,
-                                * space remaining in the tree represents
-                                * parts of the mapping that are not
-                                * referenced, which is not a bug.
+                                * from the ms_allocatable.  For indirect
+                                * vdevs, space remaining in the tree
+                                * represents parts of the mapping that are
+                                * not referenced, which is not a bug.
                                 */
                                if (vd->vdev_ops == &vdev_indirect_ops) {
-                                       range_tree_vacate(msp->ms_tree,
+                                       range_tree_vacate(msp->ms_allocatable,
                                            NULL, NULL);
                                } else {
-                                       range_tree_vacate(msp->ms_tree,
+                                       range_tree_vacate(msp->ms_allocatable,
                                            zdb_leak, vd);
                                }
 
@@ -3923,7 +4119,7 @@ dump_block_stats(spa_t *spa)
 
        total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
        total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
-           zcb.zcb_removing_size;
+           zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
 
        if (total_found == total_alloc) {
                if (!dump_opt['L'])
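Editor's note: a rough numeric illustration of the accounting above, with made-up figures. The space found by the block traversal, minus the extra references counted for dedup, plus the space held by an in-progress device removal and by the checkpoint, should equal what the metaslab allocators report as allocated:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint64_t zb_asize = 900;          /* asize of traversed blocks      */
        uint64_t dedup_asize = 50;        /* extra refs counted by dedup    */
        uint64_t removing_size = 100;     /* held by device removal         */
        uint64_t checkpoint_size = 50;    /* freed but held by checkpoint   */
        uint64_t total_alloc = 1000;      /* per metaslab accounting        */

        uint64_t total_found =
            zb_asize - dedup_asize + removing_size + checkpoint_size;

        printf("%s\n", total_found == total_alloc ?
            "space accounting balances" : "leaked or double-counted space");
        return (0);
}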
@@ -4332,6 +4528,390 @@ verify_device_removal_feature_counts(spa_t *spa)
        return (ret);
 }
 
+#define        BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
+/*
+ * Import the checkpointed state of the pool specified by the target
+ * parameter as read-only. The function also accepts an optional pool
+ * config; if none is provided, it attempts to infer the config from
+ * the name of the target pool.
+ *
+ * Note that the checkpointed state's pool name will be the name of
+ * the original pool with the above suffix appened to it. In addition,
+ * if the target is not a pool name (e.g. a path to a dataset) then
+ * the new_path parameter is populated with the updated path to
+ * reflect the fact that we are looking into the checkpointed state.
+ *
+ * The function returns a newly-allocated copy of the name of the
+ * pool containing the checkpointed state. When this copy is no
+ * longer needed it should be freed with free(3C). Same thing
+ * applies to the new_path parameter if allocated.
+ */
+static char *
+import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
+{
+       int error = 0;
+       char *poolname, *bogus_name = NULL;
+
+       /* If the target is not a pool name, extract the pool name */
+       char *path_start = strchr(target, '/');
+       if (path_start != NULL) {
+               size_t poolname_len = path_start - target;
+               poolname = strndup(target, poolname_len);
+       } else {
+               poolname = target;
+       }
+
+       if (cfg == NULL) {
+               error = spa_get_stats(poolname, &cfg, NULL, 0);
+               if (error != 0) {
+                       fatal("Tried to read config of pool \"%s\" but "
+                           "spa_get_stats() failed with error %d\n",
+                           poolname, error);
+               }
+       }
+
+       if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1)
+               return (NULL);
+       fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
+
+       error = spa_import(bogus_name, cfg, NULL,
+           ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT);
+       if (error != 0) {
+               fatal("Tried to import pool \"%s\" but spa_import() failed "
+                   "with error %d\n", bogus_name, error);
+       }
+
+       if (new_path != NULL && path_start != NULL) {
+               if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
+                       if (path_start != NULL)
+                               free(poolname);
+                       return (NULL);
+               }
+       }
+
+       if (target != poolname)
+               free(poolname);
+
+       return (bogus_name);
+}
+
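Editor's note: a self-contained sketch of the name handling done by import_checkpointed_state() above: split an optional dataset path off the target, append BOGUS_SUFFIX to the pool name, and rebuild the path against the renamed pool. It uses only libc (asprintf() needs _GNU_SOURCE on glibc); the target string is a made-up example and error handling is abbreviated:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"

int
main(void)
{
        const char *target = "tank/fs@snap";   /* hypothetical target */
        char *bogus_name = NULL, *new_path = NULL;

        /* If the target is not a bare pool name, extract the pool name. */
        const char *path_start = strchr(target, '/');
        char *poolname = path_start != NULL ?
            strndup(target, (size_t)(path_start - target)) : strdup(target);

        if (poolname == NULL ||
            asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1)
                return (1);

        if (path_start != NULL &&
            asprintf(&new_path, "%s%s", bogus_name, path_start) == -1)
                return (1);

        printf("import as: %s\n", bogus_name);
        printf("new path:  %s\n", new_path != NULL ? new_path : bogus_name);

        free(poolname);
        free(bogus_name);
        free(new_path);
        return (0);
}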
+typedef struct verify_checkpoint_sm_entry_cb_arg {
+       vdev_t *vcsec_vd;
+
+       /* the following fields are only used for printing progress */
+       uint64_t vcsec_entryid;
+       uint64_t vcsec_num_entries;
+} verify_checkpoint_sm_entry_cb_arg_t;
+
+#define        ENTRIES_PER_PROGRESS_UPDATE 10000
+
+static int
+verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
+    void *arg)
+{
+       verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
+       vdev_t *vd = vcsec->vcsec_vd;
+       metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+       uint64_t end = offset + size;
+
+       ASSERT(type == SM_FREE);
+
+       if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
+               (void) fprintf(stderr,
+                   "\rverifying vdev %llu, space map entry %llu of %llu ...",
+                   (longlong_t)vd->vdev_id,
+                   (longlong_t)vcsec->vcsec_entryid,
+                   (longlong_t)vcsec->vcsec_num_entries);
+       }
+       vcsec->vcsec_entryid++;
+
+       /*
+        * See comment in checkpoint_sm_exclude_entry_cb()
+        */
+       VERIFY3U(offset, >=, ms->ms_start);
+       VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+       /*
+        * The entries in the vdev_checkpoint_sm should be marked as
+        * allocated in the checkpointed state of the pool, therefore
+        * their respective ms_allocatable trees should not contain them.
+        */
+       mutex_enter(&ms->ms_lock);
+       range_tree_verify(ms->ms_allocatable, offset, size);
+       mutex_exit(&ms->ms_lock);
+
+       return (0);
+}
+
+/*
+ * Verify that all segments in the vdev_checkpoint_sm are allocated
+ * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
+ * ms_allocatable).
+ *
+ * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
+ * each vdev in the current state of the pool to the metaslab space maps
+ * (ms_sm) of the checkpointed state of the pool.
+ *
+ * Note that the function changes the state of the ms_allocatable
+ * trees of the current spa_t. The entries of these ms_allocatable
+ * trees are cleared out and then repopulated with the free
+ * entries of their respective ms_sm space maps.
+ */
+static void
+verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+       vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+       vdev_t *current_rvd = current->spa_root_vdev;
+
+       load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);
+
+       for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {
+               vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];
+               vdev_t *current_vd = current_rvd->vdev_child[c];
+
+               space_map_t *checkpoint_sm = NULL;
+               uint64_t checkpoint_sm_obj;
+
+               if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+                       /*
+                        * Since we don't allow device removal in a pool
+                        * that has a checkpoint, we expect that all removed
+                        * vdevs were removed from the pool before the
+                        * checkpoint.
+                        */
+                       ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+                       continue;
+               }
+
+               /*
+                * If the checkpoint space map doesn't exist, then nothing
+                * here is checkpointed so there's nothing to verify.
+                */
+               if (current_vd->vdev_top_zap == 0 ||
+                   zap_contains(spa_meta_objset(current),
+                   current_vd->vdev_top_zap,
+                   VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+                       continue;
+
+               VERIFY0(zap_lookup(spa_meta_objset(current),
+                   current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+                   sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+               VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
+                   checkpoint_sm_obj, 0, current_vd->vdev_asize,
+                   current_vd->vdev_ashift));
+               space_map_update(checkpoint_sm);
+
+               verify_checkpoint_sm_entry_cb_arg_t vcsec;
+               vcsec.vcsec_vd = ckpoint_vd;
+               vcsec.vcsec_entryid = 0;
+               vcsec.vcsec_num_entries =
+                   space_map_length(checkpoint_sm) / sizeof (uint64_t);
+               VERIFY0(space_map_iterate(checkpoint_sm,
+                   verify_checkpoint_sm_entry_cb, &vcsec));
+               dump_spacemap(current->spa_meta_objset, checkpoint_sm);
+               space_map_close(checkpoint_sm);
+       }
+
+       /*
+        * If we've added vdevs since we took the checkpoint, ensure
+        * that their checkpoint space maps are empty.
+        */
+       if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
+               for (uint64_t c = ckpoint_rvd->vdev_children;
+                   c < current_rvd->vdev_children; c++) {
+                       vdev_t *current_vd = current_rvd->vdev_child[c];
+                       ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL);
+               }
+       }
+
+       /* for cleaner progress output */
+       (void) fprintf(stderr, "\n");
+}
+
+/*
+ * Verifies that all space that's allocated in the checkpoint is
+ * still allocated in the current version, by checking that everything
+ * in checkpoint's ms_allocatable (which is actually allocated, not
+ * allocatable/free) is not present in current's ms_allocatable.
+ *
+ * Note that the function changes the state of the ms_allocatable
+ * trees of both spas when called. The entries of all ms_allocatable
+ * trees are cleared out and then repopulated from their respective
+ * ms_sm space maps. In the checkpointed state we load the allocated
+ * entries, and in the current state we load the free entries.
+ */
+static void
+verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+       vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+       vdev_t *current_rvd = current->spa_root_vdev;
+
+       load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
+       load_concrete_ms_allocatable_trees(current, SM_FREE);
+
+       for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
+               vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
+               vdev_t *current_vd = current_rvd->vdev_child[i];
+
+               if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+                       /*
+                        * See comment in verify_checkpoint_vdev_spacemaps()
+                        */
+                       ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+                       continue;
+               }
+
+               for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
+                       metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
+                       metaslab_t *current_msp = current_vd->vdev_ms[m];
+
+                       (void) fprintf(stderr,
+                           "\rverifying vdev %llu of %llu, "
+                           "metaslab %llu of %llu ...",
+                           (longlong_t)current_vd->vdev_id,
+                           (longlong_t)current_rvd->vdev_children,
+                           (longlong_t)current_vd->vdev_ms[m]->ms_id,
+                           (longlong_t)current_vd->vdev_ms_count);
+
+                       /*
+                        * We walk through the ms_allocatable trees that
+                        * are loaded with the allocated blocks from the
+                        * ms_sm spacemaps of the checkpoint. For each
+                        * one of these ranges we ensure that none of them
+                        * exists in the ms_allocatable trees of the
+                        * current state which are loaded with the ranges
+                        * that are currently free.
+                        *
+                        * This way we ensure that none of the blocks that
+                        * are part of the checkpoint were freed by mistake.
+                        */
+                       range_tree_walk(ckpoint_msp->ms_allocatable,
+                           (range_tree_func_t *)range_tree_verify,
+                           current_msp->ms_allocatable);
+               }
+       }
+
+       /* for cleaner progress output */
+       (void) fprintf(stderr, "\n");
+}
+
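Editor's note: the walk above boils down to an interval check: nothing the checkpoint considers allocated may intersect anything the current pool considers free, otherwise checkpointed data was freed by mistake. A simplified, hypothetical version with hard-coded segments:

#include <stdint.h>
#include <stdio.h>

typedef struct seg {
        uint64_t start;
        uint64_t end;           /* exclusive */
} seg_t;

static int
overlaps(const seg_t *a, const seg_t *b)
{
        return (a->start < b->end && b->start < a->end);
}

int
main(void)
{
        seg_t ckpoint_allocated[] = { { 0, 512 }, { 2048, 4096 } };
        seg_t current_free[] = { { 512, 1024 }, { 8192, 16384 } };
        int leaks = 0;

        for (size_t i = 0; i < sizeof (ckpoint_allocated) / sizeof (seg_t); i++) {
                for (size_t j = 0; j < sizeof (current_free) / sizeof (seg_t); j++) {
                        if (overlaps(&ckpoint_allocated[i], &current_free[j]))
                                leaks++;
                }
        }
        printf("%d checkpointed segments freed by mistake\n", leaks);
        return (0);
}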
+static void
+verify_checkpoint_blocks(spa_t *spa)
+{
+       spa_t *checkpoint_spa;
+       char *checkpoint_pool;
+       nvlist_t *config = NULL;
+       int error = 0;
+
+       /*
+        * We import the checkpointed state of the pool (under a different
+        * name) so we can do verification on it against the current state
+        * of the pool.
+        */
+       checkpoint_pool = import_checkpointed_state(spa->spa_name, config,
+           NULL);
+       ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
+
+       error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
+       if (error != 0) {
+               fatal("Tried to open pool \"%s\" but spa_open() failed with "
+                   "error %d\n", checkpoint_pool, error);
+       }
+
+       /*
+        * Ensure that ranges in the checkpoint space maps of each vdev
+        * are allocated according to the checkpointed state's metaslab
+        * space maps.
+        */
+       verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
+
+       /*
+        * Ensure that allocated ranges in the checkpoint's metaslab
+        * space maps remain allocated in the metaslab space maps of
+        * the current state.
+        */
+       verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);
+
+       /*
+        * Once we are done, we get rid of the checkpointed state.
+        */
+       spa_close(checkpoint_spa, FTAG);
+       free(checkpoint_pool);
+}
+
+static void
+dump_leftover_checkpoint_blocks(spa_t *spa)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+
+       for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+               vdev_t *vd = rvd->vdev_child[i];
+
+               space_map_t *checkpoint_sm = NULL;
+               uint64_t checkpoint_sm_obj;
+
+               if (vd->vdev_top_zap == 0)
+                       continue;
+
+               if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
+                   VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+                       continue;
+
+               VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
+                   VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+                   sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+               VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
+                   checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
+               space_map_update(checkpoint_sm);
+               dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
+               space_map_close(checkpoint_sm);
+       }
+}
+
+static int
+verify_checkpoint(spa_t *spa)
+{
+       uberblock_t checkpoint;
+       int error;
+
+       if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+               return (0);
+
+       error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+           sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+       if (error == ENOENT) {
+               /*
+                * If the feature is active but the uberblock is missing
+                * then we must be in the middle of discarding the
+                * checkpoint.
+                */
+               (void) printf("\nPartially discarded checkpoint "
+                   "state found:\n");
+               dump_leftover_checkpoint_blocks(spa);
+               return (0);
+       } else if (error != 0) {
+               (void) printf("lookup error %d when looking for "
+                   "checkpointed uberblock in MOS\n", error);
+               return (error);
+       }
+       dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");
+
+       if (checkpoint.ub_checkpoint_txg == 0) {
+               (void) printf("\nub_checkpoint_txg not set in checkpointed "
+                   "uberblock\n");
+               error = 3;
+       }
+
+       if (error == 0)
+               verify_checkpoint_blocks(spa);
+
+       return (error);
+}
+
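Editor's note: the checkpointed uberblock is stored in the MOS ZAP as an array of sizeof (uberblock_t) / sizeof (uint64_t) 64-bit words, which is why verify_checkpoint() passes that element count to zap_lookup(). A small, hypothetical illustration of reading such a word array back into a struct (toy_uberblock_t stands in for the real uberblock_t):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct toy_uberblock {
        uint64_t ub_txg;
        uint64_t ub_timestamp;
        uint64_t ub_checkpoint_txg;
} toy_uberblock_t;

int
main(void)
{
        /* Pretend these words came back from a zap_lookup()-style call. */
        uint64_t words[sizeof (toy_uberblock_t) / sizeof (uint64_t)] =
            { 1234, 1526000000, 1200 };
        toy_uberblock_t ub;

        memcpy(&ub, words, sizeof (ub));

        if (ub.ub_checkpoint_txg == 0)
                printf("no checkpoint txg recorded\n");
        else
                printf("checkpoint txg = %llu\n",
                    (unsigned long long)ub.ub_checkpoint_txg);
        return (0);
}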
 static void
 dump_zpool(spa_t *spa)
 {
@@ -4435,6 +5015,9 @@ dump_zpool(spa_t *spa)
        if (dump_opt['h'])
                dump_history(spa);
 
+       if (rc == 0 && !dump_opt['L'])
+               rc = verify_checkpoint(spa);
+
        if (rc != 0) {
                dump_debug_buffer();
                exit(rc);
@@ -4879,6 +5462,7 @@ main(int argc, char **argv)
        int rewind = ZPOOL_NEVER_REWIND;
        char *spa_config_path_env;
        boolean_t target_is_spa = B_TRUE;
+       nvlist_t *cfg = NULL;
 
        (void) setrlimit(RLIMIT_NOFILE, &rl);
        (void) enable_extended_FILE_stdio(-1, -1);
@@ -4895,7 +5479,7 @@ main(int argc, char **argv)
                spa_config_path = spa_config_path_env;
 
        while ((c = getopt(argc, argv,
-           "AbcCdDeEFGhiI:lLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
+           "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
                switch (c) {
                case 'b':
                case 'c':
@@ -4920,6 +5504,7 @@ main(int argc, char **argv)
                case 'A':
                case 'e':
                case 'F':
+               case 'k':
                case 'L':
                case 'P':
                case 'q':
@@ -5029,7 +5614,7 @@ main(int argc, char **argv)
                verbose = MAX(verbose, 1);
 
        for (c = 0; c < 256; c++) {
-               if (dump_all && strchr("AeEFlLOPRSX", c) == NULL)
+               if (dump_all && strchr("AeEFklLOPRSX", c) == NULL)
                        dump_opt[c] = 1;
                if (dump_opt[c])
                        dump_opt[c] += verbose;
@@ -5081,6 +5666,17 @@ main(int argc, char **argv)
        error = 0;
        target = argv[0];
 
+       char *checkpoint_pool = NULL;
+       char *checkpoint_target = NULL;
+       if (dump_opt['k']) {
+               checkpoint_pool = import_checkpointed_state(target, cfg,
+                   &checkpoint_target);
+
+               if (checkpoint_target != NULL)
+                       target = checkpoint_target;
+
+       }
+
        if (strpbrk(target, "/@") != NULL) {
                size_t targetlen;
 
@@ -5097,7 +5693,6 @@ main(int argc, char **argv)
 
        if (dump_opt['e']) {
                importargs_t args = { 0 };
-               nvlist_t *cfg = NULL;
 
                args.paths = nsearch;
                args.path = searchdirs;
@@ -5121,6 +5716,7 @@ main(int argc, char **argv)
                                (void) printf("\nConfiguration for import:\n");
                                dump_nvlist(cfg, 8);
                        }
+
                        error = spa_import(target_pool, cfg, NULL,
                            flags | ZFS_IMPORT_SKIP_MMP);
                }
@@ -5130,7 +5726,18 @@ main(int argc, char **argv)
                free(target_pool);
 
        if (error == 0) {
-               if (target_is_spa || dump_opt['R']) {
+               if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
+                       ASSERT(checkpoint_pool != NULL);
+                       ASSERT(checkpoint_target == NULL);
+
+                       error = spa_open(checkpoint_pool, &spa, FTAG);
+                       if (error != 0) {
+                               fatal("Tried to open pool \"%s\" but "
+                                   "spa_open() failed with error %d\n",
+                                   checkpoint_pool, error);
+                       }
+
+               } else if (target_is_spa || dump_opt['R']) {
                        /*
                         * Disable the activity check to allow examination of
                         * active pools.
@@ -5216,6 +5823,12 @@ main(int argc, char **argv)
                        zdb_read_block(argv[i], spa);
        }
 
+       if (dump_opt['k']) {
+               free(checkpoint_pool);
+               if (!target_is_spa)
+                       free(checkpoint_target);
+       }
+
        if (os != NULL)
                close_objset(os, FTAG);
        else