]> git.proxmox.com Git - mirror_zfs.git/commitdiff
Extend zdb to print inconsistencies in livelists and metaslabs
authorMatthew Ahrens <matthew.ahrens@delphix.com>
Wed, 15 Jul 2020 00:51:05 +0000 (17:51 -0700)
committerGitHub <noreply@github.com>
Wed, 15 Jul 2020 00:51:05 +0000 (17:51 -0700)
Livelists and spacemaps are data structures that are logs of allocations
and frees.  Livelists entries are block pointers (blkptr_t). Spacemaps
entries are ranges of numbers, most often used to track
allocated/freed regions of metaslabs/vdevs.

These data structures can become self-inconsistent, for example if a
block or range can be "double allocated" (two allocation records without
an intervening free) or "double freed" (two free records without an
intervening allocation).

ZDB (as well as zfs running in the kernel) can detect these
inconsistencies when loading livelists and metaslabs.  However, it
generally halts processing when the error is detected.

When analyzing an on-disk problem, we often want to know the entire set
of inconsistencies, which is not possible with the current behavior.
This commit adds a new flag, `zdb -y`, which analyzes the livelist and
metaslab data structures and displays all of their inconsistencies.
Note that this is different from the leak detection performed by
`zdb -b`, which checks for inconsistencies between the spacemaps and the
tree of block pointers, but assumes the spacemaps are self-consistent.

The specific checks added are:

Verify livelists by iterating through each sublivelist and:
- report leftover FREEs
- report double ALLOCs and double FREEs
- record leftover ALLOCs together with their TXG [see Cross Check]

Verify spacemaps by iterating over each metaslab and:
- iterate over spacemap and then the metaslab's entries in the
  spacemap log, then report any double FREEs and double ALLOCs

Verify that livelists are consistent with spacemaps.  The space
referenced by livelists (after using the FREEs to cancel out
corresponding ALLOCs) should be allocated, according to the spacemaps.

Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-66031
Closes #10515

cmd/zdb/zdb.c
cmd/ztest/ztest.c
include/sys/metaslab.h
include/sys/space_map.h
man/man8/zdb.8
module/zfs/metaslab.c
module/zfs/space_map.c
tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh
tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_pos.ksh

index a329e4a8337e0c0cbb2e5abdf89662899ce21905..59b17132fdac14cb3766ba0fde8a66a4120eef11 100644 (file)
@@ -69,6 +69,7 @@
 #include <sys/blkptr.h>
 #include <sys/dsl_crypt.h>
 #include <sys/dsl_scan.h>
+#include <sys/btree.h>
 #include <zfs_comutil.h>
 
 #include <libnvpair.h>
@@ -151,6 +152,571 @@ static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
     boolean_t);
 static void mos_obj_refd(uint64_t);
 static void mos_obj_refd_multiple(uint64_t);
+static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
+    dmu_tx_t *tx);
+
+typedef struct sublivelist_verify {
+       /* all ALLOC'd blkptr_t in one sub-livelist */
+       zfs_btree_t sv_all_allocs;
+
+       /* all FREE'd blkptr_t in one sub-livelist */
+       zfs_btree_t sv_all_frees;
+
+       /* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */
+       zfs_btree_t sv_pair;
+
+       /* ALLOC's without a matching FREE, accumulates across sub-livelists */
+       zfs_btree_t sv_leftover;
+} sublivelist_verify_t;
+
+static int
+livelist_compare(const void *larg, const void *rarg)
+{
+       const blkptr_t *l = larg;
+       const blkptr_t *r = rarg;
+
+       /* Sort them according to dva[0] */
+       uint64_t l_dva0_vdev, r_dva0_vdev;
+       l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
+       r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
+       if (l_dva0_vdev < r_dva0_vdev)
+               return (-1);
+       else if (l_dva0_vdev > r_dva0_vdev)
+               return (+1);
+
+       /* if vdevs are equal, sort by offsets. */
+       uint64_t l_dva0_offset;
+       uint64_t r_dva0_offset;
+       l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
+       r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
+       if (l_dva0_offset < r_dva0_offset) {
+               return (-1);
+       } else if (l_dva0_offset > r_dva0_offset) {
+               return (+1);
+       }
+
+       /*
+        * Since we're storing blkptrs without cancelling FREE/ALLOC pairs,
+        * it's possible the offsets are equal. In that case, sort by txg
+        */
+       if (l->blk_birth < r->blk_birth) {
+               return (-1);
+       } else if (l->blk_birth > r->blk_birth) {
+               return (+1);
+       }
+       return (0);
+}
+
+typedef struct sublivelist_verify_block {
+       dva_t svb_dva;
+
+       /*
+        * We need this to check if the block marked as allocated
+        * in the livelist was freed (and potentially reallocated)
+        * in the metaslab spacemaps at a later TXG.
+        */
+       uint64_t svb_allocated_txg;
+} sublivelist_verify_block_t;
+
+static void zdb_print_blkptr(const blkptr_t *bp, int flags);
+
+static int
+sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
+    dmu_tx_t *tx)
+{
+       ASSERT3P(tx, ==, NULL);
+       struct sublivelist_verify *sv = arg;
+       char blkbuf[BP_SPRINTF_LEN];
+       zfs_btree_index_t where;
+       if (free) {
+               zfs_btree_add(&sv->sv_pair, bp);
+               /* Check if the FREE is a duplicate */
+               if (zfs_btree_find(&sv->sv_all_frees, bp, &where) != NULL) {
+                       snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
+                           free);
+                       (void) printf("\tERROR: Duplicate FREE: %s\n", blkbuf);
+               } else {
+                       zfs_btree_add_idx(&sv->sv_all_frees, bp, &where);
+               }
+       } else {
+               /* Check if the ALLOC has been freed */
+               if (zfs_btree_find(&sv->sv_pair, bp, &where) != NULL) {
+                       zfs_btree_remove_idx(&sv->sv_pair, &where);
+               } else {
+                       for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+                               if (DVA_IS_EMPTY(&bp->blk_dva[i]))
+                                       break;
+                               sublivelist_verify_block_t svb = {
+                                   .svb_dva = bp->blk_dva[i],
+                                   .svb_allocated_txg = bp->blk_birth
+                               };
+
+                               if (zfs_btree_find(&sv->sv_leftover, &svb,
+                                   &where) == NULL) {
+                                       zfs_btree_add_idx(&sv->sv_leftover,
+                                           &svb, &where);
+                               }
+                       }
+               }
+               /* Check if the ALLOC is a duplicate */
+               if (zfs_btree_find(&sv->sv_all_allocs, bp, &where) != NULL) {
+                       snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
+                           free);
+                       (void) printf("\tERROR: Duplicate ALLOC: %s\n", blkbuf);
+               } else {
+                       zfs_btree_add_idx(&sv->sv_all_allocs, bp, &where);
+               }
+       }
+       return (0);
+}
+
+static int
+sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
+{
+       int err;
+       char blkbuf[BP_SPRINTF_LEN];
+       struct sublivelist_verify *sv = args;
+
+       zfs_btree_create(&sv->sv_all_allocs, livelist_compare,
+           sizeof (blkptr_t));
+
+       zfs_btree_create(&sv->sv_all_frees, livelist_compare,
+           sizeof (blkptr_t));
+
+       zfs_btree_create(&sv->sv_pair, livelist_compare,
+           sizeof (blkptr_t));
+
+       err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
+           sv, NULL);
+
+       zfs_btree_clear(&sv->sv_all_allocs);
+       zfs_btree_destroy(&sv->sv_all_allocs);
+
+       zfs_btree_clear(&sv->sv_all_frees);
+       zfs_btree_destroy(&sv->sv_all_frees);
+
+       blkptr_t *e;
+       zfs_btree_index_t *cookie = NULL;
+       while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
+               snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), e, B_TRUE);
+               (void) printf("\tERROR: Unmatched FREE: %s\n", blkbuf);
+       }
+       zfs_btree_destroy(&sv->sv_pair);
+
+       return (err);
+}
+
+static int
+livelist_block_compare(const void *larg, const void *rarg)
+{
+       const sublivelist_verify_block_t *l = larg;
+       const sublivelist_verify_block_t *r = rarg;
+
+       if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva))
+               return (-1);
+       else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva))
+               return (+1);
+
+       if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva))
+               return (-1);
+       else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva))
+               return (+1);
+
+       if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva))
+               return (-1);
+       else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva))
+               return (+1);
+
+       return (0);
+}
+
+/*
+ * Check for errors in a livelist while tracking all unfreed ALLOCs in the
+ * sublivelist_verify_t: sv->sv_leftover
+ */
+static void
+livelist_verify(dsl_deadlist_t *dl, void *arg)
+{
+       sublivelist_verify_t *sv = arg;
+       dsl_deadlist_iterate(dl, sublivelist_verify_func, sv);
+}
+
+/*
+ * Check for errors in the livelist entry and discard the intermediary
+ * data structures
+ */
+/* ARGSUSED */
+static int
+sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)
+{
+       sublivelist_verify_t sv;
+       zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
+           sizeof (sublivelist_verify_block_t));
+       int err = sublivelist_verify_func(&sv, dle);
+       zfs_btree_clear(&sv.sv_leftover);
+       zfs_btree_destroy(&sv.sv_leftover);
+       return (err);
+}
+
+typedef struct metaslab_verify {
+       /*
+        * Tree containing all the leftover ALLOCs from the livelists
+        * that are part of this metaslab.
+        */
+       zfs_btree_t mv_livelist_allocs;
+
+       /*
+        * Metaslab information.
+        */
+       uint64_t mv_vdid;
+       uint64_t mv_msid;
+       uint64_t mv_start;
+       uint64_t mv_end;
+
+       /*
+        * What's currently allocated for this metaslab.
+        */
+       range_tree_t *mv_allocated;
+} metaslab_verify_t;
+
+typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);
+
+typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg,
+    void *arg);
+
+typedef struct unflushed_iter_cb_arg {
+       spa_t *uic_spa;
+       uint64_t uic_txg;
+       void *uic_arg;
+       zdb_log_sm_cb_t uic_cb;
+} unflushed_iter_cb_arg_t;
+
+static int
+iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)
+{
+       unflushed_iter_cb_arg_t *uic = arg;
+       return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg));
+}
+
+static void
+iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg)
+{
+       if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+               return;
+
+       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+       for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
+           sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+               space_map_t *sm = NULL;
+               VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
+                   sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
+
+               unflushed_iter_cb_arg_t uic = {
+                       .uic_spa = spa,
+                       .uic_txg = sls->sls_txg,
+                       .uic_arg = arg,
+                       .uic_cb = cb
+               };
+               VERIFY0(space_map_iterate(sm, space_map_length(sm),
+                   iterate_through_spacemap_logs_cb, &uic));
+               space_map_close(sm);
+       }
+       spa_config_exit(spa, SCL_CONFIG, FTAG);
+}
+
+static void
+verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg,
+    uint64_t offset, uint64_t size)
+{
+       sublivelist_verify_block_t svb;
+       DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid);
+       DVA_SET_OFFSET(&svb.svb_dva, offset);
+       DVA_SET_ASIZE(&svb.svb_dva, size);
+       zfs_btree_index_t where;
+       uint64_t end_offset = offset + size;
+
+       /*
+        *  Look for an exact match for spacemap entry in the livelist entries.
+        *  Then, look for other livelist entries that fall within the range
+        *  of the spacemap entry as it may have been condensed
+        */
+       sublivelist_verify_block_t *found =
+           zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where);
+       if (found == NULL) {
+               found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where);
+       }
+       for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid &&
+           DVA_GET_OFFSET(&found->svb_dva) < end_offset;
+           found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
+               if (found->svb_allocated_txg <= txg) {
+                       (void) printf("ERROR: Livelist ALLOC [%llx:%llx] "
+                           "from TXG %llx FREED at TXG %llx\n",
+                           (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva),
+                           (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva),
+                           (u_longlong_t)found->svb_allocated_txg,
+                           (u_longlong_t)txg);
+               }
+       }
+}
+
+static int
+metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)
+{
+       metaslab_verify_t *mv = arg;
+       uint64_t offset = sme->sme_offset;
+       uint64_t size = sme->sme_run;
+       uint64_t txg = sme->sme_txg;
+
+       if (sme->sme_type == SM_ALLOC) {
+               if (range_tree_contains(mv->mv_allocated,
+                   offset, size)) {
+                       (void) printf("ERROR: DOUBLE ALLOC: "
+                           "%llu [%llx:%llx] "
+                           "%llu:%llu LOG_SM\n",
+                           (u_longlong_t)txg, (u_longlong_t)offset,
+                           (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
+                           (u_longlong_t)mv->mv_msid);
+               } else {
+                       range_tree_add(mv->mv_allocated,
+                           offset, size);
+               }
+       } else {
+               if (!range_tree_contains(mv->mv_allocated,
+                   offset, size)) {
+                       (void) printf("ERROR: DOUBLE FREE: "
+                           "%llu [%llx:%llx] "
+                           "%llu:%llu LOG_SM\n",
+                           (u_longlong_t)txg, (u_longlong_t)offset,
+                           (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
+                           (u_longlong_t)mv->mv_msid);
+               } else {
+                       range_tree_remove(mv->mv_allocated,
+                           offset, size);
+               }
+       }
+
+       if (sme->sme_type != SM_ALLOC) {
+               /*
+                * If something is freed in the spacemap, verify that
+                * it is not listed as allocated in the livelist.
+                */
+               verify_livelist_allocs(mv, txg, offset, size);
+       }
+       return (0);
+}
+
+static int
+spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme,
+    uint64_t txg, void *arg)
+{
+       metaslab_verify_t *mv = arg;
+       uint64_t offset = sme->sme_offset;
+       uint64_t vdev_id = sme->sme_vdev;
+
+       vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+
+       /* skip indirect vdevs */
+       if (!vdev_is_concrete(vd))
+               return (0);
+
+       if (vdev_id != mv->mv_vdid)
+               return (0);
+
+       metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+       if (ms->ms_id != mv->mv_msid)
+               return (0);
+
+       if (txg < metaslab_unflushed_txg(ms))
+               return (0);
+
+
+       ASSERT3U(txg, ==, sme->sme_txg);
+       return (metaslab_spacemap_validation_cb(sme, mv));
+}
+
+static void
+spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv)
+{
+       iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv);
+}
+
+static void
+spacemap_check_ms_sm(space_map_t  *sm, metaslab_verify_t *mv)
+{
+       if (sm == NULL)
+               return;
+
+       VERIFY0(space_map_iterate(sm, space_map_length(sm),
+           metaslab_spacemap_validation_cb, mv));
+}
+
+static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg);
+
+/*
+ * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if
+ * they are part of that metaslab (mv_msid).
+ */
+static void
+mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
+{
+       zfs_btree_index_t where;
+       sublivelist_verify_block_t *svb;
+       ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0);
+       for (svb = zfs_btree_first(&sv->sv_leftover, &where);
+           svb != NULL;
+           svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) {
+               if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid)
+                       continue;
+
+               if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start &&
+                   (DVA_GET_OFFSET(&svb->svb_dva) +
+                   DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) {
+                       (void) printf("ERROR: Found block that crosses "
+                           "metaslab boundary: <%llu:%llx:%llx>\n",
+                           (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
+                           (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
+                           (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
+                       continue;
+               }
+
+               if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start)
+                       continue;
+
+               if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end)
+                       continue;
+
+               if ((DVA_GET_OFFSET(&svb->svb_dva) +
+                   DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) {
+                       (void) printf("ERROR: Found block that crosses "
+                           "metaslab boundary: <%llu:%llx:%llx>\n",
+                           (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
+                           (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
+                           (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
+                       continue;
+               }
+
+               zfs_btree_add(&mv->mv_livelist_allocs, svb);
+       }
+
+       for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where);
+           svb != NULL;
+           svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
+               zfs_btree_remove(&sv->sv_leftover, svb);
+       }
+}
+
+/*
+ * [Livelist Check]
+ * Iterate through all the sublivelists and:
+ * - report leftover frees
+ * - report double ALLOCs/FREEs
+ * - record leftover ALLOCs together with their TXG [see Cross Check]
+ *
+ * [Spacemap Check]
+ * for each metaslab:
+ * - iterate over spacemap and then the metaslab's entries in the
+ *   spacemap log, then report any double FREEs and ALLOCs (do not
+ *   blow up).
+ *
+ * [Cross Check]
+ * After finishing the Livelist Check phase and while being in the
+ * Spacemap Check phase, we find all the recorded leftover ALLOCs
+ * of the livelist check that are part of the metaslab that we are
+ * currently looking at in the Spacemap Check. We report any entries
+ * that are marked as ALLOCs in the livelists but have been actually
+ * freed (and potentially allocated again) after their TXG stamp in
+ * the spacemaps. Also report any ALLOCs from the livelists that
+ * belong to indirect vdevs (e.g. their vdev completed removal).
+ *
+ * Note that this will miss Log Spacemap entries that cancelled each other
+ * out before being flushed to the metaslab, so we are not guaranteed
+ * to match all erroneous ALLOCs.
+ */
+static void
+livelist_metaslab_validate(spa_t *spa)
+{
+       (void) printf("Verifying deleted livelist entries\n");
+
+       sublivelist_verify_t sv;
+       zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
+           sizeof (sublivelist_verify_block_t));
+       iterate_deleted_livelists(spa, livelist_verify, &sv);
+
+       (void) printf("Verifying metaslab entries\n");
+       vdev_t *rvd = spa->spa_root_vdev;
+       for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *vd = rvd->vdev_child[c];
+
+               if (!vdev_is_concrete(vd))
+                       continue;
+
+               for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) {
+                       metaslab_t *m = vd->vdev_ms[mid];
+
+                       (void) fprintf(stderr,
+                           "\rverifying concrete vdev %llu, "
+                           "metaslab %llu of %llu ...",
+                           (longlong_t)vd->vdev_id,
+                           (longlong_t)mid,
+                           (longlong_t)vd->vdev_ms_count);
+
+                       uint64_t shift, start;
+                       range_seg_type_t type =
+                           metaslab_calculate_range_tree_type(vd, m,
+                           &start, &shift);
+                       metaslab_verify_t mv;
+                       mv.mv_allocated = range_tree_create(NULL,
+                           type, NULL, start, shift);
+                       mv.mv_vdid = vd->vdev_id;
+                       mv.mv_msid = m->ms_id;
+                       mv.mv_start = m->ms_start;
+                       mv.mv_end = m->ms_start + m->ms_size;
+                       zfs_btree_create(&mv.mv_livelist_allocs,
+                           livelist_block_compare,
+                           sizeof (sublivelist_verify_block_t));
+
+                       mv_populate_livelist_allocs(&mv, &sv);
+
+                       spacemap_check_ms_sm(m->ms_sm, &mv);
+                       spacemap_check_sm_log(spa, &mv);
+
+                       range_tree_vacate(mv.mv_allocated, NULL, NULL);
+                       range_tree_destroy(mv.mv_allocated);
+                       zfs_btree_clear(&mv.mv_livelist_allocs);
+                       zfs_btree_destroy(&mv.mv_livelist_allocs);
+               }
+       }
+       (void) fprintf(stderr, "\n");
+
+       /*
+        * If there are any segments in the leftover tree after we walked
+        * through all the metaslabs in the concrete vdevs then this means
+        * that we have segments in the livelists that belong to indirect
+        * vdevs and are marked as allocated.
+        */
+       if (zfs_btree_numnodes(&sv.sv_leftover) == 0) {
+               zfs_btree_destroy(&sv.sv_leftover);
+               return;
+       }
+       (void) printf("ERROR: Found livelist blocks marked as allocated "
+           "for indirect vdevs:\n");
+
+       zfs_btree_index_t *where = NULL;
+       sublivelist_verify_block_t *svb;
+       while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) !=
+           NULL) {
+               int vdev_id = DVA_GET_VDEV(&svb->svb_dva);
+               ASSERT3U(vdev_id, <, rvd->vdev_children);
+               vdev_t *vd = rvd->vdev_child[vdev_id];
+               ASSERT(!vdev_is_concrete(vd));
+               (void) printf("<%d:%llx:%llx> TXG %llx\n",
+                   vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
+                   (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva),
+                   (u_longlong_t)svb->svb_allocated_txg);
+       }
+       (void) printf("\n");
+       zfs_btree_destroy(&sv.sv_leftover);
+}
 
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
@@ -172,7 +738,7 @@ static void
 usage(void)
 {
        (void) fprintf(stderr,
-           "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p <path> ...]] "
+           "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] "
            "[-I <inflight I/Os>]\n"
            "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
            "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
@@ -234,7 +800,9 @@ usage(void)
        (void) fprintf(stderr, "        -s report stats on zdb's I/O\n");
        (void) fprintf(stderr, "        -S simulate dedup to measure effect\n");
        (void) fprintf(stderr, "        -v verbose (applies to all "
-           "others)\n\n");
+           "others)\n");
+       (void) fprintf(stderr, "        -y perform livelist and metaslab "
+           "validation on any livelists being deleted\n\n");
        (void) fprintf(stderr, "    Below options are intended for use "
            "with other options:\n");
        (void) fprintf(stderr, "        -A ignore assertions (-A), enable "
@@ -926,11 +1494,20 @@ dump_spacemap(objset_t *os, space_map_t *sm)
                    sizeof (word), &word, DMU_READ_PREFETCH));
 
                if (sm_entry_is_debug(word)) {
-                       (void) printf("\t    [%6llu] %s: txg %llu pass %llu\n",
-                           (u_longlong_t)entry_id,
-                           ddata[SM_DEBUG_ACTION_DECODE(word)],
-                           (u_longlong_t)SM_DEBUG_TXG_DECODE(word),
-                           (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
+                       uint64_t de_txg = SM_DEBUG_TXG_DECODE(word);
+                       uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word);
+                       if (de_txg == 0) {
+                               (void) printf(
+                                   "\t    [%6llu] PADDING\n",
+                                   (u_longlong_t)entry_id);
+                       } else {
+                               (void) printf(
+                                   "\t    [%6llu] %s: txg %llu pass %llu\n",
+                                   (u_longlong_t)entry_id,
+                                   ddata[SM_DEBUG_ACTION_DECODE(word)],
+                                   (u_longlong_t)de_txg,
+                                   (u_longlong_t)de_sync_pass);
+                       }
                        entry_id++;
                        continue;
                }
@@ -2214,6 +2791,11 @@ verify_dd_livelist(objset_t *os)
        ASSERT(!dmu_objset_is_snapshot(os));
        if (!dsl_deadlist_is_open(&dd->dd_livelist))
                return (0);
+
+       /* Iterate through the livelist to check for duplicates */
+       dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight,
+           NULL);
+
        dsl_pool_config_enter(dp, FTAG);
        dsl_deadlist_space(&dd->dd_livelist, &ll_used,
            &ll_comp, &ll_uncomp);
@@ -4652,50 +5234,6 @@ static metaslab_ops_t zdb_metaslab_ops = {
        NULL    /* alloc */
 };
 
-typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme,
-    uint64_t txg, void *arg);
-
-typedef struct unflushed_iter_cb_arg {
-       spa_t *uic_spa;
-       uint64_t uic_txg;
-       void *uic_arg;
-       zdb_log_sm_cb_t uic_cb;
-} unflushed_iter_cb_arg_t;
-
-static int
-iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)
-{
-       unflushed_iter_cb_arg_t *uic = arg;
-       return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg));
-}
-
-static void
-iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg)
-{
-       if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
-               return;
-
-       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-       for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
-           sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
-               space_map_t *sm = NULL;
-               VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
-                   sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
-
-               unflushed_iter_cb_arg_t uic = {
-                       .uic_spa = spa,
-                       .uic_txg = sls->sls_txg,
-                       .uic_arg = arg,
-                       .uic_cb = cb
-               };
-
-               VERIFY0(space_map_iterate(sm, space_map_length(sm),
-                   iterate_through_spacemap_logs_cb, &uic));
-               space_map_close(sm);
-       }
-       spa_config_exit(spa, SCL_CONFIG, FTAG);
-}
-
 /* ARGSUSED */
 static int
 load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme,
@@ -5443,8 +5981,6 @@ count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
  * Iterate over livelists which have been destroyed by the user but
  * are still present in the MOS, waiting to be freed
  */
-typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);
-
 static void
 iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg)
 {
@@ -5515,6 +6051,7 @@ dump_livelist_cb(dsl_deadlist_t *ll, void *arg)
        ASSERT3P(arg, ==, NULL);
        global_feature_count[SPA_FEATURE_LIVELIST]++;
        dump_blkptr_list(ll, "Deleted Livelist");
+       dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL);
 }
 
 /*
@@ -6780,6 +7317,10 @@ dump_zpool(spa_t *spa)
        dsl_pool_t *dp = spa_get_dsl(spa);
        int rc = 0;
 
+       if (dump_opt['y']) {
+               livelist_metaslab_validate(spa);
+       }
+
        if (dump_opt['S']) {
                dump_simulated_ddt(spa);
                return;
@@ -6925,7 +7466,7 @@ static int flagbits[256];
 static char flagbitstr[16];
 
 static void
-zdb_print_blkptr(blkptr_t *bp, int flags)
+zdb_print_blkptr(const blkptr_t *bp, int flags)
 {
        char blkbuf[BP_SPRINTF_LEN];
 
@@ -7537,7 +8078,7 @@ main(int argc, char **argv)
        zfs_btree_verify_intensity = 3;
 
        while ((c = getopt(argc, argv,
-           "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XY")) != -1) {
+           "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XYy")) != -1) {
                switch (c) {
                case 'b':
                case 'c':
@@ -7556,6 +8097,7 @@ main(int argc, char **argv)
                case 's':
                case 'S':
                case 'u':
+               case 'y':
                        dump_opt[c]++;
                        dump_all = 0;
                        break;
@@ -7698,7 +8240,7 @@ main(int argc, char **argv)
                verbose = MAX(verbose, 1);
 
        for (c = 0; c < 256; c++) {
-               if (dump_all && strchr("AeEFklLOPRSX", c) == NULL)
+               if (dump_all && strchr("AeEFklLOPRSXy", c) == NULL)
                        dump_opt[c] = 1;
                if (dump_opt[c])
                        dump_opt[c] += verbose;
index ca38271cc4acbb996e7d06575188d10a8d39b0a4..0a3653f7fffadd65cc36b917832dfa9291ef7a66 100644 (file)
@@ -6469,7 +6469,7 @@ ztest_run_zdb(char *pool)
        ztest_get_zdb_bin(bin, len);
 
        (void) sprintf(zdb,
-           "%s -bcc%s%s -G -d -Y -e -p %s %s",
+           "%s -bcc%s%s -G -d -Y -e -y -p %s %s",
            bin,
            ztest_opts.zo_verbose >= 3 ? "s" : "",
            ztest_opts.zo_verbose >= 4 ? "v" : "",
index f8d9c6a82e2b1005f8827e5b3c75f49f750fbbcc..b3b7f865536ed69a4c88bd422157ae0eb9bbd8c1 100644 (file)
@@ -137,6 +137,9 @@ void metaslab_set_selected_txg(metaslab_t *, uint64_t);
 
 extern int metaslab_debug_load;
 
+range_seg_type_t metaslab_calculate_range_tree_type(vdev_t *vdev,
+    metaslab_t *msp, uint64_t *start, uint64_t *shift);
+
 #ifdef __cplusplus
 }
 #endif
index 81f56076a5695f8a66cd7b7b0d53266226f0db2d..cb81e710bd1e8e0636d6936c39b816fae5444bab 100644 (file)
@@ -148,6 +148,15 @@ typedef struct space_map_entry {
        uint32_t sme_vdev;      /* max is 2^24-1; SM_NO_VDEVID if not present */
        uint64_t sme_offset;    /* max is 2^63-1; units of sm_shift */
        uint64_t sme_run;       /* max is 2^36; units of sm_shift */
+
+       /*
+        * The following fields are not part of the actual space map entry
+        * on-disk and they are populated with the values from the debug
+        * entry most recently visited starting from the beginning to the
+        * end of the space map.
+        */
+       uint64_t sme_txg;
+       uint64_t sme_sync_pass;
 } space_map_entry_t;
 
 #define        SM_NO_VDEVID    (1 << SPA_VDEVBITS)
index e8320c35b1985c02e133dd97455b3afb2d2e165d..56cb02dce71a7c9085229f2fe324d16a0cf091c0 100644 (file)
@@ -10,7 +10,7 @@
 .\"
 .\"
 .\" Copyright 2012, Richard Lowe.
-.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 .\" Copyright 2017 Nexenta Systems, Inc.
 .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC.
 .\" Copyright (c) 2017 Intel Corporation.
@@ -23,7 +23,7 @@
 .Nd display zpool debugging and consistency information
 .Sh SYNOPSIS
 .Nm
-.Op Fl AbcdDFGhikLMPsvXY
+.Op Fl AbcdDFGhikLMPsvXYy
 .Op Fl e Oo Fl V Oc Op Fl p Ar path ...
 .Op Fl I Ar inflight I/Os
 .Oo Fl o Ar var Ns = Ns Ar value Oc Ns ...
@@ -403,6 +403,12 @@ but read transactions otherwise deemed too old.
 Attempt all possible combinations when reconstructing indirect split blocks.
 This flag disables the individual I/O deadman timer in order to allow as
 much time as required for the attempted reconstruction.
+.It Fl y
+Perform validation for livelists that are being deleted.
+Scans through the livelists and metaslabs, checks each for duplicate
+entries, and cross-checks the two, reporting any potential double frees.
+If it encounters issues, warnings will be printed, but the command will not
+necessarily fail.
 .El
 .Pp
 Specifying a display option more than once enables verbosity for only that
index 762038cb395360a1196f7ae44e401e24614dff1c..a935f33cbd5cf296e2df9505e43e950b0eadb2d9 100644 (file)
@@ -2533,7 +2533,7 @@ metaslab_unload(metaslab_t *msp)
  * the vdev_ms_shift - the vdev_ashift is less than 32, we can store
  * the ranges using two uint32_ts, rather than two uint64_ts.
  */
-static range_seg_type_t
+range_seg_type_t
 metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp,
     uint64_t *start, uint64_t *shift)
 {
index eb2c36942543d88662206d8429f2a7456477a8a7..25da0e63c15fce6958e76972e34e7bd27e3afaff 100644 (file)
@@ -96,6 +96,7 @@ space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
            ZIO_PRIORITY_SYNC_READ);
 
        int error = 0;
+       uint64_t txg = 0, sync_pass = 0;
        for (uint64_t block_base = 0; block_base < end && error == 0;
            block_base += blksz) {
                dmu_buf_t *db;
@@ -117,8 +118,29 @@ space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
                    block_cursor < block_end && error == 0; block_cursor++) {
                        uint64_t e = *block_cursor;
 
-                       if (sm_entry_is_debug(e)) /* Skip debug entries */
+                       if (sm_entry_is_debug(e)) {
+                               /*
+                                * Debug entries are only needed to record the
+                                * current TXG and sync pass if available.
+                                *
+                                * Note though that sometimes there can be
+                                * debug entries that are used as padding
+                                * at the end of space map blocks in order
+                                * to avoid splitting a double-word entry in the
+                                * middle between two blocks. These entries
+                                * have their TXG field set to 0 and we
+                                * skip them without recording the TXG.
+                                * [see comment in space_map_write_seg()]
+                                */
+                               uint64_t e_txg = SM_DEBUG_TXG_DECODE(e);
+                               if (e_txg != 0) {
+                                       txg = e_txg;
+                                       sync_pass = SM_DEBUG_SYNCPASS_DECODE(e);
+                               } else {
+                                       ASSERT0(SM_DEBUG_SYNCPASS_DECODE(e));
+                               }
                                continue;
+                       }
 
                        uint64_t raw_offset, raw_run, vdev_id;
                        maptype_t type;
@@ -158,7 +180,9 @@ space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
                            .sme_type = type,
                            .sme_vdev = vdev_id,
                            .sme_offset = entry_offset,
-                           .sme_run = entry_run
+                           .sme_run = entry_run,
+                           .sme_txg = txg,
+                           .sme_sync_pass = sync_pass
                        };
                        error = callback(&sme, arg);
                }
index 508f20adbaafb4f9396eb7fcf56595c65d99ced6..1d344cf2acc3ce40019365f9fa14671c2cb20e94 100755 (executable)
@@ -57,7 +57,7 @@ set -A args "create" "add" "destroy" "import fakepool" \
     "add raidz1 fakepool" "add raidz2 fakepool" \
     "setvprop" "blah blah" "-%" "--?" "-*" "-=" \
     "-a" "-f" "-g" "-j" "-n" "-o" "-p" "-p /tmp" "-r" \
-    "-t" "-w" "-y" "-z" "-E" "-H" "-I" "-J" "-K" \
+    "-t" "-w" "-z" "-E" "-H" "-I" "-J" "-K" \
     "-N" "-Q" "-R" "-T" "-W" "-Z"
 
 log_assert "Execute zdb using invalid parameters."
index 4bb5c3c4a029360cf2c3326fde26de2fd51606d4..4c2fc15ec025f88144063c18e043a57cace9153e 100755 (executable)
@@ -58,7 +58,7 @@ function cleanup
 function test_imported_pool
 {
        typeset -a args=("-A" "-b" "-C" "-c" "-d" "-D" "-G" "-h" "-i" "-L" \
-            "-M" "-P" "-s" "-v" "-Y")
+            "-M" "-P" "-s" "-v" "-Y" "-y")
         for i in ${args[@]}; do
                log_must eval "zdb $i $TESTPOOL >/dev/null"
        done
@@ -68,7 +68,7 @@ function test_exported_pool
 {
        log_must zpool export $TESTPOOL
        typeset -a args=("-A" "-b" "-C" "-c" "-d" "-D" "-F" "-G" "-h" "-i" "-L" "-M" \
-            "-P" "-s" "-v" "-X" "-Y")
+            "-P" "-s" "-v" "-X" "-Y" "-y")
         for i in ${args[@]}; do
                log_must eval "zdb -e $i $TESTPOOL >/dev/null"
        done