]> git.proxmox.com Git - mirror_zfs.git/commitdiff
zdb: include cloned blocks in block statistics
authorRob N <rob.norris@klarasystems.com>
Tue, 1 Aug 2023 15:56:30 +0000 (01:56 +1000)
committerGitHub <noreply@github.com>
Tue, 1 Aug 2023 15:56:30 +0000 (08:56 -0700)
This gives `zdb -b` support for clone blocks.

Previously, it didn't know what clones were, so would count their space
allocation multiple times and then report leaked space (or, in debug,
would assert trying to claim blocks a second time).

This commit fixes those bugs, and reports the number of clones and the
space "used" (saved) by them.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Kay Pedersen <mail@mkwg.de>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-By: OpenDrives Inc.
Sponsored-By: Klara Inc.
Closes #15123

cmd/zdb/zdb.c
include/sys/brt.h
module/zfs/brt.c

index 9568d2bbfe387fb29d119baf1ef6ec2d7566b931..4b9921d47b81ba345ec7d972da48899bd5906424 100644 (file)
@@ -79,6 +79,7 @@
 #include <sys/dsl_crypt.h>
 #include <sys/dsl_scan.h>
 #include <sys/btree.h>
+#include <sys/brt.h>
 #include <zfs_comutil.h>
 #include <sys/zstd/zstd.h>
 
@@ -5342,12 +5343,20 @@ static const char *zdb_ot_extname[] = {
 #define        ZB_TOTAL        DN_MAX_LEVELS
 #define        SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1)
 
+/*
+ * One node of zdb's private in-memory BRT, keyed by DVA (see
+ * zdb_brt_entry_compare()).  Tracks how many more references to a cloned
+ * block zdb expects to encounter during pool traversal.
+ */
+typedef struct zdb_brt_entry {
+       dva_t           zbre_dva;       /* first DVA (blk_dva[0]) of block */
+       uint64_t        zbre_refcount;  /* remaining expected references */
+       avl_node_t      zbre_node;      /* linkage in the zcb_brt AVL tree */
+} zdb_brt_entry_t;
+
 typedef struct zdb_cb {
        zdb_blkstats_t  zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
        uint64_t        zcb_removing_size;
        uint64_t        zcb_checkpoint_size;
        uint64_t        zcb_dedup_asize;
        uint64_t        zcb_dedup_blocks;
+       uint64_t        zcb_clone_asize;
+       uint64_t        zcb_clone_blocks;
        uint64_t        zcb_psize_count[SPA_MAX_FOR_16M];
        uint64_t        zcb_lsize_count[SPA_MAX_FOR_16M];
        uint64_t        zcb_asize_count[SPA_MAX_FOR_16M];
@@ -5368,6 +5377,8 @@ typedef struct zdb_cb {
        int             zcb_haderrors;
        spa_t           *zcb_spa;
        uint32_t        **zcb_vd_obsolete_counts;
+       avl_tree_t      zcb_brt;
+       boolean_t       zcb_brt_is_active;
 } zdb_cb_t;
 
 /* test if two DVA offsets from same vdev are within the same metaslab */
@@ -5662,6 +5673,45 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
        zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
        zcb->zcb_asize_total += BP_GET_ASIZE(bp);
 
+       if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) {
+               /*
+                * Cloned blocks are special. We need to count them, so we can
+                * later uncount them when reporting leaked space, and we must
+                * only claim them once.
+                *
+                * To do this, we keep our own in-memory BRT. For each block
+                * we haven't seen before, we look it up in the real BRT and
+                * if it's there, we note it and its refcount, then proceed as
+                * normal. If we see the block again, we count it as a clone
+                * and then give it no further consideration.
+                */
+               zdb_brt_entry_t zbre_search, *zbre;
+               avl_index_t where;
+
+               zbre_search.zbre_dva = bp->blk_dva[0];
+               zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
+               if (zbre != NULL) {
+                       zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
+                       zcb->zcb_clone_blocks++;
+
+                       zbre->zbre_refcount--;
+                       if (zbre->zbre_refcount == 0) {
+                               avl_remove(&zcb->zcb_brt, zbre);
+                               umem_free(zbre, sizeof (zdb_brt_entry_t));
+                       }
+                       return;
+               }
+
+               uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp);
+               if (crefcnt > 0) {
+                       zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
+                           UMEM_NOFAIL);
+                       zbre->zbre_dva = bp->blk_dva[0];
+                       zbre->zbre_refcount = crefcnt;
+                       avl_insert(&zcb->zcb_brt, zbre, where);
+               }
+       }
+
        if (dump_opt['L'])
                return;
 
@@ -6664,6 +6714,20 @@ deleted_livelists_dump_mos(spa_t *spa)
        iterate_deleted_livelists(spa, dump_livelist_cb, NULL);
 }
 
+/*
+ * AVL comparator for zdb's in-memory BRT: orders zdb_brt_entry_t nodes by
+ * the vdev id of their DVA, tie-breaking on the offset, so entries are
+ * keyed by the block's on-disk location.
+ */
+static int
+zdb_brt_entry_compare(const void *zcn1, const void *zcn2)
+{
+       const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva;
+       const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva;
+       int cmp;
+
+       /* Compare vdev ids first; offsets matter only within one vdev. */
+       cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
+       if (cmp == 0)
+               cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2));
+
+       return (cmp);
+}
+
 static int
 dump_block_stats(spa_t *spa)
 {
@@ -6678,6 +6742,13 @@ dump_block_stats(spa_t *spa)
 
        zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);
 
+       if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
+               avl_create(&zcb->zcb_brt, zdb_brt_entry_compare,
+                   sizeof (zdb_brt_entry_t),
+                   offsetof(zdb_brt_entry_t, zbre_node));
+               zcb->zcb_brt_is_active = B_TRUE;
+       }
+
        (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
            (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
            (dump_opt['c'] == 1) ? "metadata " : "",
@@ -6779,7 +6850,8 @@ dump_block_stats(spa_t *spa)
            metaslab_class_get_alloc(spa_special_class(spa)) +
            metaslab_class_get_alloc(spa_dedup_class(spa)) +
            get_unflushed_alloc_space(spa);
-       total_found = tzb->zb_asize - zcb->zcb_dedup_asize +
+       total_found =
+           tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize +
            zcb->zcb_removing_size + zcb->zcb_checkpoint_size;
 
        if (total_found == total_alloc && !dump_opt['L']) {
@@ -6820,6 +6892,9 @@ dump_block_stats(spa_t *spa)
            "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize,
            (u_longlong_t)zcb->zcb_dedup_blocks,
            (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0);
+       (void) printf("\t%-16s %14llu    count: %6llu\n",
+           "bp cloned:", (u_longlong_t)zcb->zcb_clone_asize,
+           (u_longlong_t)zcb->zcb_clone_blocks);
        (void) printf("\t%-16s %14llu     used: %5.2f%%\n", "Normal class:",
            (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
 
index 0761159e3f5ff0c485851aa06206ed2215798776..f73df95058d9799d22b8ec2ea59a1cea30a56050 100644 (file)
@@ -36,6 +36,7 @@ extern "C" {
 #endif
 
 extern boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp);
+extern uint64_t brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp);
 
 extern uint64_t brt_get_dspace(spa_t *spa);
 extern uint64_t brt_get_used(spa_t *spa);
index 877b503a1bf212ad18a2056b55d9db16f8d57a21..e8218fb268888b89519ab81cb5cf75a060d230a8 100644 (file)
@@ -1544,6 +1544,37 @@ out:
        return (B_FALSE);
 }
 
+/*
+ * Return the current BRT refcount for the block pointed to by bp, or 0
+ * if the block has no BRT entry (i.e. it has not been cloned).
+ */
+uint64_t
+brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
+{
+       brt_t *brt = spa->spa_brt;
+       brt_vdev_t *brtvd;
+       brt_entry_t bre_search, *bre;
+       uint64_t vdevid, refcnt;
+       int error;
+
+       /* Derive the BRT lookup key and the owning vdev id from the bp. */
+       brt_entry_fill(bp, &bre_search, &vdevid);
+
+       /* Hold the BRT lock while consulting the per-vdev tree. */
+       brt_rlock(brt);
+
+       brtvd = brt_vdev(brt, vdevid);
+       ASSERT(brtvd != NULL);
+
+       /*
+        * Check the in-memory per-vdev tree first; if the entry is not
+        * there, fall back to a lookup.  NOTE(review): this relies on
+        * brt_entry_lookup() filling bre_search.bre_refcount when it
+        * returns 0 — confirm against its definition.
+        */
+       bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
+       if (bre == NULL) {
+               error = brt_entry_lookup(brt, brtvd, &bre_search);
+               ASSERT(error == 0 || error == ENOENT);
+               if (error == ENOENT)
+                       refcnt = 0;
+               else
+                       refcnt = bre_search.bre_refcount;
+       } else
+               refcnt = bre->bre_refcount;
+
+       brt_unlock(brt);
+       return (refcnt);
+}
+
 static void
 brt_prefetch(brt_t *brt, const blkptr_t *bp)
 {