git.proxmox.com Git - mirror_zfs-debian.git/blobdiff - cmd/zdb/zdb.c
Imported Upstream version 0.6.5.3
[mirror_zfs-debian.git] / cmd / zdb / zdb.c
index 1d76f2a7da3e7dc4cc029facde273d5a12526c63..56f56700fc10058f642df2a818ecb8731e7dd0ed 100644 (file)
@@ -22,6 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2015, Intel Corporation.
  */
 
 #include <stdio.h>
 #ifndef lint
 extern int zfs_recover;
 extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
+extern int zfs_vdev_async_read_max_active;
 #else
 int zfs_recover;
 uint64_t zfs_arc_max, zfs_arc_meta_limit;
+int zfs_vdev_async_read_max_active;
 #endif
 
 const char cmdname[] = "zdb";
@@ -92,6 +95,8 @@ int zopt_objects = 0;
 libzfs_handle_t *g_zfs;
 uint64_t max_inflight = 1000;
 
+static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *);
+
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
  * debugging facilities.
@@ -413,6 +418,81 @@ dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
        zap_cursor_fini(&zc);
 }
 
+/*
+ * Object viewer for a bpobj header.  "data" points at the bpobj_phys_t and
+ * "size" is treated as the number of valid bytes at "data".
+ *
+ * num_blkptrs and bytes are always printed.  comp/uncomp are printed only
+ * when size >= BPOBJ_SIZE_V1, and subobjs/num_subobjs only when size covers
+ * the whole bpobj_phys_t.  When dump_opt['d'] is 5 or more, each block
+ * pointer stored in the object is also read with dmu_read() and printed via
+ * snprintf_blkptr_compact(); the first read error ends the listing.
+ *
+ * A NULL "data" is silently ignored.
+ */
+static void
+dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
+{
+       bpobj_phys_t *bpop = data;
+       uint64_t i;
+       char bytes[32], comp[32], uncomp[32];
+
+       if (bpop == NULL)
+               return;
+
+       zdb_nicenum(bpop->bpo_bytes, bytes);
+
+       (void) printf("\t\tnum_blkptrs = %llu\n",
+           (u_longlong_t)bpop->bpo_num_blkptrs);
+       (void) printf("\t\tbytes = %s\n", bytes);
+       if (size >= BPOBJ_SIZE_V1) {
+               /*
+                * Only touch bpo_comp/bpo_uncomp once we know the header
+                * is large enough to contain them.
+                */
+               zdb_nicenum(bpop->bpo_comp, comp);
+               zdb_nicenum(bpop->bpo_uncomp, uncomp);
+               (void) printf("\t\tcomp = %s\n", comp);
+               (void) printf("\t\tuncomp = %s\n", uncomp);
+       }
+       if (size >= sizeof (*bpop)) {
+               (void) printf("\t\tsubobjs = %llu\n",
+                   (u_longlong_t)bpop->bpo_subobjs);
+               (void) printf("\t\tnum_subobjs = %llu\n",
+                   (u_longlong_t)bpop->bpo_num_subobjs);
+       }
+
+       /* The per-blkptr listing is only produced at -ddddd and above. */
+       if (dump_opt['d'] < 5)
+               return;
+
+       for (i = 0; i < bpop->bpo_num_blkptrs; i++) {
+               char blkbuf[BP_SPRINTF_LEN];
+               blkptr_t bp;
+
+               /* Block pointers are read back to back from offset 0. */
+               int err = dmu_read(os, object,
+                   i * sizeof (bp), sizeof (bp), &bp, 0);
+               if (err != 0) {
+                       (void) printf("got error %u from dmu_read\n", err);
+                       break;
+               }
+               snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp);
+               (void) printf("\t%s\n", blkbuf);
+       }
+}
+
+/*
+ * Object viewer for a bpobj subobj array.  Reads the whole object
+ * (doi_max_offset bytes) as an array of uint64_t and prints every entry
+ * from index 0 up to and including the last nonzero one, one per line.
+ * Nothing is printed when every entry is zero.  "data" and "size" are
+ * unused.
+ */
+/* ARGSUSED */
+static void
+dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
+{
+       dmu_object_info_t doi;
+       uint64_t i, nentries;
+
+       VERIFY0(dmu_object_info(os, object, &doi));
+       uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);
+
+       int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
+       if (err != 0) {
+               (void) printf("got error %u from dmu_read\n", err);
+               kmem_free(subobjs, doi.doi_max_offset);
+               return;
+       }
+
+       /*
+        * Track "one past the last nonzero entry" as an unsigned count.
+        * A signed -1 sentinel compared against the uint64_t index would
+        * be converted to UINT64_MAX, so an all-zero array would make the
+        * print loop run off the end of the buffer.
+        */
+       uint64_t nprint = 0;
+       nentries = doi.doi_max_offset / sizeof (uint64_t);
+       for (i = 0; i < nentries; i++) {
+               if (subobjs[i] != 0)
+                       nprint = i + 1;
+       }
+
+       for (i = 0; i < nprint; i++) {
+               (void) printf("\t%llu\n", (u_longlong_t)subobjs[i]);
+       }
+       kmem_free(subobjs, doi.doi_max_offset);
+}
+
 /*ARGSUSED*/
 static void
 dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
@@ -990,7 +1070,7 @@ static void
 dump_history(spa_t *spa)
 {
        nvlist_t **events = NULL;
-       char buf[SPA_MAXBLOCKSIZE];
+       char *buf;
        uint64_t resid, len, off = 0;
        uint_t num = 0;
        int error;
@@ -1000,12 +1080,19 @@ dump_history(spa_t *spa)
        char internalstr[MAXPATHLEN];
        int i;
 
+       if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) {
+               (void) fprintf(stderr, "%s: unable to allocate I/O buffer\n",
+                   __func__);
+               return;
+       }
+
        do {
-               len = sizeof (buf);
+               len = SPA_OLD_MAXBLOCKSIZE;
 
                if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
                        (void) fprintf(stderr, "Unable to read history: "
                            "error %d\n", error);
+                       free(buf);
                        return;
                }
 
@@ -1056,6 +1143,7 @@ next:
                        dump_nvlist(events[i], 2);
                }
        }
+       free(buf);
 }
 
 /*ARGSUSED*/
@@ -1171,7 +1259,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
        print_indirect(bp, zb, dnp);
 
        if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
-               uint32_t flags = ARC_WAIT;
+               arc_flags_t flags = ARC_FLAG_WAIT;
                int i;
                blkptr_t *cbp;
                int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
@@ -1384,7 +1472,7 @@ dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 }
 
 static void
-dump_bpobj(bpobj_t *bpo, char *name, int indent)
+dump_full_bpobj(bpobj_t *bpo, char *name, int indent)
 {
        char bytes[32];
        char comp[32];
@@ -1399,11 +1487,12 @@ dump_bpobj(bpobj_t *bpo, char *name, int indent)
                zdb_nicenum(bpo->bpo_phys->bpo_comp, comp);
                zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp);
                (void) printf("    %*s: object %llu, %llu local blkptrs, "
-                   "%llu subobjs, %s (%s/%s comp)\n",
+                   "%llu subobjs in object, %llu, %s (%s/%s comp)\n",
                    indent * 8, name,
                    (u_longlong_t)bpo->bpo_object,
                    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
                    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
+                   (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
                    bytes, comp, uncomp);
 
                for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
@@ -1420,7 +1509,7 @@ dump_bpobj(bpobj_t *bpo, char *name, int indent)
                                    error, (u_longlong_t)subobj);
                                continue;
                        }
-                       dump_bpobj(&subbpo, "subobj", indent + 1);
+                       dump_full_bpobj(&subbpo, "subobj", indent + 1);
                }
        } else {
                (void) printf("    %*s: object %llu, %llu blkptrs, %s\n",
@@ -1453,7 +1542,7 @@ dump_deadlist(dsl_deadlist_t *dl)
                return;
 
        if (dl->dl_oldfmt) {
-               dump_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
+               dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
                return;
        }
 
@@ -1480,7 +1569,7 @@ dump_deadlist(dsl_deadlist_t *dl)
                            (longlong_t)dle->dle_mintxg,
                            (longlong_t)dle->dle_bpobj.bpo_object);
 
-                       dump_bpobj(&dle->dle_bpobj, buf, 0);
+                       dump_full_bpobj(&dle->dle_bpobj, buf, 0);
                } else {
                        (void) printf("mintxg %llu -> obj %llu\n",
                            (longlong_t)dle->dle_mintxg,
@@ -1728,8 +1817,8 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
        dump_uint64,            /* object array                 */
        dump_none,              /* packed nvlist                */
        dump_packed_nvlist,     /* packed nvlist size           */
-       dump_none,              /* bplist                       */
-       dump_none,              /* bplist header                */
+       dump_none,              /* bpobj                        */
+       dump_bpobj,             /* bpobj header                 */
        dump_none,              /* SPA space map header         */
        dump_none,              /* SPA space map                */
        dump_none,              /* ZIL intent log               */
@@ -1776,7 +1865,7 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
        dump_zap,               /* deadlist                     */
        dump_none,              /* deadlist hdr                 */
        dump_zap,               /* dsl clones                   */
-       dump_none,              /* bpobj subobjs                */
+       dump_bpobj_subobjs,     /* bpobj subobjs                */
        dump_unknown,           /* Unknown type, must be last   */
 };
 
@@ -1928,8 +2017,8 @@ dump_dir(objset_t *os)
        if (dds.dds_type == DMU_OST_META) {
                dds.dds_creation_txg = TXG_INITIAL;
                usedobjs = BP_GET_FILL(os->os_rootbp);
-               refdbytes = os->os_spa->spa_dsl_pool->
-                   dp_mos_dir->dd_phys->dd_used_bytes;
+               refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
+                   dd_used_bytes;
        } else {
                dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
        }
@@ -2185,6 +2274,8 @@ dump_label(const char *dev)
        (void) close(fd);
 }
 
+static uint64_t num_large_blocks;
+
 /*ARGSUSED*/
 static int
 dump_one_dir(const char *dsname, void *arg)
@@ -2197,6 +2288,8 @@ dump_one_dir(const char *dsname, void *arg)
                (void) printf("Could not open %s, error %d\n", dsname, error);
                return (0);
        }
+       if (dmu_objset_ds(os)->ds_large_blocks)
+               num_large_blocks++;
        dump_dir(os);
        dmu_objset_disown(os, FTAG);
        fuid_table_destroy();
@@ -2207,12 +2300,14 @@ dump_one_dir(const char *dsname, void *arg)
 /*
  * Block statistics.
  */
-#define        PSIZE_HISTO_SIZE (SPA_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1)
+#define        PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
 typedef struct zdb_blkstats {
        uint64_t zb_asize;
        uint64_t zb_lsize;
        uint64_t zb_psize;
        uint64_t zb_count;
+       uint64_t zb_gangs;
+       uint64_t zb_ditto_samevdev;
        uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
 } zdb_blkstats_t;
 
@@ -2264,13 +2359,43 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
        for (i = 0; i < 4; i++) {
                int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
                int t = (i & 1) ? type : ZDB_OT_TOTAL;
+               int equal;
                zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
 
                zb->zb_asize += BP_GET_ASIZE(bp);
                zb->zb_lsize += BP_GET_LSIZE(bp);
                zb->zb_psize += BP_GET_PSIZE(bp);
                zb->zb_count++;
-               zb->zb_psize_histogram[BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT]++;
+
+               /*
+                * The histogram is only big enough to record blocks up to
+                * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
+                * "other", bucket.
+                */
+               int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
+               idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
+               zb->zb_psize_histogram[idx]++;
+
+               zb->zb_gangs += BP_COUNT_GANG(bp);
+
+               switch (BP_GET_NDVAS(bp)) {
+               case 2:
+                       if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+                           DVA_GET_VDEV(&bp->blk_dva[1]))
+                               zb->zb_ditto_samevdev++;
+                       break;
+               case 3:
+                       equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+                           DVA_GET_VDEV(&bp->blk_dva[1])) +
+                           (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+                           DVA_GET_VDEV(&bp->blk_dva[2])) +
+                           (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+                           DVA_GET_VDEV(&bp->blk_dva[2]));
+                       if (equal != 0)
+                               zb->zb_ditto_samevdev++;
+                       break;
+               }
+
        }
 
        if (BP_IS_EMBEDDED(bp)) {
@@ -2399,8 +2524,14 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 
        zcb->zcb_readfails = 0;
 
-       if (dump_opt['b'] < 5 &&
-           gethrtime() > zcb->zcb_lastprint + NANOSEC) {
+       /* only call gethrtime() every 100 blocks */
+       static int iters;
+       if (++iters > 100)
+               iters = 0;
+       else
+               return (0);
+
+       if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
                uint64_t now = gethrtime();
                char buf[10];
                uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
@@ -2511,6 +2642,14 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
                                            (longlong_t)vd->vdev_ms_count);
 
                                        msp->ms_ops = &zdb_metaslab_ops;
+
+                                       /*
+                                        * We don't want to spend the CPU
+                                        * manipulating the size-ordered
+                                        * tree, so clear the range_tree
+                                        * ops.
+                                        */
+                                       msp->ms_tree->rt_ops = NULL;
                                        VERIFY0(space_map_load(msp->ms_sm,
                                            msp->ms_tree, SM_ALLOC));
                                        msp->ms_loaded = B_TRUE;
@@ -2685,6 +2824,8 @@ dump_block_stats(spa_t *spa)
        (void) printf("\n");
        (void) printf("\tbp count:      %10llu\n",
            (u_longlong_t)tzb->zb_count);
+       (void) printf("\tganged count:  %10llu\n",
+           (longlong_t)tzb->zb_gangs);
        (void) printf("\tbp logical:    %10llu      avg: %6llu\n",
            (u_longlong_t)tzb->zb_lsize,
            (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
@@ -2723,6 +2864,11 @@ dump_block_stats(spa_t *spa)
                }
        }
 
+       if (tzb->zb_ditto_samevdev != 0) {
+               (void) printf("\tDittoed blocks on same vdev: %llu\n",
+                   (longlong_t)tzb->zb_ditto_samevdev);
+       }
+
        if (dump_opt['b'] >= 2) {
                int l, t, level;
                (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
@@ -2730,7 +2876,7 @@ dump_block_stats(spa_t *spa)
 
                for (t = 0; t <= ZDB_OT_TOTAL; t++) {
                        char csize[32], lsize[32], psize[32], asize[32];
-                       char avg[32];
+                       char avg[32], gang[32];
                        char *typename;
 
                        if (t < DMU_OT_NUMTYPES)
@@ -2771,6 +2917,7 @@ dump_block_stats(spa_t *spa)
                                zdb_nicenum(zb->zb_psize, psize);
                                zdb_nicenum(zb->zb_asize, asize);
                                zdb_nicenum(zb->zb_asize / zb->zb_count, avg);
+                               zdb_nicenum(zb->zb_gangs, gang);
 
                                (void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
                                    "\t%5.2f\t%6.2f\t",
@@ -2784,6 +2931,11 @@ dump_block_stats(spa_t *spa)
                                        (void) printf("    L%d %s\n",
                                            level, typename);
 
+                               if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
+                                       (void) printf("\t number of ganged "
+                                           "blocks: %s\n", gang);
+                               }
+
                                if (dump_opt['b'] >= 4) {
                                        (void) printf("psize "
                                            "(in 512-byte sectors): "
@@ -2942,12 +3094,15 @@ dump_zpool(spa_t *spa)
                dump_metaslab_groups(spa);
 
        if (dump_opt['d'] || dump_opt['i']) {
+               uint64_t refcount;
+
                dump_dir(dp->dp_meta_objset);
                if (dump_opt['d'] >= 3) {
-                       dump_bpobj(&spa->spa_deferred_bpobj,
+                       dump_full_bpobj(&spa->spa_deferred_bpobj,
                            "Deferred frees", 0);
                        if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
-                               dump_bpobj(&spa->spa_dsl_pool->dp_free_bpobj,
+                               dump_full_bpobj(
+                                   &spa->spa_dsl_pool->dp_free_bpobj,
                                    "Pool snapshot frees", 0);
                        }
 
@@ -2961,8 +3116,24 @@ dump_zpool(spa_t *spa)
                }
                (void) dmu_objset_find(spa_name(spa), dump_one_dir,
                    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+
+               if (feature_get_refcount(spa,
+                   &spa_feature_table[SPA_FEATURE_LARGE_BLOCKS],
+                   &refcount) != ENOTSUP) {
+                       if (num_large_blocks != refcount) {
+                               (void) printf("large_blocks feature refcount "
+                                   "mismatch: expected %lld != actual %lld\n",
+                                   (longlong_t)num_large_blocks,
+                                   (longlong_t)refcount);
+                               rc = 2;
+                       } else {
+                               (void) printf("Verified large_blocks feature "
+                                   "refcount is correct (%llu)\n",
+                                   (longlong_t)refcount);
+                       }
+               }
        }
-       if (dump_opt['b'] || dump_opt['c'])
+       if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
                rc = dump_block_stats(spa);
 
        if (rc == 0)
@@ -3430,6 +3601,7 @@ main(int argc, char **argv)
        int rewind = ZPOOL_NEVER_REWIND;
        char *spa_config_path_env;
        const char *opts = "bcdhilmMI:suCDRSAFLXevp:t:U:P";
+       boolean_t target_is_spa = B_TRUE;
 
        (void) setrlimit(RLIMIT_NOFILE, &rl);
        (void) enable_extended_FILE_stdio(-1, -1);
@@ -3532,9 +3704,18 @@ main(int argc, char **argv)
        zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;
 #endif
 
+       /*
+        * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
+        * "zdb -b" uses traversal prefetch which uses async reads.
+        * For good performance, let several of them be active at once.
+        */
+       zfs_vdev_async_read_max_active = 10;
+
        kernel_init(FREAD);
-       if ((g_zfs = libzfs_init()) == NULL)
+       if ((g_zfs = libzfs_init()) == NULL) {
+               (void) fprintf(stderr, "%s", libzfs_error_init(errno));
                return (1);
+       }
 
        if (dump_all)
                verbose = MAX(verbose, 1);
@@ -3598,8 +3779,17 @@ main(int argc, char **argv)
                }
        }
 
+       if (strpbrk(target, "/@") != NULL) {
+               size_t targetlen;
+
+               target_is_spa = B_FALSE;
+               targetlen = strlen(target);
+               if (targetlen && target[targetlen - 1] == '/')
+                       target[targetlen - 1] = '\0';
+       }
+
        if (error == 0) {
-               if (strpbrk(target, "/@") == NULL || dump_opt['R']) {
+               if (target_is_spa || dump_opt['R']) {
                        error = spa_open_rewind(target, &spa, FTAG, policy,
                            NULL);
                        if (error) {