/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2015, Intel Corporation.
*/
#include <stdio.h>
#ifndef lint
extern int zfs_recover;
extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
+extern int zfs_vdev_async_read_max_active;
#else
int zfs_recover;
uint64_t zfs_arc_max, zfs_arc_meta_limit;
+int zfs_vdev_async_read_max_active;
#endif
const char cmdname[] = "zdb";
libzfs_handle_t *g_zfs;
uint64_t max_inflight = 1000;
+static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *);
+
/*
* These libumem hooks provide a reasonable set of defaults for the allocator's
* debugging facilities.
zap_cursor_fini(&zc);
}
+/*
+ * Object-viewer callback for a bpobj header object: "data" is the raw
+ * bpobj_phys_t bonus buffer and "size" its length.  Prints the header
+ * fields and, at verbosity >= 5, every block pointer the object holds.
+ */
+static void
+dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	bpobj_phys_t *bpop = data;
+	uint64_t i;
+	char bytes[32], comp[32], uncomp[32];
+
+	/* No bonus buffer means there is nothing to print. */
+	if (bpop == NULL)
+		return;
+
+	zdb_nicenum(bpop->bpo_bytes, bytes);
+	zdb_nicenum(bpop->bpo_comp, comp);
+	zdb_nicenum(bpop->bpo_uncomp, uncomp);
+
+	(void) printf("\t\tnum_blkptrs = %llu\n",
+	    (u_longlong_t)bpop->bpo_num_blkptrs);
+	(void) printf("\t\tbytes = %s\n", bytes);
+	/* comp/uncomp fields are only present in V1-or-later bpobjs. */
+	if (size >= BPOBJ_SIZE_V1) {
+		(void) printf("\t\tcomp = %s\n", comp);
+		(void) printf("\t\tuncomp = %s\n", uncomp);
+	}
+	/* subobj fields are only present in full-size (current) bpobjs. */
+	if (size >= sizeof (*bpop)) {
+		(void) printf("\t\tsubobjs = %llu\n",
+		    (u_longlong_t)bpop->bpo_subobjs);
+		(void) printf("\t\tnum_subobjs = %llu\n",
+		    (u_longlong_t)bpop->bpo_num_subobjs);
+	}
+
+	/* Individual block pointers are only listed at high verbosity. */
+	if (dump_opt['d'] < 5)
+		return;
+
+	for (i = 0; i < bpop->bpo_num_blkptrs; i++) {
+		char blkbuf[BP_SPRINTF_LEN];
+		blkptr_t bp;
+
+		/* Read each blkptr_t straight out of the object's data. */
+		int err = dmu_read(os, object,
+		    i * sizeof (bp), sizeof (bp), &bp, 0);
+		if (err != 0) {
+			(void) printf("got error %u from dmu_read\n", err);
+			break;
+		}
+		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp);
+		(void) printf("\t%s\n", blkbuf);
+	}
+}
+
+/* ARGSUSED */
+/*
+ * Object-viewer callback for a bpobj's subobj array: reads the whole
+ * array of subobject numbers in one dmu_read() and prints one entry
+ * per line, omitting the trailing run of zeroed (freed) slots.
+ */
+static void
+dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dmu_object_info_t doi;
+	uint64_t i;
+
+	VERIFY0(dmu_object_info(os, object, &doi));
+	/* doi_max_offset bounds the array; each entry is a uint64_t. */
+	uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);
+
+	int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
+	if (err != 0) {
+		(void) printf("got error %u from dmu_read\n", err);
+		kmem_free(subobjs, doi.doi_max_offset);
+		return;
+	}
+
+	/*
+	 * Find the last nonzero entry so we don't print a (possibly
+	 * large) tail of zeroed-out slots.  Stays -1 if every slot is 0.
+	 */
+	int64_t last_nonzero = -1;
+	for (i = 0; i < doi.doi_max_offset / 8; i++) {
+		if (subobjs[i] != 0)
+			last_nonzero = i;
+	}
+
+	/*
+	 * Compare in the signed domain: "i <= last_nonzero" with an
+	 * unsigned i would convert last_nonzero == -1 to UINT64_MAX and
+	 * read past the end of subobjs[] when the array is all zeros.
+	 */
+	for (i = 0; (int64_t)i <= last_nonzero; i++) {
+		(void) printf("\t%llu\n", (u_longlong_t)subobjs[i]);
+	}
+	kmem_free(subobjs, doi.doi_max_offset);
+}
+
/*ARGSUSED*/
static void
dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
dump_history(spa_t *spa)
{
nvlist_t **events = NULL;
- char buf[SPA_MAXBLOCKSIZE];
+ char *buf;
uint64_t resid, len, off = 0;
uint_t num = 0;
int error;
char internalstr[MAXPATHLEN];
int i;
+ if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) {
+ (void) fprintf(stderr, "%s: unable to allocate I/O buffer\n",
+ __func__);
+ return;
+ }
+
do {
- len = sizeof (buf);
+ len = SPA_OLD_MAXBLOCKSIZE;
if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
(void) fprintf(stderr, "Unable to read history: "
"error %d\n", error);
+ free(buf);
return;
}
dump_nvlist(events[i], 2);
}
}
+ free(buf);
}
/*ARGSUSED*/
print_indirect(bp, zb, dnp);
if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
- uint32_t flags = ARC_WAIT;
+ arc_flags_t flags = ARC_FLAG_WAIT;
int i;
blkptr_t *cbp;
int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
}
static void
-dump_bpobj(bpobj_t *bpo, char *name, int indent)
+dump_full_bpobj(bpobj_t *bpo, char *name, int indent)
{
char bytes[32];
char comp[32];
zdb_nicenum(bpo->bpo_phys->bpo_comp, comp);
zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp);
(void) printf(" %*s: object %llu, %llu local blkptrs, "
- "%llu subobjs, %s (%s/%s comp)\n",
+ "%llu subobjs in object, %llu, %s (%s/%s comp)\n",
indent * 8, name,
(u_longlong_t)bpo->bpo_object,
(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
(u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
+ (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
bytes, comp, uncomp);
for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
error, (u_longlong_t)subobj);
continue;
}
- dump_bpobj(&subbpo, "subobj", indent + 1);
+ dump_full_bpobj(&subbpo, "subobj", indent + 1);
}
} else {
(void) printf(" %*s: object %llu, %llu blkptrs, %s\n",
return;
if (dl->dl_oldfmt) {
- dump_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
+ dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
return;
}
(longlong_t)dle->dle_mintxg,
(longlong_t)dle->dle_bpobj.bpo_object);
- dump_bpobj(&dle->dle_bpobj, buf, 0);
+ dump_full_bpobj(&dle->dle_bpobj, buf, 0);
} else {
(void) printf("mintxg %llu -> obj %llu\n",
(longlong_t)dle->dle_mintxg,
dump_uint64, /* object array */
dump_none, /* packed nvlist */
dump_packed_nvlist, /* packed nvlist size */
- dump_none, /* bplist */
- dump_none, /* bplist header */
+ dump_none, /* bpobj */
+ dump_bpobj, /* bpobj header */
dump_none, /* SPA space map header */
dump_none, /* SPA space map */
dump_none, /* ZIL intent log */
dump_zap, /* deadlist */
dump_none, /* deadlist hdr */
dump_zap, /* dsl clones */
- dump_none, /* bpobj subobjs */
+ dump_bpobj_subobjs, /* bpobj subobjs */
dump_unknown, /* Unknown type, must be last */
};
if (dds.dds_type == DMU_OST_META) {
dds.dds_creation_txg = TXG_INITIAL;
usedobjs = BP_GET_FILL(os->os_rootbp);
- refdbytes = os->os_spa->spa_dsl_pool->
- dp_mos_dir->dd_phys->dd_used_bytes;
+ refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
+ dd_used_bytes;
} else {
dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
}
(void) close(fd);
}
+static uint64_t num_large_blocks;
+
/*ARGSUSED*/
static int
dump_one_dir(const char *dsname, void *arg)
(void) printf("Could not open %s, error %d\n", dsname, error);
return (0);
}
+ if (dmu_objset_ds(os)->ds_large_blocks)
+ num_large_blocks++;
dump_dir(os);
dmu_objset_disown(os, FTAG);
fuid_table_destroy();
/*
* Block statistics.
*/
-#define PSIZE_HISTO_SIZE (SPA_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1)
+#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
typedef struct zdb_blkstats {
uint64_t zb_asize;
uint64_t zb_lsize;
uint64_t zb_psize;
uint64_t zb_count;
+ uint64_t zb_gangs;
+ uint64_t zb_ditto_samevdev;
uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
} zdb_blkstats_t;
for (i = 0; i < 4; i++) {
int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
int t = (i & 1) ? type : ZDB_OT_TOTAL;
+ int equal;
zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
zb->zb_asize += BP_GET_ASIZE(bp);
zb->zb_lsize += BP_GET_LSIZE(bp);
zb->zb_psize += BP_GET_PSIZE(bp);
zb->zb_count++;
- zb->zb_psize_histogram[BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT]++;
+
+ /*
+ * The histogram is only big enough to record blocks up to
+ * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
+ * "other", bucket.
+ */
+ int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
+ idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
+ zb->zb_psize_histogram[idx]++;
+
+ zb->zb_gangs += BP_COUNT_GANG(bp);
+
+ switch (BP_GET_NDVAS(bp)) {
+ case 2:
+ if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1]))
+ zb->zb_ditto_samevdev++;
+ break;
+ case 3:
+ equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1])) +
+ (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2])) +
+ (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2]));
+ if (equal != 0)
+ zb->zb_ditto_samevdev++;
+ break;
+ }
+
}
if (BP_IS_EMBEDDED(bp)) {
zcb->zcb_readfails = 0;
- if (dump_opt['b'] < 5 &&
- gethrtime() > zcb->zcb_lastprint + NANOSEC) {
+ /* only call gethrtime() every 100 blocks */
+ static int iters;
+ if (++iters > 100)
+ iters = 0;
+ else
+ return (0);
+
+ if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
uint64_t now = gethrtime();
char buf[10];
uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
(longlong_t)vd->vdev_ms_count);
msp->ms_ops = &zdb_metaslab_ops;
+
+ /*
+ * We don't want to spend the CPU
+ * manipulating the size-ordered
+ * tree, so clear the range_tree
+ * ops.
+ */
+ msp->ms_tree->rt_ops = NULL;
VERIFY0(space_map_load(msp->ms_sm,
msp->ms_tree, SM_ALLOC));
msp->ms_loaded = B_TRUE;
(void) printf("\n");
(void) printf("\tbp count: %10llu\n",
(u_longlong_t)tzb->zb_count);
+ (void) printf("\tganged count: %10llu\n",
+ (longlong_t)tzb->zb_gangs);
(void) printf("\tbp logical: %10llu avg: %6llu\n",
(u_longlong_t)tzb->zb_lsize,
(u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
}
}
+ if (tzb->zb_ditto_samevdev != 0) {
+ (void) printf("\tDittoed blocks on same vdev: %llu\n",
+ (longlong_t)tzb->zb_ditto_samevdev);
+ }
+
if (dump_opt['b'] >= 2) {
int l, t, level;
(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
for (t = 0; t <= ZDB_OT_TOTAL; t++) {
char csize[32], lsize[32], psize[32], asize[32];
- char avg[32];
+ char avg[32], gang[32];
char *typename;
if (t < DMU_OT_NUMTYPES)
zdb_nicenum(zb->zb_psize, psize);
zdb_nicenum(zb->zb_asize, asize);
zdb_nicenum(zb->zb_asize / zb->zb_count, avg);
+ zdb_nicenum(zb->zb_gangs, gang);
(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
"\t%5.2f\t%6.2f\t",
(void) printf(" L%d %s\n",
level, typename);
+ if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
+ (void) printf("\t number of ganged "
+ "blocks: %s\n", gang);
+ }
+
if (dump_opt['b'] >= 4) {
(void) printf("psize "
"(in 512-byte sectors): "
dump_metaslab_groups(spa);
if (dump_opt['d'] || dump_opt['i']) {
+ uint64_t refcount;
+
dump_dir(dp->dp_meta_objset);
if (dump_opt['d'] >= 3) {
- dump_bpobj(&spa->spa_deferred_bpobj,
+ dump_full_bpobj(&spa->spa_deferred_bpobj,
"Deferred frees", 0);
if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
- dump_bpobj(&spa->spa_dsl_pool->dp_free_bpobj,
+ dump_full_bpobj(
+ &spa->spa_dsl_pool->dp_free_bpobj,
"Pool snapshot frees", 0);
}
}
(void) dmu_objset_find(spa_name(spa), dump_one_dir,
NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+
+ if (feature_get_refcount(spa,
+ &spa_feature_table[SPA_FEATURE_LARGE_BLOCKS],
+ &refcount) != ENOTSUP) {
+ if (num_large_blocks != refcount) {
+ (void) printf("large_blocks feature refcount "
+ "mismatch: expected %lld != actual %lld\n",
+ (longlong_t)num_large_blocks,
+ (longlong_t)refcount);
+ rc = 2;
+ } else {
+ (void) printf("Verified large_blocks feature "
+ "refcount is correct (%llu)\n",
+ (longlong_t)refcount);
+ }
+ }
}
- if (dump_opt['b'] || dump_opt['c'])
+ if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
rc = dump_block_stats(spa);
if (rc == 0)
int rewind = ZPOOL_NEVER_REWIND;
char *spa_config_path_env;
const char *opts = "bcdhilmMI:suCDRSAFLXevp:t:U:P";
+ boolean_t target_is_spa = B_TRUE;
(void) setrlimit(RLIMIT_NOFILE, &rl);
(void) enable_extended_FILE_stdio(-1, -1);
zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;
#endif
+ /*
+ * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
+ * "zdb -b" uses traversal prefetch which uses async reads.
+ * For good performance, let several of them be active at once.
+ */
+ zfs_vdev_async_read_max_active = 10;
+
kernel_init(FREAD);
- if ((g_zfs = libzfs_init()) == NULL)
+ if ((g_zfs = libzfs_init()) == NULL) {
+ (void) fprintf(stderr, "%s", libzfs_error_init(errno));
return (1);
+ }
if (dump_all)
verbose = MAX(verbose, 1);
}
}
+ if (strpbrk(target, "/@") != NULL) {
+ size_t targetlen;
+
+ target_is_spa = B_FALSE;
+ targetlen = strlen(target);
+ if (targetlen && target[targetlen - 1] == '/')
+ target[targetlen - 1] = '\0';
+ }
+
if (error == 0) {
- if (strpbrk(target, "/@") == NULL || dump_opt['R']) {
+ if (target_is_spa || dump_opt['R']) {
error = spa_open_rewind(target, &spa, FTAG, policy,
NULL);
if (error) {