#include <sys/dsl_crypt.h>
#include <sys/dsl_scan.h>
#include <zfs_comutil.h>
-#include <libzfs.h>
+
+#include <libnvpair.h>
+#include <libzutil.h>
#include "zdb.h"
extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
extern int zfs_vdev_async_read_max_active;
extern boolean_t spa_load_verify_dryrun;
+extern int zfs_reconstruct_indirect_combinations_max;
static const char cmdname[] = "zdb";
uint8_t dump_opt[256];
uint64_t *zopt_object = NULL;
static unsigned zopt_objects = 0;
-libzfs_handle_t *g_zfs;
uint64_t max_inflight = 1000;
static int leaked_objects = 0;
static range_tree_t *mos_refd_objs;
"dump all read blocks into specified directory\n");
(void) fprintf(stderr, " -X attempt extreme rewind (does not "
"work with dataset)\n");
+ (void) fprintf(stderr, " -Y attempt all reconstruction "
+ "combinations for split blocks\n");
(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
"to make only that option verbose\n");
(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
{
if (dump_opt['G']) {
(void) printf("\n");
+ (void) fflush(stdout);
zfs_dbgmsg_print("zdb");
}
}
return;
(void) printf("space map object %llu:\n",
- (longlong_t)sm->sm_phys->smp_object);
- (void) printf(" smp_objsize = 0x%llx\n",
- (longlong_t)sm->sm_phys->smp_objsize);
+ (longlong_t)sm->sm_object);
+ (void) printf(" smp_length = 0x%llx\n",
+ (longlong_t)sm->sm_phys->smp_length);
(void) printf(" smp_alloc = 0x%llx\n",
(longlong_t)sm->sm_phys->smp_alloc);
+ if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
+ return;
+
/*
* Print out the freelist entries in both encoded and decoded form.
*/
uint8_t mapshift = sm->sm_shift;
int64_t alloc = 0;
- uint64_t word;
+ uint64_t word, entry_id = 0;
for (uint64_t offset = 0; offset < space_map_length(sm);
offset += sizeof (word)) {
sizeof (word), &word, DMU_READ_PREFETCH));
if (sm_entry_is_debug(word)) {
- (void) printf("\t [%6llu] %s: txg %llu, pass %llu\n",
- (u_longlong_t)(offset / sizeof (word)),
+ (void) printf("\t [%6llu] %s: txg %llu pass %llu\n",
+ (u_longlong_t)entry_id,
ddata[SM_DEBUG_ACTION_DECODE(word)],
(u_longlong_t)SM_DEBUG_TXG_DECODE(word),
(u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
+ entry_id++;
continue;
}
(void) printf("\t [%6llu] %c range:"
" %010llx-%010llx size: %06llx vdev: %06llu words: %u\n",
- (u_longlong_t)(offset / sizeof (word)),
+ (u_longlong_t)entry_id,
entry_type, (u_longlong_t)entry_off,
(u_longlong_t)(entry_off + entry_run),
(u_longlong_t)entry_run,
alloc += entry_run;
else
alloc -= entry_run;
+ entry_id++;
}
if ((uint64_t)alloc != space_map_allocated(sm)) {
(void) printf("space_map_object alloc (%lld) INCONSISTENT "
if (dump_opt['m'] > 2 && !dump_opt['L']) {
mutex_enter(&msp->ms_lock);
- metaslab_load_wait(msp);
- if (!msp->ms_loaded) {
- VERIFY0(metaslab_load(msp));
- range_tree_stat_verify(msp->ms_allocatable);
- }
+ VERIFY0(metaslab_load(msp));
+ range_tree_stat_verify(msp->ms_allocatable);
dump_metaslab_stats(msp);
metaslab_unload(msp);
mutex_exit(&msp->ms_lock);
SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
}
- if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
- ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
-
- dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
- }
+ ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
+ dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
}
static void
dmu_objset_name(os, osname);
(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
- "%s, %llu objects%s\n",
+ "%s, %llu objects%s%s\n",
osname, type, (u_longlong_t)dmu_objset_id(os),
(u_longlong_t)dds.dds_creation_txg,
- numbuf, (u_longlong_t)usedobjs, blkbuf);
+ numbuf, (u_longlong_t)usedobjs, blkbuf,
+ (dds.dds_inconsistent) ? " (inconsistent)" : "");
if (zopt_objects != 0) {
for (i = 0; i < zopt_objects; i++)
(void) printf("\tPercent empty: %10lf\n",
(double)(max_slot_used - total_slots_used)*100 /
(double)max_slot_used);
-
- ASSERT3U(object_count, ==, usedobjs);
-
(void) printf("\n");
if (error != ESRCH) {
(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
abort();
}
+
+ ASSERT3U(object_count, ==, usedobjs);
+
if (leaked_objects != 0) {
(void) printf("%d potentially leaked objects detected\n",
leaked_objects);
static void
zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
{
+ if (dump_opt['L'])
+ return;
+
if (spa->spa_vdev_removal == NULL)
return;
space_map_t *prev_obsolete_sm = NULL;
VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
- space_map_update(prev_obsolete_sm);
vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
prev_obsolete_sm);
space_map_close(prev_obsolete_sm);
int error;
int p;
+ ASSERT(!dump_opt['L']);
+
bzero(&ddb, sizeof (ddb));
while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
blkptr_t blk;
zcb->zcb_dedup_blocks++;
}
}
- if (!dump_opt['L']) {
- ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
- ddt_enter(ddt);
- VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
- ddt_exit(ddt);
- }
+ ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
+ ddt_enter(ddt);
+ VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
+ ddt_exit(ddt);
}
ASSERT(error == ENOENT);
VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
- space_map_update(checkpoint_sm);
VERIFY0(space_map_iterate(checkpoint_sm,
+ space_map_length(checkpoint_sm),
checkpoint_sm_exclude_entry_cb, &cseea));
space_map_close(checkpoint_sm);
static void
zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
{
+ ASSERT(!dump_opt['L']);
+
vdev_t *rvd = spa->spa_root_vdev;
for (uint64_t c = 0; c < rvd->vdev_children; c++) {
ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
static void
zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
{
+ ASSERT(!dump_opt['L']);
+
vdev_t *rvd = spa->spa_root_vdev;
for (uint64_t c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
{
zcb->zcb_spa = spa;
- if (!dump_opt['L']) {
- dsl_pool_t *dp = spa->spa_dsl_pool;
- vdev_t *rvd = spa->spa_root_vdev;
+ if (dump_opt['L'])
+ return;
- /*
- * We are going to be changing the meaning of the metaslab's
- * ms_allocatable. Ensure that the allocator doesn't try to
- * use the tree.
- */
- spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
- spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ vdev_t *rvd = spa->spa_root_vdev;
- zcb->zcb_vd_obsolete_counts =
- umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
- UMEM_NOFAIL);
+ /*
+ * We are going to be changing the meaning of the metaslab's
+ * ms_allocatable. Ensure that the allocator doesn't try to
+ * use the tree.
+ */
+ spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
+ spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
- /*
- * For leak detection, we overload the ms_allocatable trees
- * to contain allocated segments instead of free segments.
- * As a result, we can't use the normal metaslab_load/unload
- * interfaces.
- */
- zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
- load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
+ zcb->zcb_vd_obsolete_counts =
+ umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
+ UMEM_NOFAIL);
- /*
- * On load_concrete_ms_allocatable_trees() we loaded all the
- * allocated entries from the ms_sm to the ms_allocatable for
- * each metaslab. If the pool has a checkpoint or is in the
- * middle of discarding a checkpoint, some of these blocks
- * may have been freed but their ms_sm may not have been
- * updated because they are referenced by the checkpoint. In
- * order to avoid false-positives during leak-detection, we
- * go through the vdev's checkpoint space map and exclude all
- * its entries from their relevant ms_allocatable.
- *
- * We also aggregate the space held by the checkpoint and add
- * it to zcb_checkpoint_size.
- *
- * Note that at this point we are also verifying that all the
- * entries on the checkpoint_sm are marked as allocated in
- * the ms_sm of their relevant metaslab.
- * [see comment in checkpoint_sm_exclude_entry_cb()]
- */
- zdb_leak_init_exclude_checkpoint(spa, zcb);
+ /*
+ * For leak detection, we overload the ms_allocatable trees
+ * to contain allocated segments instead of free segments.
+ * As a result, we can't use the normal metaslab_load/unload
+ * interfaces.
+ */
+ zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
+ load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
- /* for cleaner progress output */
- (void) fprintf(stderr, "\n");
+ /*
+ * On load_concrete_ms_allocatable_trees() we loaded all the
+ * allocated entries from the ms_sm to the ms_allocatable for
+ * each metaslab. If the pool has a checkpoint or is in the
+ * middle of discarding a checkpoint, some of these blocks
+ * may have been freed but their ms_sm may not have been
+ * updated because they are referenced by the checkpoint. In
+ * order to avoid false-positives during leak-detection, we
+ * go through the vdev's checkpoint space map and exclude all
+ * its entries from their relevant ms_allocatable.
+ *
+ * We also aggregate the space held by the checkpoint and add
+ * it to zcb_checkpoint_size.
+ *
+ * Note that at this point we are also verifying that all the
+ * entries on the checkpoint_sm are marked as allocated in
+ * the ms_sm of their relevant metaslab.
+ * [see comment in checkpoint_sm_exclude_entry_cb()]
+ */
+ zdb_leak_init_exclude_checkpoint(spa, zcb);
+ ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
- if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
- ASSERT(spa_feature_is_enabled(spa,
- SPA_FEATURE_DEVICE_REMOVAL));
- (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
- increment_indirect_mapping_cb, zcb, NULL);
- }
- } else {
- /*
- * If leak tracing is disabled, we still need to consider
- * any checkpointed space in our space verification.
- */
- zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa);
+ /* for cleaner progress output */
+ (void) fprintf(stderr, "\n");
+
+ if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_DEVICE_REMOVAL));
+ (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
+ increment_indirect_mapping_cb, zcb, NULL);
}
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
static boolean_t
zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
{
+ if (dump_opt['L'])
+ return (B_FALSE);
+
boolean_t leaks = B_FALSE;
- if (!dump_opt['L']) {
- vdev_t *rvd = spa->spa_root_vdev;
- for (unsigned c = 0; c < rvd->vdev_children; c++) {
- vdev_t *vd = rvd->vdev_child[c];
- ASSERTV(metaslab_group_t *mg = vd->vdev_mg);
-
- if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
- leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
- }
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (unsigned c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+ ASSERTV(metaslab_group_t *mg = vd->vdev_mg);
- for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
- ASSERT3P(mg, ==, msp->ms_group);
+ if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
+ leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
+ }
- /*
- * ms_allocatable has been overloaded
- * to contain allocated segments. Now that
- * we finished traversing all blocks, any
- * block that remains in the ms_allocatable
- * represents an allocated block that we
- * did not claim during the traversal.
- * Claimed blocks would have been removed
- * from the ms_allocatable. For indirect
- * vdevs, space remaining in the tree
- * represents parts of the mapping that are
- * not referenced, which is not a bug.
- */
- if (vd->vdev_ops == &vdev_indirect_ops) {
- range_tree_vacate(msp->ms_allocatable,
- NULL, NULL);
- } else {
- range_tree_vacate(msp->ms_allocatable,
- zdb_leak, vd);
- }
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+ ASSERT3P(mg, ==, msp->ms_group);
- if (msp->ms_loaded)
- msp->ms_loaded = B_FALSE;
+ /*
+ * ms_allocatable has been overloaded
+ * to contain allocated segments. Now that
+ * we finished traversing all blocks, any
+ * block that remains in the ms_allocatable
+ * represents an allocated block that we
+ * did not claim during the traversal.
+ * Claimed blocks would have been removed
+ * from the ms_allocatable. For indirect
+ * vdevs, space remaining in the tree
+ * represents parts of the mapping that are
+ * not referenced, which is not a bug.
+ */
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ range_tree_vacate(msp->ms_allocatable,
+ NULL, NULL);
+ } else {
+ range_tree_vacate(msp->ms_allocatable,
+ zdb_leak, vd);
}
- }
- umem_free(zcb->zcb_vd_obsolete_counts,
- rvd->vdev_children * sizeof (uint32_t *));
- zcb->zcb_vd_obsolete_counts = NULL;
+ if (msp->ms_loaded) {
+ msp->ms_loaded = B_FALSE;
+ }
+ }
}
+
+ umem_free(zcb->zcb_vd_obsolete_counts,
+ rvd->vdev_children * sizeof (uint32_t *));
+ zcb->zcb_vd_obsolete_counts = NULL;
+
return (leaks);
}
!dump_opt['L'] ? "nothing leaked " : "");
/*
- * Load all space maps as SM_ALLOC maps, then traverse the pool
- * claiming each block we discover. If the pool is perfectly
- * consistent, the space maps will be empty when we're done.
- * Anything left over is a leak; any block we can't claim (because
- * it's not part of any space map) is a double allocation,
- * reference to a freed block, or an unclaimed log block.
+ * When leak detection is enabled we load all space maps as SM_ALLOC
+ * maps, then traverse the pool claiming each block we discover. If
+ * the pool is perfectly consistent, the segment trees will be empty
+ * when we're done. Anything left over is a leak; any block we can't
+ * claim (because it's not part of any space map) is a double
+ * allocation, reference to a freed block, or an unclaimed log block.
+ *
+ * When leak detection is disabled (-L option) we still traverse the
+ * pool claiming each block we discover, but we skip opening any space
+ * maps.
*/
bzero(&zcb, sizeof (zdb_cb_t));
zdb_leak_init(spa, &zcb);
total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
- if (total_found == total_alloc) {
- if (!dump_opt['L'])
- (void) printf("\n\tNo leaks (block sum matches space"
- " maps exactly)\n");
- } else {
+ if (total_found == total_alloc && !dump_opt['L']) {
+ (void) printf("\n\tNo leaks (block sum matches space"
+ " maps exactly)\n");
+ } else if (!dump_opt['L']) {
(void) printf("block traversal size %llu != alloc %llu "
"(%s %lld)\n",
(u_longlong_t)total_found,
spa->spa_meta_objset,
scip->scip_prev_obsolete_sm_object,
0, vd->vdev_asize, 0));
- space_map_update(prev_obsolete_sm);
dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
(void) printf("\n");
space_map_close(prev_obsolete_sm);
* their respective ms_allocateable trees should not contain them.
*/
mutex_enter(&ms->ms_lock);
- range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
+ range_tree_verify_not_present(ms->ms_allocatable,
+ sme->sme_offset, sme->sme_run);
mutex_exit(&ms->ms_lock);
return (0);
VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
checkpoint_sm_obj, 0, current_vd->vdev_asize,
current_vd->vdev_ashift));
- space_map_update(checkpoint_sm);
verify_checkpoint_sm_entry_cb_arg_t vcsec;
vcsec.vcsec_vd = ckpoint_vd;
vcsec.vcsec_num_entries =
space_map_length(checkpoint_sm) / sizeof (uint64_t);
VERIFY0(space_map_iterate(checkpoint_sm,
+ space_map_length(checkpoint_sm),
verify_checkpoint_sm_entry_cb, &vcsec));
if (dump_opt['m'] > 3)
dump_spacemap(current->spa_meta_objset, checkpoint_sm);
* are part of the checkpoint were freed by mistake.
*/
range_tree_walk(ckpoint_msp->ms_allocatable,
- (range_tree_func_t *)range_tree_verify,
+ (range_tree_func_t *)range_tree_verify_not_present,
current_msp->ms_allocatable);
}
}
static void
verify_checkpoint_blocks(spa_t *spa)
{
+ ASSERT(!dump_opt['L']);
+
spa_t *checkpoint_spa;
char *checkpoint_pool;
nvlist_t *config = NULL;
VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
- space_map_update(checkpoint_sm);
dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
space_map_close(checkpoint_sm);
}
spa_config_path = spa_config_path_env;
while ((c = getopt(argc, argv,
- "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
+ "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XY")) != -1) {
switch (c) {
case 'b':
case 'c':
case 'X':
dump_opt[c]++;
break;
+ case 'Y':
+ zfs_reconstruct_indirect_combinations_max = INT_MAX;
+ zfs_deadman_enabled = 0;
+ break;
/* NB: Sort single match options below. */
case 'I':
max_inflight = strtoull(optarg, NULL, 0);
spa_load_verify_dryrun = B_TRUE;
kernel_init(FREAD);
- if ((g_zfs = libzfs_init()) == NULL) {
- (void) fprintf(stderr, "%s", libzfs_error_init(errno));
- return (1);
- }
if (dump_all)
verbose = MAX(verbose, 1);
error = 0;
target = argv[0];
- char *checkpoint_pool = NULL;
- char *checkpoint_target = NULL;
- if (dump_opt['k']) {
- checkpoint_pool = import_checkpointed_state(target, cfg,
- &checkpoint_target);
-
- if (checkpoint_target != NULL)
- target = checkpoint_target;
-
- }
-
if (strpbrk(target, "/@") != NULL) {
size_t targetlen;
args.path = searchdirs;
args.can_be_active = B_TRUE;
- error = zpool_tryimport(g_zfs, target_pool, &cfg, &args);
+ error = zpool_find_config(NULL, target_pool, &cfg, &args,
+ &libzpool_config_ops);
if (error == 0) {
}
}
+ /*
+ * import_checkpointed_state makes the assumption that the
+ * target pool that we pass it is already part of the spa
+ * namespace. Because of that we need to make sure to call
+ * it always after the -e option has been processed, which
+ * imports the pool to the namespace if it's not in the
+ * cachefile.
+ */
+ char *checkpoint_pool = NULL;
+ char *checkpoint_target = NULL;
+ if (dump_opt['k']) {
+ checkpoint_pool = import_checkpointed_state(target, cfg,
+ &checkpoint_target);
+
+ if (checkpoint_target != NULL)
+ target = checkpoint_target;
+ }
+
if (target_pool != target)
free(target_pool);
dump_debug_buffer();
- libzfs_fini(g_zfs);
kernel_fini();
return (error);