]> git.proxmox.com Git - mirror_zfs.git/blob - cmd/zdb/zdb.c
zdb: add -B option to generate backup stream
[mirror_zfs.git] / cmd / zdb / zdb.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 * Copyright 2016 Nexenta Systems, Inc.
27 * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
28 * Copyright (c) 2015, 2017, Intel Corporation.
29 * Copyright (c) 2020 Datto Inc.
30 * Copyright (c) 2020, The FreeBSD Foundation [1]
31 *
32 * [1] Portions of this software were developed by Allan Jude
33 * under sponsorship from the FreeBSD Foundation.
34 * Copyright (c) 2021 Allan Jude
35 * Copyright (c) 2021 Toomas Soome <tsoome@me.com>
36 * Copyright (c) 2023, Klara Inc.
37 */
38
39 #include <stdio.h>
40 #include <unistd.h>
41 #include <stdlib.h>
42 #include <ctype.h>
43 #include <getopt.h>
44 #include <openssl/evp.h>
45 #include <sys/zfs_context.h>
46 #include <sys/spa.h>
47 #include <sys/spa_impl.h>
48 #include <sys/dmu.h>
49 #include <sys/zap.h>
50 #include <sys/fs/zfs.h>
51 #include <sys/zfs_znode.h>
52 #include <sys/zfs_sa.h>
53 #include <sys/sa.h>
54 #include <sys/sa_impl.h>
55 #include <sys/vdev.h>
56 #include <sys/vdev_impl.h>
57 #include <sys/metaslab_impl.h>
58 #include <sys/dmu_objset.h>
59 #include <sys/dsl_dir.h>
60 #include <sys/dsl_dataset.h>
61 #include <sys/dsl_pool.h>
62 #include <sys/dsl_bookmark.h>
63 #include <sys/dbuf.h>
64 #include <sys/zil.h>
65 #include <sys/zil_impl.h>
66 #include <sys/stat.h>
67 #include <sys/resource.h>
68 #include <sys/dmu_send.h>
69 #include <sys/dmu_traverse.h>
70 #include <sys/zio_checksum.h>
71 #include <sys/zio_compress.h>
72 #include <sys/zfs_fuid.h>
73 #include <sys/arc.h>
74 #include <sys/arc_impl.h>
75 #include <sys/ddt.h>
76 #include <sys/zfeature.h>
77 #include <sys/abd.h>
78 #include <sys/blkptr.h>
79 #include <sys/dsl_crypt.h>
80 #include <sys/dsl_scan.h>
81 #include <sys/btree.h>
82 #include <zfs_comutil.h>
83 #include <sys/zstd/zstd.h>
84
85 #include <libnvpair.h>
86 #include <libzutil.h>
87
88 #include "zdb.h"
89
/*
 * Map a compression/checksum table index to its display name, guarding
 * against out-of-range indices read from damaged pools.
 */
#define	ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ?	\
	zio_compress_table[(idx)].ci_name : "UNKNOWN")
#define	ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ?	\
	zio_checksum_table[(idx)].ci_name : "UNKNOWN")
/*
 * Collapse the "new" object types (DMU_OTN_*) onto a legacy DMU_OT_*
 * bucket; anything unrecognized maps to DMU_OT_NUMTYPES.
 */
#define	ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) :		\
	(idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ?	\
	DMU_OT_ZAP_OTHER : \
	(idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \
	DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES)

/* Some platforms require part of inode IDs to be remapped */
#ifdef __APPLE__
#define	ZDB_MAP_OBJECT_ID(obj) INO_XNUTOZFS(obj, 2)
#else
#define	ZDB_MAP_OBJECT_ID(obj) (obj)
#endif
106
107 static const char *
108 zdb_ot_name(dmu_object_type_t type)
109 {
110 if (type < DMU_OT_NUMTYPES)
111 return (dmu_ot[type].ot_name);
112 else if ((type & DMU_OT_NEWTYPE) &&
113 ((type & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS))
114 return (dmu_ot_byteswap[type & DMU_OT_BYTESWAP_MASK].ob_name);
115 else
116 return ("UNKNOWN");
117 }
118
/* Tunables and debug knobs resolved from the ZFS libraries at link time. */
extern int reference_tracking_enable;
extern int zfs_recover;
extern uint_t zfs_vdev_async_read_max_active;
extern boolean_t spa_load_verify_dryrun;
extern boolean_t spa_mode_readable_spacemaps;
extern uint_t zfs_reconstruct_indirect_combinations_max;
extern uint_t zfs_btree_verify_intensity;

static const char cmdname[] = "zdb";
/* Per-option repeat counts, indexed by the option character. */
uint8_t dump_opt[256];

/* Callback signature used to dump a single object of a given type. */
typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);

/* Metaslab arguments supplied on the command line. */
static uint64_t *zopt_metaslab = NULL;
static unsigned zopt_metaslab_args = 0;

/* An inclusive range of object IDs to dump, plus type-filter flags. */
typedef struct zopt_object_range {
	uint64_t zor_obj_start;
	uint64_t zor_obj_end;
	uint64_t zor_flags;
} zopt_object_range_t;

/* Object ranges supplied on the command line. */
static zopt_object_range_t *zopt_object_ranges = NULL;
static unsigned zopt_object_args = 0;

/* Maps a flag character to its ZOR_FLAG_ / ZDB_FLAG_ bit. */
static int flagbits[256];

/* Object-type filter bits for object ranges (see usage()). */
#define	ZOR_FLAG_PLAIN_FILE	0x0001
#define	ZOR_FLAG_DIRECTORY	0x0002
#define	ZOR_FLAG_SPACE_MAP	0x0004
#define	ZOR_FLAG_ZAP		0x0008
#define	ZOR_FLAG_ALL_TYPES	-1
#define	ZOR_SUPPORTED_FLAGS	(ZOR_FLAG_PLAIN_FILE | \
				ZOR_FLAG_DIRECTORY | \
				ZOR_FLAG_SPACE_MAP | \
				ZOR_FLAG_ZAP)

/* Flags for -R (read block) processing. */
#define	ZDB_FLAG_CHECKSUM	0x0001
#define	ZDB_FLAG_DECOMPRESS	0x0002
#define	ZDB_FLAG_BSWAP		0x0004
#define	ZDB_FLAG_GBH		0x0008
#define	ZDB_FLAG_INDIRECT	0x0010
#define	ZDB_FLAG_RAW		0x0020
#define	ZDB_FLAG_PRINT_BLKPTR	0x0040
#define	ZDB_FLAG_VERBOSE	0x0080

static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */
static int leaked_objects = 0;
static range_tree_t *mos_refd_objs;

static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
    boolean_t);
static void mos_obj_refd(uint64_t);
static void mos_obj_refd_multiple(uint64_t);
static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
    dmu_tx_t *tx);

/*
 * State for livelist verification: FREEs awaiting a matching ALLOC
 * within one sub-livelist, and ALLOCs that never matched a FREE,
 * accumulated across all sub-livelists.
 */
typedef struct sublivelist_verify {
	/* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */
	zfs_btree_t sv_pair;

	/* ALLOC's without a matching FREE, accumulates across sub-livelists */
	zfs_btree_t sv_leftover;
} sublivelist_verify_t;
183
184 static int
185 livelist_compare(const void *larg, const void *rarg)
186 {
187 const blkptr_t *l = larg;
188 const blkptr_t *r = rarg;
189
190 /* Sort them according to dva[0] */
191 uint64_t l_dva0_vdev, r_dva0_vdev;
192 l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
193 r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
194 if (l_dva0_vdev < r_dva0_vdev)
195 return (-1);
196 else if (l_dva0_vdev > r_dva0_vdev)
197 return (+1);
198
199 /* if vdevs are equal, sort by offsets. */
200 uint64_t l_dva0_offset;
201 uint64_t r_dva0_offset;
202 l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
203 r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
204 if (l_dva0_offset < r_dva0_offset) {
205 return (-1);
206 } else if (l_dva0_offset > r_dva0_offset) {
207 return (+1);
208 }
209
210 /*
211 * Since we're storing blkptrs without cancelling FREE/ALLOC pairs,
212 * it's possible the offsets are equal. In that case, sort by txg
213 */
214 if (l->blk_birth < r->blk_birth) {
215 return (-1);
216 } else if (l->blk_birth > r->blk_birth) {
217 return (+1);
218 }
219 return (0);
220 }
221
/* One leftover ALLOC recorded from a livelist, keyed by its DVA. */
typedef struct sublivelist_verify_block {
	dva_t svb_dva;

	/*
	 * We need this to check if the block marked as allocated
	 * in the livelist was freed (and potentially reallocated)
	 * in the metaslab spacemaps at a later TXG.
	 */
	uint64_t svb_allocated_txg;
} sublivelist_verify_block_t;

static void zdb_print_blkptr(const blkptr_t *bp, int flags);

/* A refcounted FREE entry used while pairing FREEs with ALLOCs. */
typedef struct sublivelist_verify_block_refcnt {
	/* block pointer entry in livelist being verified */
	blkptr_t svbr_blk;

	/*
	 * Refcount gets incremented to 1 when we encounter the first
	 * FREE entry for the svfbr block pointer and a node for it
	 * is created in our ZDB verification/tracking metadata.
	 *
	 * As we encounter more FREE entries we increment this counter
	 * and similarly decrement it whenever we find the respective
	 * ALLOC entries for this block.
	 *
	 * When the refcount gets to 0 it means that all the FREE and
	 * ALLOC entries of this block have paired up and we no longer
	 * need to track it in our verification logic (e.g. the node
	 * containing this struct in our verification data structure
	 * should be freed).
	 *
	 * [refer to sublivelist_verify_blkptr() for the actual code]
	 */
	uint32_t svbr_refcnt;
} sublivelist_verify_block_refcnt_t;
258
259 static int
260 sublivelist_block_refcnt_compare(const void *larg, const void *rarg)
261 {
262 const sublivelist_verify_block_refcnt_t *l = larg;
263 const sublivelist_verify_block_refcnt_t *r = rarg;
264 return (livelist_compare(&l->svbr_blk, &r->svbr_blk));
265 }
266
/*
 * bpobj_iterate_nofree() callback: pair up the FREE and ALLOC entries
 * of one sub-livelist.  FREEs are held (refcounted) in sv->sv_pair
 * until matched; an ALLOC with no pending FREE records its DVAs in
 * sv->sv_leftover for the later spacemap cross-check.  Always returns 0.
 */
static int
sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
    dmu_tx_t *tx)
{
	/* Read-only iteration: no transaction is ever supplied. */
	ASSERT3P(tx, ==, NULL);
	struct sublivelist_verify *sv = arg;
	sublivelist_verify_block_refcnt_t current = {
		.svbr_blk = *bp,

		/*
		 * Start with 1 in case this is the first free entry.
		 * This field is not used for our B-Tree comparisons
		 * anyway.
		 */
		.svbr_refcnt = 1,
	};

	zfs_btree_index_t where;
	sublivelist_verify_block_refcnt_t *pair =
	    zfs_btree_find(&sv->sv_pair, &current, &where);
	if (free) {
		if (pair == NULL) {
			/* first free entry for this block pointer */
			zfs_btree_add(&sv->sv_pair, &current);
		} else {
			pair->svbr_refcnt++;
		}
	} else {
		if (pair == NULL) {
			/* block that is currently marked as allocated */
			for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
				if (DVA_IS_EMPTY(&bp->blk_dva[i]))
					break;
				sublivelist_verify_block_t svb = {
				    .svb_dva = bp->blk_dva[i],
				    .svb_allocated_txg = bp->blk_birth
				};

				/*
				 * 'where' is set by the find above, so
				 * add_idx can insert without a second
				 * search.
				 */
				if (zfs_btree_find(&sv->sv_leftover, &svb,
				    &where) == NULL) {
					zfs_btree_add_idx(&sv->sv_leftover,
					    &svb, &where);
				}
			}
		} else {
			/* alloc matches a free entry */
			pair->svbr_refcnt--;
			if (pair->svbr_refcnt == 0) {
				/* all allocs and frees have been matched */
				zfs_btree_remove_idx(&sv->sv_pair, &where);
			}
		}
	}

	return (0);
}
323
/*
 * dsl_deadlist_iterate() callback: verify one sub-livelist.  Builds
 * the FREE-pairing tree, walks the bpobj, then reports any FREEs that
 * never found a matching ALLOC before tearing the tree down.
 * Returns the bpobj iteration error, if any.
 */
static int
sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
{
	int err;
	struct sublivelist_verify *sv = args;

	zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL,
	    sizeof (sublivelist_verify_block_refcnt_t));

	err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
	    sv, NULL);

	/* Anything still in sv_pair is a FREE with no matching ALLOC. */
	sublivelist_verify_block_refcnt_t *e;
	zfs_btree_index_t *cookie = NULL;
	while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
		char blkbuf[BP_SPRINTF_LEN];
		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
		    &e->svbr_blk, B_TRUE);
		(void) printf("\tERROR: %d unmatched FREE(s): %s\n",
		    e->svbr_refcnt, blkbuf);
	}
	zfs_btree_destroy(&sv->sv_pair);

	return (err);
}
349
350 static int
351 livelist_block_compare(const void *larg, const void *rarg)
352 {
353 const sublivelist_verify_block_t *l = larg;
354 const sublivelist_verify_block_t *r = rarg;
355
356 if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva))
357 return (-1);
358 else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva))
359 return (+1);
360
361 if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva))
362 return (-1);
363 else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva))
364 return (+1);
365
366 if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva))
367 return (-1);
368 else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva))
369 return (+1);
370
371 return (0);
372 }
373
/*
 * Check for errors in a livelist while tracking all unfreed ALLOCs in the
 * sublivelist_verify_t: sv->sv_leftover
 *
 * (ll_iter_t callback used with iterate_deleted_livelists().)
 */
static void
livelist_verify(dsl_deadlist_t *dl, void *arg)
{
	sublivelist_verify_t *sv = arg;
	dsl_deadlist_iterate(dl, sublivelist_verify_func, sv);
}
384
/*
 * Check for errors in the livelist entry and discard the intermediary
 * data structures
 *
 * The leftover-ALLOC tree is created only to satisfy
 * sublivelist_verify_func() and is thrown away afterwards; only the
 * error code survives.
 */
static int
sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)
{
	(void) args;
	sublivelist_verify_t sv;
	zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
	    sizeof (sublivelist_verify_block_t));
	int err = sublivelist_verify_func(&sv, dle);
	zfs_btree_clear(&sv.sv_leftover);
	zfs_btree_destroy(&sv.sv_leftover);
	return (err);
}
401
typedef struct metaslab_verify {
	/*
	 * Tree containing all the leftover ALLOCs from the livelists
	 * that are part of this metaslab.
	 */
	zfs_btree_t mv_livelist_allocs;

	/*
	 * Metaslab information.
	 */
	uint64_t mv_vdid;	/* top-level vdev id */
	uint64_t mv_msid;	/* metaslab id within that vdev */
	uint64_t mv_start;	/* metaslab start offset */
	uint64_t mv_end;	/* metaslab end offset (exclusive) */

	/*
	 * What's currently allocated for this metaslab.
	 */
	range_tree_t *mv_allocated;
} metaslab_verify_t;

/* Iterator callback over deleted livelists. */
typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);

/* Callback invoked for each unflushed spacemap-log entry. */
typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg,
    void *arg);

/* Adapter state carrying the per-log txg to a zdb_log_sm_cb_t callback. */
typedef struct unflushed_iter_cb_arg {
	spa_t *uic_spa;
	uint64_t uic_txg;
	void *uic_arg;
	zdb_log_sm_cb_t uic_cb;
} unflushed_iter_cb_arg_t;
434
/*
 * space_map_iterate() thunk: forward the entry to the user callback
 * together with the txg of the log spacemap being walked.
 */
static int
iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)
{
	unflushed_iter_cb_arg_t *uic = arg;
	return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg));
}
441
/*
 * Walk every log spacemap (in spa_sm_logs_by_txg order), invoking cb
 * for each entry.  No-op unless the log_spacemap feature is active.
 * Holds SCL_CONFIG as reader for the duration of the walk.
 */
static void
iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg)
{
	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
		return;

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
		space_map_t *sm = NULL;
		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));

		/* Bundle the caller's cb/arg with this log's txg. */
		unflushed_iter_cb_arg_t uic = {
			.uic_spa = spa,
			.uic_txg = sls->sls_txg,
			.uic_arg = arg,
			.uic_cb = cb
		};
		VERIFY0(space_map_iterate(sm, space_map_length(sm),
		    iterate_through_spacemap_logs_cb, &uic));
		space_map_close(sm);
	}
	spa_config_exit(spa, SCL_CONFIG, FTAG);
}
467
/*
 * A spacemap FREE of [offset, offset + size) happened at 'txg'; report
 * any livelist ALLOC records on this vdev that overlap the range and
 * carry an allocation TXG at or before it — such blocks were freed
 * while their livelist still marks them allocated.
 */
static void
verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg,
    uint64_t offset, uint64_t size)
{
	sublivelist_verify_block_t svb = {{{0}}};
	DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid);
	DVA_SET_OFFSET(&svb.svb_dva, offset);
	DVA_SET_ASIZE(&svb.svb_dva, size);
	zfs_btree_index_t where;
	uint64_t end_offset = offset + size;

	/*
	 * Look for an exact match for spacemap entry in the livelist entries.
	 * Then, look for other livelist entries that fall within the range
	 * of the spacemap entry as it may have been condensed
	 */
	sublivelist_verify_block_t *found =
	    zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where);
	if (found == NULL) {
		/* no exact match: start from the insertion point */
		found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where);
	}
	for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid &&
	    DVA_GET_OFFSET(&found->svb_dva) < end_offset;
	    found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
		if (found->svb_allocated_txg <= txg) {
			(void) printf("ERROR: Livelist ALLOC [%llx:%llx] "
			    "from TXG %llx FREED at TXG %llx\n",
			    (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva),
			    (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva),
			    (u_longlong_t)found->svb_allocated_txg,
			    (u_longlong_t)txg);
		}
	}
}
502
503 static int
504 metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)
505 {
506 metaslab_verify_t *mv = arg;
507 uint64_t offset = sme->sme_offset;
508 uint64_t size = sme->sme_run;
509 uint64_t txg = sme->sme_txg;
510
511 if (sme->sme_type == SM_ALLOC) {
512 if (range_tree_contains(mv->mv_allocated,
513 offset, size)) {
514 (void) printf("ERROR: DOUBLE ALLOC: "
515 "%llu [%llx:%llx] "
516 "%llu:%llu LOG_SM\n",
517 (u_longlong_t)txg, (u_longlong_t)offset,
518 (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
519 (u_longlong_t)mv->mv_msid);
520 } else {
521 range_tree_add(mv->mv_allocated,
522 offset, size);
523 }
524 } else {
525 if (!range_tree_contains(mv->mv_allocated,
526 offset, size)) {
527 (void) printf("ERROR: DOUBLE FREE: "
528 "%llu [%llx:%llx] "
529 "%llu:%llu LOG_SM\n",
530 (u_longlong_t)txg, (u_longlong_t)offset,
531 (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
532 (u_longlong_t)mv->mv_msid);
533 } else {
534 range_tree_remove(mv->mv_allocated,
535 offset, size);
536 }
537 }
538
539 if (sme->sme_type != SM_ALLOC) {
540 /*
541 * If something is freed in the spacemap, verify that
542 * it is not listed as allocated in the livelist.
543 */
544 verify_livelist_allocs(mv, txg, offset, size);
545 }
546 return (0);
547 }
548
549 static int
550 spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme,
551 uint64_t txg, void *arg)
552 {
553 metaslab_verify_t *mv = arg;
554 uint64_t offset = sme->sme_offset;
555 uint64_t vdev_id = sme->sme_vdev;
556
557 vdev_t *vd = vdev_lookup_top(spa, vdev_id);
558
559 /* skip indirect vdevs */
560 if (!vdev_is_concrete(vd))
561 return (0);
562
563 if (vdev_id != mv->mv_vdid)
564 return (0);
565
566 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
567 if (ms->ms_id != mv->mv_msid)
568 return (0);
569
570 if (txg < metaslab_unflushed_txg(ms))
571 return (0);
572
573
574 ASSERT3U(txg, ==, sme->sme_txg);
575 return (metaslab_spacemap_validation_cb(sme, mv));
576 }
577
/*
 * Validate mv's metaslab against every entry in the pool's unflushed
 * log spacemaps.
 */
static void
spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv)
{
	iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv);
}
583
584 static void
585 spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv)
586 {
587 if (sm == NULL)
588 return;
589
590 VERIFY0(space_map_iterate(sm, space_map_length(sm),
591 metaslab_spacemap_validation_cb, mv));
592 }
593
594 static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg);
595
596 /*
597 * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if
598 * they are part of that metaslab (mv_msid).
599 */
600 static void
601 mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
602 {
603 zfs_btree_index_t where;
604 sublivelist_verify_block_t *svb;
605 ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0);
606 for (svb = zfs_btree_first(&sv->sv_leftover, &where);
607 svb != NULL;
608 svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) {
609 if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid)
610 continue;
611
612 if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start &&
613 (DVA_GET_OFFSET(&svb->svb_dva) +
614 DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) {
615 (void) printf("ERROR: Found block that crosses "
616 "metaslab boundary: <%llu:%llx:%llx>\n",
617 (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
618 (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
619 (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
620 continue;
621 }
622
623 if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start)
624 continue;
625
626 if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end)
627 continue;
628
629 if ((DVA_GET_OFFSET(&svb->svb_dva) +
630 DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) {
631 (void) printf("ERROR: Found block that crosses "
632 "metaslab boundary: <%llu:%llx:%llx>\n",
633 (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
634 (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
635 (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
636 continue;
637 }
638
639 zfs_btree_add(&mv->mv_livelist_allocs, svb);
640 }
641
642 for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where);
643 svb != NULL;
644 svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
645 zfs_btree_remove(&sv->sv_leftover, svb);
646 }
647 }
648
/*
 * [Livelist Check]
 * Iterate through all the sublivelists and:
 * - report leftover frees (**)
 * - record leftover ALLOCs together with their TXG [see Cross Check]
 *
 * (**) Note: Double ALLOCs are valid in datasets that have dedup
 *      enabled. Similarly double FREEs are allowed as well but
 *      only if they pair up with a corresponding ALLOC entry once
 *      we are done with our sublivelist iteration.
 *
 * [Spacemap Check]
 * for each metaslab:
 * - iterate over spacemap and then the metaslab's entries in the
 *   spacemap log, then report any double FREEs and ALLOCs (do not
 *   blow up).
 *
 * [Cross Check]
 * After finishing the Livelist Check phase and while being in the
 * Spacemap Check phase, we find all the recorded leftover ALLOCs
 * of the livelist check that are part of the metaslab that we are
 * currently looking at in the Spacemap Check. We report any entries
 * that are marked as ALLOCs in the livelists but have been actually
 * freed (and potentially allocated again) after their TXG stamp in
 * the spacemaps. Also report any ALLOCs from the livelists that
 * belong to indirect vdevs (e.g. their vdev completed removal).
 *
 * Note that this will miss Log Spacemap entries that cancelled each other
 * out before being flushed to the metaslab, so we are not guaranteed
 * to match all erroneous ALLOCs.
 */
static void
livelist_metaslab_validate(spa_t *spa)
{
	(void) printf("Verifying deleted livelist entries\n");

	sublivelist_verify_t sv;
	zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
	    sizeof (sublivelist_verify_block_t));
	iterate_deleted_livelists(spa, livelist_verify, &sv);

	(void) printf("Verifying metaslab entries\n");
	vdev_t *rvd = spa->spa_root_vdev;
	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		if (!vdev_is_concrete(vd))
			continue;

		for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) {
			metaslab_t *m = vd->vdev_ms[mid];

			(void) fprintf(stderr,
			    "\rverifying concrete vdev %llu, "
			    "metaslab %llu of %llu ...",
			    (longlong_t)vd->vdev_id,
			    (longlong_t)mid,
			    (longlong_t)vd->vdev_ms_count);

			uint64_t shift, start;
			range_seg_type_t type =
			    metaslab_calculate_range_tree_type(vd, m,
			    &start, &shift);
			metaslab_verify_t mv;
			mv.mv_allocated = range_tree_create(NULL,
			    type, NULL, start, shift);
			mv.mv_vdid = vd->vdev_id;
			mv.mv_msid = m->ms_id;
			mv.mv_start = m->ms_start;
			mv.mv_end = m->ms_start + m->ms_size;
			zfs_btree_create(&mv.mv_livelist_allocs,
			    livelist_block_compare, NULL,
			    sizeof (sublivelist_verify_block_t));

			/* Claim this metaslab's leftover livelist ALLOCs. */
			mv_populate_livelist_allocs(&mv, &sv);

			/* Replay on-disk spacemap, then the unflushed log. */
			spacemap_check_ms_sm(m->ms_sm, &mv);
			spacemap_check_sm_log(spa, &mv);

			range_tree_vacate(mv.mv_allocated, NULL, NULL);
			range_tree_destroy(mv.mv_allocated);
			zfs_btree_clear(&mv.mv_livelist_allocs);
			zfs_btree_destroy(&mv.mv_livelist_allocs);
		}
	}
	(void) fprintf(stderr, "\n");

	/*
	 * If there are any segments in the leftover tree after we walked
	 * through all the metaslabs in the concrete vdevs then this means
	 * that we have segments in the livelists that belong to indirect
	 * vdevs and are marked as allocated.
	 */
	if (zfs_btree_numnodes(&sv.sv_leftover) == 0) {
		zfs_btree_destroy(&sv.sv_leftover);
		return;
	}
	(void) printf("ERROR: Found livelist blocks marked as allocated "
	    "for indirect vdevs:\n");

	zfs_btree_index_t *where = NULL;
	sublivelist_verify_block_t *svb;
	while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) !=
	    NULL) {
		int vdev_id = DVA_GET_VDEV(&svb->svb_dva);
		ASSERT3U(vdev_id, <, rvd->vdev_children);
		vdev_t *vd = rvd->vdev_child[vdev_id];
		ASSERT(!vdev_is_concrete(vd));
		(void) printf("<%d:%llx:%llx> TXG %llx\n",
		    vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
		    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva),
		    (u_longlong_t)svb->svb_allocated_txg);
	}
	(void) printf("\n");
	zfs_btree_destroy(&sv.sv_leftover);
}
765
/*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
 */
const char *
_umem_debug_init(void)
{
	/* Equivalent to setting $UMEM_DEBUG in the environment. */
	static const char debug_setting[] = "default,verbose";

	return (debug_setting);
}
775
/* libumem hook: default $UMEM_LOGGING configuration. */
const char *
_umem_logging_init(void)
{
	static const char logging_setting[] = "fail,contents";

	return (logging_setting);
}
781
/*
 * Print the full command-line synopsis and option summary to stderr,
 * then exit(1).  Never returns.
 */
static void
usage(void)
{
	/* One synopsis line per major mode of operation. */
	(void) fprintf(stderr,
	    "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] "
	    "[-I <inflight I/Os>]\n"
	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
	    "\t\t[-K <key>]\n"
	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
	    "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] [-K <key>]\n"
	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]\n"
	    "\t%s -B [-e [-V] [-p <path> ...]] [-I <inflight I/Os>]\n"
	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
	    "\t\t[-K <key>] <poolname>/<objset id> [<backupflags>]\n"
	    "\t%s [-v] <bookmark>\n"
	    "\t%s -C [-A] [-U <cache>]\n"
	    "\t%s -l [-Aqu] <device>\n"
	    "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
	    "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
	    "\t%s -O [-K <key>] <dataset> <path>\n"
	    "\t%s -r [-K <key>] <dataset> <path> <destination>\n"
	    "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
	    "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
	    "\t%s -E [-A] word0:word1:...:word15\n"
	    "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
	    "<poolname>\n\n",
	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
	    cmdname, cmdname, cmdname, cmdname, cmdname);

	(void) fprintf(stderr, "    Dataset name must include at least one "
	    "separator character '/' or '@'\n");
	(void) fprintf(stderr, "    If dataset name is specified, only that "
	    "dataset is dumped\n");
	(void) fprintf(stderr, "    If object numbers or object number "
	    "ranges are specified, only those\n"
	    "    objects or ranges are dumped.\n\n");
	(void) fprintf(stderr,
	    "    Object ranges take the form <start>:<end>[:<flags>]\n"
	    "        start    Starting object number\n"
	    "        end      Ending object number, or -1 for no upper bound\n"
	    "        flags    Optional flags to select object types:\n"
	    "            A     All objects (this is the default)\n"
	    "            d     ZFS directories\n"
	    "            f     ZFS files \n"
	    "            m     SPA space maps\n"
	    "            z     ZAPs\n"
	    "            -     Negate effect of next flag\n\n");
	/* Primary (mode-selecting) options. */
	(void) fprintf(stderr, "    Options to control amount of output:\n");
	(void) fprintf(stderr, "        -b --block-stats             "
	    "block statistics\n");
	(void) fprintf(stderr, "        -B --backup                  "
	    "backup stream\n");
	(void) fprintf(stderr, "        -c --checksum                "
	    "checksum all metadata (twice for all data) blocks\n");
	(void) fprintf(stderr, "        -C --config                  "
	    "config (or cachefile if alone)\n");
	(void) fprintf(stderr, "        -d --datasets                "
	    "dataset(s)\n");
	(void) fprintf(stderr, "        -D --dedup-stats             "
	    "dedup statistics\n");
	(void) fprintf(stderr, "        -E --embedded-block-pointer=INTEGER\n"
	    "                                     decode and display block "
	    "from an embedded block pointer\n");
	(void) fprintf(stderr, "        -h --history                 "
	    "pool history\n");
	(void) fprintf(stderr, "        -i --intent-logs             "
	    "intent logs\n");
	(void) fprintf(stderr, "        -l --label                   "
	    "read label contents\n");
	(void) fprintf(stderr, "        -k --checkpointed-state      "
	    "examine the checkpointed state of the pool\n");
	(void) fprintf(stderr, "        -L --disable-leak-tracking   "
	    "disable leak tracking (do not load spacemaps)\n");
	(void) fprintf(stderr, "        -m --metaslabs               "
	    "metaslabs\n");
	(void) fprintf(stderr, "        -M --metaslab-groups         "
	    "metaslab groups\n");
	(void) fprintf(stderr, "        -O --object-lookups          "
	    "perform object lookups by path\n");
	(void) fprintf(stderr, "        -r --copy-object             "
	    "copy an object by path to file\n");
	(void) fprintf(stderr, "        -R --read-block              "
	    "read and display block from a device\n");
	(void) fprintf(stderr, "        -s --io-stats                "
	    "report stats on zdb's I/O\n");
	(void) fprintf(stderr, "        -S --simulate-dedup          "
	    "simulate dedup to measure effect\n");
	(void) fprintf(stderr, "        -v --verbose                 "
	    "verbose (applies to all others)\n");
	(void) fprintf(stderr, "        -y --livelist                "
	    "perform livelist and metaslab validation on any livelists being "
	    "deleted\n\n");
	/* Modifier options that combine with the above. */
	(void) fprintf(stderr, "    Below options are intended for use "
	    "with other options:\n");
	(void) fprintf(stderr, "        -A --ignore-assertions       "
	    "ignore assertions (-A), enable panic recovery (-AA) or both "
	    "(-AAA)\n");
	(void) fprintf(stderr, "        -e --exported                "
	    "pool is exported/destroyed/has altroot/not in a cachefile\n");
	(void) fprintf(stderr, "        -F --automatic-rewind        "
	    "attempt automatic rewind within safe range of transaction "
	    "groups\n");
	(void) fprintf(stderr, "        -G --dump-debug-msg          "
	    "dump zfs_dbgmsg buffer before exiting\n");
	(void) fprintf(stderr, "        -I --inflight=INTEGER        "
	    "specify the maximum number of checksumming I/Os "
	    "[default is 200]\n");
	(void) fprintf(stderr, "        -K --key=KEY                 "
	    "decryption key for encrypted dataset\n");
	(void) fprintf(stderr, "        -o --option=\"OPTION=INTEGER\" "
	    "set global variable to an unsigned 32-bit integer\n");
	(void) fprintf(stderr, "        -p --path==PATH              "
	    "use one or more with -e to specify path to vdev dir\n");
	(void) fprintf(stderr, "        -P --parseable               "
	    "print numbers in parseable form\n");
	(void) fprintf(stderr, "        -q --skip-label              "
	    "don't print label contents\n");
	(void) fprintf(stderr, "        -t --txg=INTEGER             "
	    "highest txg to use when searching for uberblocks\n");
	(void) fprintf(stderr, "        -u --uberblock               "
	    "uberblock\n");
	(void) fprintf(stderr, "        -U --cachefile=PATH          "
	    "use alternate cachefile\n");
	(void) fprintf(stderr, "        -V --verbatim                "
	    "do verbatim import\n");
	(void) fprintf(stderr, "        -x --dump-blocks=PATH        "
	    "dump all read blocks into specified directory\n");
	(void) fprintf(stderr, "        -X --extreme-rewind          "
	    "attempt extreme rewind (does not work with dataset)\n");
	(void) fprintf(stderr, "        -Y --all-reconstruction      "
	    "attempt all reconstruction combinations for split blocks\n");
	(void) fprintf(stderr, "        -Z --zstd-headers            "
	    "show ZSTD headers \n");
	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
	    "to make only that option verbose\n");
	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
	exit(1);
}
920
921 static void
922 dump_debug_buffer(void)
923 {
924 if (dump_opt['G']) {
925 (void) printf("\n");
926 (void) fflush(stdout);
927 zfs_dbgmsg_print("zdb");
928 }
929 }
930
/*
 * Called for usage errors that are discovered after a call to spa_open(),
 * dmu_bonus_hold(), or pool_match(). abort() is called for other errors.
 *
 * Prints "<cmdname>: <formatted message>" to stderr and exits with
 * status 1.  Never returns.
 */

static void
fatal(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	(void) fprintf(stderr, "%s: ", cmdname);
	(void) vfprintf(stderr, fmt, ap);
	va_end(ap);
	(void) fprintf(stderr, "\n");

	/* Emit the -G debug buffer (if requested) before dying. */
	dump_debug_buffer();

	exit(1);
}
951
952 static void
953 dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
954 {
955 (void) size;
956 nvlist_t *nv;
957 size_t nvsize = *(uint64_t *)data;
958 char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
959
960 VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
961
962 VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
963
964 umem_free(packed, nvsize);
965
966 dump_nvlist(nv, 8);
967
968 nvlist_free(nv);
969 }
970
971 static void
972 dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
973 {
974 (void) os, (void) object, (void) size;
975 spa_history_phys_t *shp = data;
976
977 if (shp == NULL)
978 return;
979
980 (void) printf("\t\tpool_create_len = %llu\n",
981 (u_longlong_t)shp->sh_pool_create_len);
982 (void) printf("\t\tphys_max_off = %llu\n",
983 (u_longlong_t)shp->sh_phys_max_off);
984 (void) printf("\t\tbof = %llu\n",
985 (u_longlong_t)shp->sh_bof);
986 (void) printf("\t\teof = %llu\n",
987 (u_longlong_t)shp->sh_eof);
988 (void) printf("\t\trecords_lost = %llu\n",
989 (u_longlong_t)shp->sh_records_lost);
990 }
991
992 static void
993 zdb_nicenum(uint64_t num, char *buf, size_t buflen)
994 {
995 if (dump_opt['P'])
996 (void) snprintf(buf, buflen, "%llu", (longlong_t)num);
997 else
998 nicenum(num, buf, buflen);
999 }
1000
/* Bar used to render histogram rows; width excludes the NUL terminator. */
static const char histo_stars[] = "****************************************";
static const uint64_t histo_width = sizeof (histo_stars) - 1;

/*
 * Print histo[0..size-1] as an ASCII bar chart, one row per bucket
 * between the first and last non-zero entries.  Bucket labels are
 * shifted by `offset` (e.g. a space map histogram starts at sm_shift).
 */
static void
dump_histogram(const uint64_t *histo, int size, int offset)
{
	int i;
	int minidx = size - 1;
	int maxidx = 0;
	uint64_t max = 0;

	/* Find the largest count and the range of non-zero buckets. */
	for (i = 0; i < size; i++) {
		if (histo[i] == 0)
			continue;
		if (histo[i] > max)
			max = histo[i];
		if (i > maxidx)
			maxidx = i;
		if (i < minidx)
			minidx = i;
	}

	/* Scale so the largest row uses at most histo_width stars. */
	if (max < histo_width)
		max = histo_width;

	for (i = minidx; i <= maxidx; i++) {
		(void) printf("\t\t\t%3u: %6llu %s\n",
		    i + offset, (u_longlong_t)histo[i],
		    &histo_stars[(max - histo[i]) * histo_width / max]);
	}
}
1032
/*
 * Print ZAP layout statistics for `object`: a one-line summary for a
 * microzap, or pointer-table/leaf/entry details plus distribution
 * histograms for a fat ZAP.  Returns silently if stats are unavailable.
 */
static void
dump_zap_stats(objset_t *os, uint64_t object)
{
	int error;
	zap_stats_t zs;

	error = zap_get_stats(os, object, &zs);
	if (error)
		return;

	/* A zero-length pointer table means this is a microzap. */
	if (zs.zs_ptrtbl_len == 0) {
		ASSERT(zs.zs_num_blocks == 1);
		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
		    (u_longlong_t)zs.zs_blocksize,
		    (u_longlong_t)zs.zs_num_entries);
		return;
	}

	(void) printf("\tFat ZAP stats:\n");

	(void) printf("\t\tPointer table:\n");
	(void) printf("\t\t\t%llu elements\n",
	    (u_longlong_t)zs.zs_ptrtbl_len);
	(void) printf("\t\t\tzt_blk: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_blk);
	(void) printf("\t\t\tzt_numblks: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
	(void) printf("\t\t\tzt_shift: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_shift);
	(void) printf("\t\t\tzt_blks_copied: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_blks_copied);
	(void) printf("\t\t\tzt_nextblk: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_nextblk);

	(void) printf("\t\tZAP entries: %llu\n",
	    (u_longlong_t)zs.zs_num_entries);
	(void) printf("\t\tLeaf blocks: %llu\n",
	    (u_longlong_t)zs.zs_num_leafs);
	(void) printf("\t\tTotal blocks: %llu\n",
	    (u_longlong_t)zs.zs_num_blocks);
	(void) printf("\t\tzap_block_type: 0x%llx\n",
	    (u_longlong_t)zs.zs_block_type);
	(void) printf("\t\tzap_magic: 0x%llx\n",
	    (u_longlong_t)zs.zs_magic);
	(void) printf("\t\tzap_salt: 0x%llx\n",
	    (u_longlong_t)zs.zs_salt);

	(void) printf("\t\tLeafs with 2^n pointers:\n");
	dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBlocks with n*5 entries:\n");
	dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBlocks n/10 full:\n");
	dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tEntries with n chunks:\n");
	dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBuckets with n entries:\n");
	dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
}
1095
1096 static void
1097 dump_none(objset_t *os, uint64_t object, void *data, size_t size)
1098 {
1099 (void) os, (void) object, (void) data, (void) size;
1100 }
1101
1102 static void
1103 dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
1104 {
1105 (void) os, (void) object, (void) data, (void) size;
1106 (void) printf("\tUNKNOWN OBJECT TYPE\n");
1107 }
1108
1109 static void
1110 dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
1111 {
1112 (void) os, (void) object, (void) data, (void) size;
1113 }
1114
/*
 * Object viewer for arrays of uint64_t, shown only at -dddddd or
 * higher.  If no cached/bonus data was supplied, the object contents
 * are read directly; in both cases printing is capped at 1 MiB.
 */
static void
dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
{
	uint64_t *arr;
	uint64_t oursize;
	if (dump_opt['d'] < 6)
		return;

	if (data == NULL) {
		dmu_object_info_t doi;

		VERIFY0(dmu_object_info(os, object, &doi));
		size = doi.doi_max_offset;
		/*
		 * We cap the size at 1 mebibyte here to prevent
		 * allocation failures and nigh-infinite printing if the
		 * object is extremely large.
		 */
		oursize = MIN(size, 1 << 20);
		arr = kmem_alloc(oursize, KM_SLEEP);

		int err = dmu_read(os, object, 0, oursize, arr, 0);
		if (err != 0) {
			(void) printf("got error %u from dmu_read\n", err);
			kmem_free(arr, oursize);
			return;
		}
	} else {
		/*
		 * Even though the allocation is already done in this code path,
		 * we still cap the size to prevent excessive printing.
		 */
		oursize = MIN(size, 1 << 20);
		arr = data;
	}

	/* Empty object: print [] and release the buffer if we own it. */
	if (size == 0) {
		if (data == NULL)
			kmem_free(arr, oursize);
		(void) printf("\t\t[]\n");
		return;
	}

	(void) printf("\t\t[%0llx", (u_longlong_t)arr[0]);
	/* Four comma-separated values per output line. */
	for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) {
		if (i % 4 != 0)
			(void) printf(", %0llx", (u_longlong_t)arr[i]);
		else
			(void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]);
	}
	/* An ellipsis marks output truncated by the 1 MiB cap. */
	if (oursize != size)
		(void) printf(", ... ");
	(void) printf("]\n");

	if (data == NULL)
		kmem_free(arr, oursize);
}
1172
/*
 * Object viewer for a generic ZAP: print its layout stats, then every
 * attribute with its value.  DSL crypto key material and the pool
 * checksum salt are rendered as hex rather than treated as strings.
 */
static void
dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t attr;
	void *prop;
	unsigned i;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = ", attr.za_name);
		if (attr.za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}
		prop = umem_zalloc(attr.za_num_integers *
		    attr.za_integer_length, UMEM_NOFAIL);
		(void) zap_lookup(os, object, attr.za_name,
		    attr.za_integer_length, attr.za_num_integers, prop);
		if (attr.za_integer_length == 1) {
			/*
			 * Byte arrays holding key material are raw
			 * bytes, not NUL-terminated text: dump as hex.
			 */
			if (strcmp(attr.za_name,
			    DSL_CRYPTO_KEY_MASTER_KEY) == 0 ||
			    strcmp(attr.za_name,
			    DSL_CRYPTO_KEY_HMAC_KEY) == 0 ||
			    strcmp(attr.za_name, DSL_CRYPTO_KEY_IV) == 0 ||
			    strcmp(attr.za_name, DSL_CRYPTO_KEY_MAC) == 0 ||
			    strcmp(attr.za_name, DMU_POOL_CHECKSUM_SALT) == 0) {
				uint8_t *u8 = prop;

				for (i = 0; i < attr.za_num_integers; i++) {
					(void) printf("%02x", u8[i]);
				}
			} else {
				(void) printf("%s", (char *)prop);
			}
		} else {
			/* Wider integers print one value per element. */
			for (i = 0; i < attr.za_num_integers; i++) {
				switch (attr.za_integer_length) {
				case 2:
					(void) printf("%u ",
					    ((uint16_t *)prop)[i]);
					break;
				case 4:
					(void) printf("%u ",
					    ((uint32_t *)prop)[i]);
					break;
				case 8:
					(void) printf("%lld ",
					    (u_longlong_t)((int64_t *)prop)[i]);
					break;
				}
			}
		}
		(void) printf("\n");
		umem_free(prop, attr.za_num_integers * attr.za_integer_length);
	}
	zap_cursor_fini(&zc);
}
1236
/*
 * Object viewer for a bpobj: print the counters recorded in the bonus
 * buffer (which fields exist depends on the bpobj version, inferred
 * from the bonus size), and with -ddddd every block pointer as well.
 */
static void
dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
{
	bpobj_phys_t *bpop = data;
	uint64_t i;
	char bytes[32], comp[32], uncomp[32];

	/* make sure the output won't get truncated */
	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");

	if (bpop == NULL)
		return;

	zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
	zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
	zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));

	(void) printf("\t\tnum_blkptrs = %llu\n",
	    (u_longlong_t)bpop->bpo_num_blkptrs);
	(void) printf("\t\tbytes = %s\n", bytes);
	/* V1 adds compressed/uncompressed totals. */
	if (size >= BPOBJ_SIZE_V1) {
		(void) printf("\t\tcomp = %s\n", comp);
		(void) printf("\t\tuncomp = %s\n", uncomp);
	}
	/* V2 adds the sub-bpobj list. */
	if (size >= BPOBJ_SIZE_V2) {
		(void) printf("\t\tsubobjs = %llu\n",
		    (u_longlong_t)bpop->bpo_subobjs);
		(void) printf("\t\tnum_subobjs = %llu\n",
		    (u_longlong_t)bpop->bpo_num_subobjs);
	}
	if (size >= sizeof (*bpop)) {
		(void) printf("\t\tnum_freed = %llu\n",
		    (u_longlong_t)bpop->bpo_num_freed);
	}

	if (dump_opt['d'] < 5)
		return;

	/* At high verbosity, read and print each block pointer. */
	for (i = 0; i < bpop->bpo_num_blkptrs; i++) {
		char blkbuf[BP_SPRINTF_LEN];
		blkptr_t bp;

		int err = dmu_read(os, object,
		    i * sizeof (bp), sizeof (bp), &bp, 0);
		if (err != 0) {
			(void) printf("got error %u from dmu_read\n", err);
			break;
		}
		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp,
		    BP_GET_FREE(&bp));
		(void) printf("\t%s\n", blkbuf);
	}
}
1292
1293 static void
1294 dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
1295 {
1296 (void) data, (void) size;
1297 dmu_object_info_t doi;
1298 int64_t i;
1299
1300 VERIFY0(dmu_object_info(os, object, &doi));
1301 uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);
1302
1303 int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
1304 if (err != 0) {
1305 (void) printf("got error %u from dmu_read\n", err);
1306 kmem_free(subobjs, doi.doi_max_offset);
1307 return;
1308 }
1309
1310 int64_t last_nonzero = -1;
1311 for (i = 0; i < doi.doi_max_offset / 8; i++) {
1312 if (subobjs[i] != 0)
1313 last_nonzero = i;
1314 }
1315
1316 for (i = 0; i <= last_nonzero; i++) {
1317 (void) printf("\t%llu\n", (u_longlong_t)subobjs[i]);
1318 }
1319 kmem_free(subobjs, doi.doi_max_offset);
1320 }
1321
1322 static void
1323 dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
1324 {
1325 (void) data, (void) size;
1326 dump_zap_stats(os, object);
1327 /* contents are printed elsewhere, properly decoded */
1328 }
1329
/*
 * Object viewer for the SA attribute registry ZAP: each entry maps an
 * attribute name to a packed value encoding its length, byteswap
 * function index, and attribute number.
 */
static void
dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t attr;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = ", attr.za_name);
		if (attr.za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}
		/* Decode the packed registration: [length:bswap:attr-num]. */
		(void) printf(" %llx : [%d:%d:%d]\n",
		    (u_longlong_t)attr.za_first_integer,
		    (int)ATTR_LENGTH(attr.za_first_integer),
		    (int)ATTR_BSWAP(attr.za_first_integer),
		    (int)ATTR_NUM(attr.za_first_integer));
	}
	zap_cursor_fini(&zc);
}
1356
/*
 * Object viewer for the SA layout ZAP: each entry is a layout whose
 * value is an array of 16-bit attribute numbers, printed in order.
 */
static void
dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t attr;
	uint16_t *layout_attrs;
	unsigned i;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = [", attr.za_name);
		if (attr.za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}

		/* Layout values are always arrays of uint16_t. */
		VERIFY(attr.za_integer_length == 2);
		layout_attrs = umem_zalloc(attr.za_num_integers *
		    attr.za_integer_length, UMEM_NOFAIL);

		VERIFY(zap_lookup(os, object, attr.za_name,
		    attr.za_integer_length,
		    attr.za_num_integers, layout_attrs) == 0);

		for (i = 0; i != attr.za_num_integers; i++)
			(void) printf(" %d ", (int)layout_attrs[i]);
		(void) printf("]\n");
		umem_free(layout_attrs,
		    attr.za_num_integers * attr.za_integer_length);
	}
	zap_cursor_fini(&zc);
}
1394
/*
 * Object viewer for a ZPL directory ZAP: each entry maps a name to a
 * value packing the target object number and an entry type, which is
 * translated to a human-readable name via the table below.
 */
static void
dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t attr;
	const char *typenames[] = {
		/* 0 */ "not specified",
		/* 1 */ "FIFO",
		/* 2 */ "Character Device",
		/* 3 */ "3 (invalid)",
		/* 4 */ "Directory",
		/* 5 */ "5 (invalid)",
		/* 6 */ "Block Device",
		/* 7 */ "7 (invalid)",
		/* 8 */ "Regular File",
		/* 9 */ "9 (invalid)",
		/* 10 */ "Symbolic Link",
		/* 11 */ "11 (invalid)",
		/* 12 */ "Socket",
		/* 13 */ "Door",
		/* 14 */ "Event Port",
		/* 15 */ "15 (invalid)",
	};

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = %lld (type: %s)\n",
		    attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
		    typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
	}
	zap_cursor_fini(&zc);
}
1432
1433 static int
1434 get_dtl_refcount(vdev_t *vd)
1435 {
1436 int refcount = 0;
1437
1438 if (vd->vdev_ops->vdev_op_leaf) {
1439 space_map_t *sm = vd->vdev_dtl_sm;
1440
1441 if (sm != NULL &&
1442 sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
1443 return (1);
1444 return (0);
1445 }
1446
1447 for (unsigned c = 0; c < vd->vdev_children; c++)
1448 refcount += get_dtl_refcount(vd->vdev_child[c]);
1449 return (refcount);
1450 }
1451
1452 static int
1453 get_metaslab_refcount(vdev_t *vd)
1454 {
1455 int refcount = 0;
1456
1457 if (vd->vdev_top == vd) {
1458 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
1459 space_map_t *sm = vd->vdev_ms[m]->ms_sm;
1460
1461 if (sm != NULL &&
1462 sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
1463 refcount++;
1464 }
1465 }
1466 for (unsigned c = 0; c < vd->vdev_children; c++)
1467 refcount += get_metaslab_refcount(vd->vdev_child[c]);
1468
1469 return (refcount);
1470 }
1471
/*
 * Count, over the vdev tree rooted at vd, the obsolete space maps of
 * top-level vdevs whose bonus buffer is the full space_map_phys_t size
 * (histogram-capable, and thus holding a feature reference).
 */
static int
get_obsolete_refcount(vdev_t *vd)
{
	uint64_t obsolete_sm_object;
	int refcount = 0;

	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	if (vd->vdev_top == vd && obsolete_sm_object != 0) {
		dmu_object_info_t doi;
		VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
		    obsolete_sm_object, &doi));
		if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
			refcount++;
		}
	} else {
		/* Non-top vdevs must not have an obsolete space map. */
		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
		ASSERT3U(obsolete_sm_object, ==, 0);
	}
	for (unsigned c = 0; c < vd->vdev_children; c++) {
		refcount += get_obsolete_refcount(vd->vdev_child[c]);
	}

	return (refcount);
}
1496
1497 static int
1498 get_prev_obsolete_spacemap_refcount(spa_t *spa)
1499 {
1500 uint64_t prev_obj =
1501 spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;
1502 if (prev_obj != 0) {
1503 dmu_object_info_t doi;
1504 VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi));
1505 if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
1506 return (1);
1507 }
1508 }
1509 return (0);
1510 }
1511
1512 static int
1513 get_checkpoint_refcount(vdev_t *vd)
1514 {
1515 int refcount = 0;
1516
1517 if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&
1518 zap_contains(spa_meta_objset(vd->vdev_spa),
1519 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
1520 refcount++;
1521
1522 for (uint64_t c = 0; c < vd->vdev_children; c++)
1523 refcount += get_checkpoint_refcount(vd->vdev_child[c]);
1524
1525 return (refcount);
1526 }
1527
/*
 * One reference is held per log space map: count the entries in the
 * spa's log-space-map-by-txg tree.
 */
static int
get_log_spacemap_refcount(spa_t *spa)
{
	return (avl_numnodes(&spa->spa_sm_logs_by_txg));
}
1533
/*
 * Cross-check the SPACEMAP_HISTOGRAM feature refcount against the
 * number of histogram-capable space maps actually present (DTLs,
 * metaslabs, obsolete maps, checkpoint maps, and log space maps).
 * Returns 0 on match, 2 on mismatch.
 */
static int
verify_spacemap_refcounts(spa_t *spa)
{
	uint64_t expected_refcount = 0;
	uint64_t actual_refcount;

	(void) feature_get_refcount(spa,
	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
	    &expected_refcount);
	actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
	actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
	actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
	actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
	actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);
	actual_refcount += get_log_spacemap_refcount(spa);

	if (expected_refcount != actual_refcount) {
		(void) printf("space map refcount mismatch: expected %lld != "
		    "actual %lld\n",
		    (longlong_t)expected_refcount,
		    (longlong_t)actual_refcount);
		return (2);
	}
	return (0);
}
1559
/*
 * Print a space map: object number and header fields always, and at
 * -dddddd / -mmmm every entry, decoded.  Entries are either one-word
 * debug/padding records or one- and two-word alloc/free ranges; the
 * running allocated total is cross-checked against the header.
 */
static void
dump_spacemap(objset_t *os, space_map_t *sm)
{
	const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
	    "INVALID", "INVALID", "INVALID", "INVALID" };

	if (sm == NULL)
		return;

	(void) printf("space map object %llu:\n",
	    (longlong_t)sm->sm_object);
	(void) printf("  smp_length = 0x%llx\n",
	    (longlong_t)sm->sm_phys->smp_length);
	(void) printf("  smp_alloc = 0x%llx\n",
	    (longlong_t)sm->sm_phys->smp_alloc);

	if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
		return;

	/*
	 * Print out the freelist entries in both encoded and decoded form.
	 */
	uint8_t mapshift = sm->sm_shift;
	int64_t alloc = 0;
	uint64_t word, entry_id = 0;
	for (uint64_t offset = 0; offset < space_map_length(sm);
	    offset += sizeof (word)) {

		VERIFY0(dmu_read(os, space_map_object(sm), offset,
		    sizeof (word), &word, DMU_READ_PREFETCH));

		if (sm_entry_is_debug(word)) {
			/* A debug entry with txg 0 is padding. */
			uint64_t de_txg = SM_DEBUG_TXG_DECODE(word);
			uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word);
			if (de_txg == 0) {
				(void) printf(
				    "\t    [%6llu] PADDING\n",
				    (u_longlong_t)entry_id);
			} else {
				(void) printf(
				    "\t    [%6llu] %s: txg %llu pass %llu\n",
				    (u_longlong_t)entry_id,
				    ddata[SM_DEBUG_ACTION_DECODE(word)],
				    (u_longlong_t)de_txg,
				    (u_longlong_t)de_sync_pass);
			}
			entry_id++;
			continue;
		}

		uint8_t words;
		char entry_type;
		uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;

		if (sm_entry_is_single_word(word)) {
			entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
			    'A' : 'F';
			entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
			    sm->sm_start;
			entry_run = SM_RUN_DECODE(word) << mapshift;
			words = 1;
		} else {
			/* it is a two-word entry so we read another word */
			ASSERT(sm_entry_is_double_word(word));

			uint64_t extra_word;
			offset += sizeof (extra_word);
			VERIFY0(dmu_read(os, space_map_object(sm), offset,
			    sizeof (extra_word), &extra_word,
			    DMU_READ_PREFETCH));

			ASSERT3U(offset, <=, space_map_length(sm));

			entry_run = SM2_RUN_DECODE(word) << mapshift;
			entry_vdev = SM2_VDEV_DECODE(word);
			entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
			    'A' : 'F';
			entry_off = (SM2_OFFSET_DECODE(extra_word) <<
			    mapshift) + sm->sm_start;
			words = 2;
		}

		(void) printf("\t    [%6llu]    %c  range:"
		    " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
		    (u_longlong_t)entry_id,
		    entry_type, (u_longlong_t)entry_off,
		    (u_longlong_t)(entry_off + entry_run),
		    (u_longlong_t)entry_run,
		    (u_longlong_t)entry_vdev, words);

		/* Track net allocations to verify against the header. */
		if (entry_type == 'A')
			alloc += entry_run;
		else
			alloc -= entry_run;
		entry_id++;
	}
	if (alloc != space_map_allocated(sm)) {
		(void) printf("space_map_object alloc (%lld) INCONSISTENT "
		    "with space map summary (%lld)\n",
		    (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
	}
}
1662
/*
 * Print in-memory statistics for a loaded metaslab: segment count,
 * largest allocatable segment, free percentage, and the in-memory
 * free-range histogram.  Caller must hold ms_lock with the metaslab
 * loaded (see dump_metaslab).
 */
static void
dump_metaslab_stats(metaslab_t *msp)
{
	char maxbuf[32];
	range_tree_t *rt = msp->ms_allocatable;
	zfs_btree_t *t = &msp->ms_allocatable_by_size;
	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (maxbuf) >= NN_NUMBUF_SZ, "maxbuf truncated");

	zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf));

	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
	    "segments", zfs_btree_numnodes(t), "maxsize", maxbuf,
	    "freepct", free_pct);
	(void) printf("\tIn-memory histogram:\n");
	dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}
1682
/*
 * Print one metaslab: its summary line always; with -mmm (unless -L)
 * load it and show in-memory stats; with -mm the on-disk histogram;
 * and the space map contents at sufficient verbosity.
 */
static void
dump_metaslab(metaslab_t *msp)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	space_map_t *sm = msp->ms_sm;
	char freebuf[32];

	zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf,
	    sizeof (freebuf));

	(void) printf(
	    "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
	    (u_longlong_t)space_map_object(sm), freebuf);

	/* -L skips loading (leak detection disabled). */
	if (dump_opt['m'] > 2 && !dump_opt['L']) {
		mutex_enter(&msp->ms_lock);
		VERIFY0(metaslab_load(msp));
		range_tree_stat_verify(msp->ms_allocatable);
		dump_metaslab_stats(msp);
		metaslab_unload(msp);
		mutex_exit(&msp->ms_lock);
	}

	if (dump_opt['m'] > 1 && sm != NULL &&
	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
		/*
		 * The space map histogram represents free space in chunks
		 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
		 */
		(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
		    (u_longlong_t)msp->ms_fragmentation);
		dump_histogram(sm->sm_phys->smp_histogram,
		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
	}

	/* dRAID metaslabs may be shorter than the full 2^ms_shift. */
	if (vd->vdev_ops == &vdev_draid_ops)
		ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
	else
		ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift);

	dump_spacemap(spa->spa_meta_objset, msp->ms_sm);

	if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
		(void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n",
		    (u_longlong_t)metaslab_unflushed_txg(msp));
	}
}
1732
/*
 * Print the per-vdev header for metaslab listings: vdev id, allocation
 * bias (log/special/dedup), the ms_unflushed_phys object if present,
 * and the column headings for the metaslab lines that follow.
 */
static void
print_vdev_metaslab_header(vdev_t *vd)
{
	vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
	const char *bias_str = "";
	if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) {
		bias_str = VDEV_ALLOC_BIAS_LOG;
	} else if (alloc_bias == VDEV_BIAS_SPECIAL) {
		bias_str = VDEV_ALLOC_BIAS_SPECIAL;
	} else if (alloc_bias == VDEV_BIAS_DEDUP) {
		bias_str = VDEV_ALLOC_BIAS_DEDUP;
	}

	/* ENOENT just means the entry doesn't exist; anything else is fatal. */
	uint64_t ms_flush_data_obj = 0;
	if (vd->vdev_top_zap != 0) {
		int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
		    vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
		    sizeof (uint64_t), 1, &ms_flush_data_obj);
		if (error != ENOENT) {
			ASSERT0(error);
		}
	}

	(void) printf("\tvdev %10llu   %s",
	    (u_longlong_t)vd->vdev_id, bias_str);

	if (ms_flush_data_obj != 0) {
		(void) printf("   ms_unflushed_phys object %llu",
		    (u_longlong_t)ms_flush_data_obj);
	}

	(void) printf("\n\t%-10s%5llu   %-19s   %-15s   %-12s\n",
	    "metaslabs", (u_longlong_t)vd->vdev_ms_count,
	    "offset", "spacemap", "free");
	(void) printf("\t%15s   %19s   %15s   %12s\n",
	    "---------------", "-------------------",
	    "---------------", "------------");
}
1771
/*
 * Print per-vdev and pool-wide fragmentation for the normal class
 * (and the special class when show_special is set), along with
 * free-space histograms.
 */
static void
dump_metaslab_groups(spa_t *spa, boolean_t show_special)
{
	vdev_t *rvd = spa->spa_root_vdev;
	metaslab_class_t *mc = spa_normal_class(spa);
	metaslab_class_t *smc = spa_special_class(spa);
	uint64_t fragmentation;

	metaslab_class_histogram_verify(mc);

	for (unsigned c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/* Skip vdevs not in the class(es) being reported. */
		if (mg == NULL || (mg->mg_class != mc &&
		    (!show_special || mg->mg_class != smc)))
			continue;

		metaslab_group_histogram_verify(mg);
		mg->mg_fragmentation = metaslab_group_fragmentation(mg);

		(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
		    "fragmentation",
		    (u_longlong_t)tvd->vdev_id,
		    (u_longlong_t)tvd->vdev_ms_count);
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			(void) printf("%3s\n", "-");
		} else {
			(void) printf("%3llu%%\n",
			    (u_longlong_t)mg->mg_fragmentation);
		}
		dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
	}

	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
	fragmentation = metaslab_class_fragmentation(mc);
	if (fragmentation == ZFS_FRAG_INVALID)
		(void) printf("\t%3s\n", "-");
	else
		(void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
	dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}
1814
1815 static void
1816 print_vdev_indirect(vdev_t *vd)
1817 {
1818 vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
1819 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
1820 vdev_indirect_births_t *vib = vd->vdev_indirect_births;
1821
1822 if (vim == NULL) {
1823 ASSERT3P(vib, ==, NULL);
1824 return;
1825 }
1826
1827 ASSERT3U(vdev_indirect_mapping_object(vim), ==,
1828 vic->vic_mapping_object);
1829 ASSERT3U(vdev_indirect_births_object(vib), ==,
1830 vic->vic_births_object);
1831
1832 (void) printf("indirect births obj %llu:\n",
1833 (longlong_t)vic->vic_births_object);
1834 (void) printf(" vib_count = %llu\n",
1835 (longlong_t)vdev_indirect_births_count(vib));
1836 for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) {
1837 vdev_indirect_birth_entry_phys_t *cur_vibe =
1838 &vib->vib_entries[i];
1839 (void) printf("\toffset %llx -> txg %llu\n",
1840 (longlong_t)cur_vibe->vibe_offset,
1841 (longlong_t)cur_vibe->vibe_phys_birth_txg);
1842 }
1843 (void) printf("\n");
1844
1845 (void) printf("indirect mapping obj %llu:\n",
1846 (longlong_t)vic->vic_mapping_object);
1847 (void) printf(" vim_max_offset = 0x%llx\n",
1848 (longlong_t)vdev_indirect_mapping_max_offset(vim));
1849 (void) printf(" vim_bytes_mapped = 0x%llx\n",
1850 (longlong_t)vdev_indirect_mapping_bytes_mapped(vim));
1851 (void) printf(" vim_count = %llu\n",
1852 (longlong_t)vdev_indirect_mapping_num_entries(vim));
1853
1854 if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3)
1855 return;
1856
1857 uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);
1858
1859 for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
1860 vdev_indirect_mapping_entry_phys_t *vimep =
1861 &vim->vim_entries[i];
1862 (void) printf("\t<%llx:%llx:%llx> -> "
1863 "<%llx:%llx:%llx> (%x obsolete)\n",
1864 (longlong_t)vd->vdev_id,
1865 (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
1866 (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
1867 (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst),
1868 (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst),
1869 (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
1870 counts[i]);
1871 }
1872 (void) printf("\n");
1873
1874 uint64_t obsolete_sm_object;
1875 VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
1876 if (obsolete_sm_object != 0) {
1877 objset_t *mos = vd->vdev_spa->spa_meta_objset;
1878 (void) printf("obsolete space map object %llu:\n",
1879 (u_longlong_t)obsolete_sm_object);
1880 ASSERT(vd->vdev_obsolete_sm != NULL);
1881 ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==,
1882 obsolete_sm_object);
1883 dump_spacemap(mos, vd->vdev_obsolete_sm);
1884 (void) printf("\n");
1885 }
1886 }
1887
/*
 * Print the metaslabs of every top-level vdev, or only those selected
 * on the command line (-m vdev [metaslab ...]): zopt_metaslab[0] is a
 * vdev id, any further arguments are metaslab numbers within it.
 */
static void
dump_metaslabs(spa_t *spa)
{
	vdev_t *vd, *rvd = spa->spa_root_vdev;
	uint64_t m, c = 0, children = rvd->vdev_children;

	(void) printf("\nMetaslabs:\n");

	if (!dump_opt['d'] && zopt_metaslab_args > 0) {
		c = zopt_metaslab[0];

		if (c >= children)
			(void) fatal("bad vdev id: %llu", (u_longlong_t)c);

		/* Specific metaslabs were requested: print just those. */
		if (zopt_metaslab_args > 1) {
			vd = rvd->vdev_child[c];
			print_vdev_metaslab_header(vd);

			for (m = 1; m < zopt_metaslab_args; m++) {
				if (zopt_metaslab[m] < vd->vdev_ms_count)
					dump_metaslab(
					    vd->vdev_ms[zopt_metaslab[m]]);
				else
					(void) fprintf(stderr, "bad metaslab "
					    "number %llu\n",
					    (u_longlong_t)zopt_metaslab[m]);
			}
			(void) printf("\n");
			return;
		}
		/* Only a vdev was given: restrict the loop to it. */
		children = c + 1;
	}
	for (; c < children; c++) {
		vd = rvd->vdev_child[c];
		print_vdev_metaslab_header(vd);

		print_vdev_indirect(vd);

		for (m = 0; m < vd->vdev_ms_count; m++)
			dump_metaslab(vd->vdev_ms[m]);
		(void) printf("\n");
	}
}
1931
/*
 * Print every log space map in the pool in txg order; only meaningful
 * when the LOG_SPACEMAP feature is active.
 */
static void
dump_log_spacemaps(spa_t *spa)
{
	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
		return;

	(void) printf("\nLog Space Maps in Pool:\n");
	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
		space_map_t *sm = NULL;
		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));

		(void) printf("Log Spacemap object %llu txg %llu\n",
		    (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg);
		dump_spacemap(spa->spa_meta_objset, sm);
		space_map_close(sm);
	}
	(void) printf("\n");
}
1952
/*
 * Print every allocated physical variant (ditto/single/double/triple)
 * of one DDT entry as a block pointer, with its index and refcount.
 */
static void
dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
{
	const ddt_phys_t *ddp = dde->dde_phys;
	const ddt_key_t *ddk = &dde->dde_key;
	const char *types[4] = { "ditto", "single", "double", "triple" };
	char blkbuf[BP_SPRINTF_LEN];
	blkptr_t blk;
	int p;

	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		/* A zero birth txg means this variant is not allocated. */
		if (ddp->ddp_phys_birth == 0)
			continue;
		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
		(void) printf("index %llx refcnt %llu %s %s\n",
		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
		    types[p], blkbuf);
	}
}
1973
1974 static void
1975 dump_dedup_ratio(const ddt_stat_t *dds)
1976 {
1977 double rL, rP, rD, D, dedup, compress, copies;
1978
1979 if (dds->dds_blocks == 0)
1980 return;
1981
1982 rL = (double)dds->dds_ref_lsize;
1983 rP = (double)dds->dds_ref_psize;
1984 rD = (double)dds->dds_ref_dsize;
1985 D = (double)dds->dds_dsize;
1986
1987 dedup = rD / D;
1988 compress = rL / rP;
1989 copies = rD / rP;
1990
1991 (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
1992 "dedup * compress / copies = %.2f\n\n",
1993 dedup, compress, copies, dedup * compress / copies);
1994 }
1995
/*
 * Print one DDT object (per checksum/type/class): entry count and
 * average on-disk/in-core footprint, then, as -D verbosity increases,
 * its histogram and finally the full entry contents.
 */
static void
dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
{
	char name[DDT_NAMELEN];
	ddt_entry_t dde;
	uint64_t walk = 0;
	dmu_object_info_t doi;
	uint64_t count, dspace, mspace;
	int error;

	error = ddt_object_info(ddt, type, class, &doi);

	/* ENOENT simply means this DDT object doesn't exist. */
	if (error == ENOENT)
		return;
	ASSERT(error == 0);

	error = ddt_object_count(ddt, type, class, &count);
	ASSERT(error == 0);
	if (count == 0)
		return;

	/* Totals: 512-byte sectors on disk, filled blocks in core. */
	dspace = doi.doi_physical_blocks_512 << 9;
	mspace = doi.doi_fill_count * doi.doi_data_block_size;

	ddt_object_name(ddt, type, class, name);

	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
	    name,
	    (u_longlong_t)count,
	    (u_longlong_t)(dspace / count),
	    (u_longlong_t)(mspace / count));

	if (dump_opt['D'] < 3)
		return;

	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);

	if (dump_opt['D'] < 4)
		return;

	/* The unique class is the largest; only walk it at -DDDDD. */
	if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
		return;

	(void) printf("%s contents:\n\n", name);

	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
		dump_dde(ddt, &dde, walk);

	ASSERT3U(error, ==, ENOENT);

	(void) printf("\n");
}
2048
2049 static void
2050 dump_all_ddts(spa_t *spa)
2051 {
2052 ddt_histogram_t ddh_total = {{{0}}};
2053 ddt_stat_t dds_total = {0};
2054
2055 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
2056 ddt_t *ddt = spa->spa_ddt[c];
2057 for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
2058 for (enum ddt_class class = 0; class < DDT_CLASSES;
2059 class++) {
2060 dump_ddt(ddt, type, class);
2061 }
2062 }
2063 }
2064
2065 ddt_get_dedup_stats(spa, &dds_total);
2066
2067 if (dds_total.dds_blocks == 0) {
2068 (void) printf("All DDTs are empty\n");
2069 return;
2070 }
2071
2072 (void) printf("\n");
2073
2074 if (dump_opt['D'] > 1) {
2075 (void) printf("DDT histogram (aggregated over all DDTs):\n");
2076 ddt_get_dedup_histogram(spa, &ddh_total);
2077 zpool_dump_ddt(&dds_total, &ddh_total);
2078 }
2079
2080 dump_dedup_ratio(&dds_total);
2081 }
2082
2083 static void
2084 dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
2085 {
2086 char *prefix = arg;
2087
2088 (void) printf("%s [%llu,%llu) length %llu\n",
2089 prefix,
2090 (u_longlong_t)start,
2091 (u_longlong_t)(start + size),
2092 (u_longlong_t)(size));
2093 }
2094
/*
 * Recursively print the dirty time logs (DTLs) of a vdev and all of its
 * children.  Each vdev line notes whether the vdev's DTL is required
 * for pool integrity; each non-empty DTL type then lists its segments.
 */
static void
dump_dtl(vdev_t *vd, int indent)
{
	spa_t *spa = vd->vdev_spa;
	boolean_t required;
	const char *name[DTL_TYPES] = { "missing", "partial", "scrub",
	    "outage" };
	char prefix[256];

	/* vdev_dtl_required() must be called with the vdev state held. */
	spa_vdev_state_enter(spa, SCL_NONE);
	required = vdev_dtl_required(vd);
	(void) spa_vdev_state_exit(spa, NULL, 0);

	if (indent == 0)
		(void) printf("\nDirty time logs:\n\n");

	/* Label: vdev path, or op type for interior vdevs, or pool name. */
	(void) printf("\t%*s%s [%s]\n", indent, "",
	    vd->vdev_path ? vd->vdev_path :
	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
	    required ? "DTL-required" : "DTL-expendable");

	for (int t = 0; t < DTL_TYPES; t++) {
		range_tree_t *rt = vd->vdev_dtl[t];
		if (range_tree_space(rt) == 0)
			continue;
		(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
		    indent + 2, "", name[t]);
		range_tree_walk(rt, dump_dtl_seg, prefix);
		/* At -dddddd, also dump the leaf vdev's DTL space map. */
		if (dump_opt['d'] > 5 && vd->vdev_children == 0)
			dump_spacemap(spa->spa_meta_objset,
			    vd->vdev_dtl_sm);
	}

	for (unsigned c = 0; c < vd->vdev_children; c++)
		dump_dtl(vd->vdev_child[c], indent + 4);
}
2131
/*
 * Print the pool's command history (as in "zpool history").  The raw
 * history is read from the spa in SPA_OLD_MAXBLOCKSIZE chunks and
 * unpacked into nvlist records; each record is then formatted according
 * to which fields it carries: a user command, a legacy internal event,
 * a named internal operation, or an ioctl.  With -hh, the raw nvlist of
 * every record (including unrecognized ones) is also dumped.
 */
static void
dump_history(spa_t *spa)
{
	nvlist_t **events = NULL;
	char *buf;
	uint64_t resid, len, off = 0;
	uint_t num = 0;
	int error;
	char tbuf[30];

	if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) {
		(void) fprintf(stderr, "%s: unable to allocate I/O buffer\n",
		    __func__);
		return;
	}

	/* Read and unpack the on-disk history, one chunk at a time. */
	do {
		len = SPA_OLD_MAXBLOCKSIZE;

		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
			(void) fprintf(stderr, "Unable to read history: "
			    "error %d\n", error);
			free(buf);
			return;
		}

		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
			break;

		/* Back up over the partial record left in the buffer. */
		off -= resid;
	} while (len != 0);

	(void) printf("\nHistory:\n");
	for (unsigned i = 0; i < num; i++) {
		boolean_t printed = B_FALSE;

		/* Format the record's timestamp, if present. */
		if (nvlist_exists(events[i], ZPOOL_HIST_TIME)) {
			time_t tsec;
			struct tm t;

			tsec = fnvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_TIME);
			(void) localtime_r(&tsec, &t);
			(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
		} else {
			tbuf[0] = '\0';
		}

		if (nvlist_exists(events[i], ZPOOL_HIST_CMD)) {
			/* A user-issued command line. */
			(void) printf("%s %s\n", tbuf,
			    fnvlist_lookup_string(events[i], ZPOOL_HIST_CMD));
		} else if (nvlist_exists(events[i], ZPOOL_HIST_INT_EVENT)) {
			/* A legacy internal event, identified by number. */
			uint64_t ievent;

			ievent = fnvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_INT_EVENT);
			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
				goto next;

			(void) printf(" %s [internal %s txg:%ju] %s\n",
			    tbuf,
			    zfs_history_event_names[ievent],
			    fnvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_TXG),
			    fnvlist_lookup_string(events[i],
			    ZPOOL_HIST_INT_STR));
		} else if (nvlist_exists(events[i], ZPOOL_HIST_INT_NAME)) {
			/* A named internal operation, possibly on a dataset. */
			(void) printf("%s [txg:%ju] %s", tbuf,
			    fnvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_TXG),
			    fnvlist_lookup_string(events[i],
			    ZPOOL_HIST_INT_NAME));

			if (nvlist_exists(events[i], ZPOOL_HIST_DSNAME)) {
				(void) printf(" %s (%llu)",
				    fnvlist_lookup_string(events[i],
				    ZPOOL_HIST_DSNAME),
				    (u_longlong_t)fnvlist_lookup_uint64(
				    events[i],
				    ZPOOL_HIST_DSID));
			}

			(void) printf(" %s\n", fnvlist_lookup_string(events[i],
			    ZPOOL_HIST_INT_STR));
		} else if (nvlist_exists(events[i], ZPOOL_HIST_IOCTL)) {
			/* An ioctl record with optional in/out nvlists. */
			(void) printf("%s ioctl %s\n", tbuf,
			    fnvlist_lookup_string(events[i],
			    ZPOOL_HIST_IOCTL));

			if (nvlist_exists(events[i], ZPOOL_HIST_INPUT_NVL)) {
				(void) printf(" input:\n");
				dump_nvlist(fnvlist_lookup_nvlist(events[i],
				    ZPOOL_HIST_INPUT_NVL), 8);
			}
			if (nvlist_exists(events[i], ZPOOL_HIST_OUTPUT_NVL)) {
				(void) printf(" output:\n");
				dump_nvlist(fnvlist_lookup_nvlist(events[i],
				    ZPOOL_HIST_OUTPUT_NVL), 8);
			}
			if (nvlist_exists(events[i], ZPOOL_HIST_ERRNO)) {
				(void) printf(" errno: %lld\n",
				    (longlong_t)fnvlist_lookup_int64(events[i],
				    ZPOOL_HIST_ERRNO));
			}
		} else {
			goto next;
		}

		printed = B_TRUE;
next:
		/* With -hh, also dump the raw nvlist of every record. */
		if (dump_opt['h'] > 1) {
			if (!printed)
				(void) printf("unrecognized record:\n");
			dump_nvlist(events[i], 2);
		}
	}
	free(buf);
}
2250
2251 static void
2252 dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
2253 {
2254 (void) os, (void) object, (void) data, (void) size;
2255 }
2256
2257 static uint64_t
2258 blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
2259 const zbookmark_phys_t *zb)
2260 {
2261 if (dnp == NULL) {
2262 ASSERT(zb->zb_level < 0);
2263 if (zb->zb_object == 0)
2264 return (zb->zb_blkid);
2265 return (zb->zb_blkid * BP_GET_LSIZE(bp));
2266 }
2267
2268 ASSERT(zb->zb_level >= 0);
2269
2270 return ((zb->zb_blkid <<
2271 (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
2272 dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
2273 }
2274
2275 static void
2276 snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen,
2277 const blkptr_t *bp)
2278 {
2279 abd_t *pabd;
2280 void *buf;
2281 zio_t *zio;
2282 zfs_zstdhdr_t zstd_hdr;
2283 int error;
2284
2285 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD)
2286 return;
2287
2288 if (BP_IS_HOLE(bp))
2289 return;
2290
2291 if (BP_IS_EMBEDDED(bp)) {
2292 buf = malloc(SPA_MAXBLOCKSIZE);
2293 if (buf == NULL) {
2294 (void) fprintf(stderr, "out of memory\n");
2295 exit(1);
2296 }
2297 decode_embedded_bp_compressed(bp, buf);
2298 memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
2299 free(buf);
2300 zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
2301 zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
2302 (void) snprintf(blkbuf + strlen(blkbuf),
2303 buflen - strlen(blkbuf),
2304 " ZSTD:size=%u:version=%u:level=%u:EMBEDDED",
2305 zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),
2306 zfs_get_hdrlevel(&zstd_hdr));
2307 return;
2308 }
2309
2310 pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
2311 zio = zio_root(spa, NULL, NULL, 0);
2312
2313 /* Decrypt but don't decompress so we can read the compression header */
2314 zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL,
2315 ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS,
2316 NULL));
2317 error = zio_wait(zio);
2318 if (error) {
2319 (void) fprintf(stderr, "read failed: %d\n", error);
2320 return;
2321 }
2322 buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp));
2323 memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
2324 zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
2325 zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
2326
2327 (void) snprintf(blkbuf + strlen(blkbuf),
2328 buflen - strlen(blkbuf),
2329 " ZSTD:size=%u:version=%u:level=%u:NORMAL",
2330 zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),
2331 zfs_get_hdrlevel(&zstd_hdr));
2332
2333 abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp));
2334 }
2335
2336 static void
2337 snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
2338 boolean_t bp_freed)
2339 {
2340 const dva_t *dva = bp->blk_dva;
2341 int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
2342 int i;
2343
2344 if (dump_opt['b'] >= 6) {
2345 snprintf_blkptr(blkbuf, buflen, bp);
2346 if (bp_freed) {
2347 (void) snprintf(blkbuf + strlen(blkbuf),
2348 buflen - strlen(blkbuf), " %s", "FREE");
2349 }
2350 return;
2351 }
2352
2353 if (BP_IS_EMBEDDED(bp)) {
2354 (void) sprintf(blkbuf,
2355 "EMBEDDED et=%u %llxL/%llxP B=%llu",
2356 (int)BPE_GET_ETYPE(bp),
2357 (u_longlong_t)BPE_GET_LSIZE(bp),
2358 (u_longlong_t)BPE_GET_PSIZE(bp),
2359 (u_longlong_t)bp->blk_birth);
2360 return;
2361 }
2362
2363 blkbuf[0] = '\0';
2364
2365 for (i = 0; i < ndvas; i++)
2366 (void) snprintf(blkbuf + strlen(blkbuf),
2367 buflen - strlen(blkbuf), "%llu:%llx:%llx ",
2368 (u_longlong_t)DVA_GET_VDEV(&dva[i]),
2369 (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
2370 (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
2371
2372 if (BP_IS_HOLE(bp)) {
2373 (void) snprintf(blkbuf + strlen(blkbuf),
2374 buflen - strlen(blkbuf),
2375 "%llxL B=%llu",
2376 (u_longlong_t)BP_GET_LSIZE(bp),
2377 (u_longlong_t)bp->blk_birth);
2378 } else {
2379 (void) snprintf(blkbuf + strlen(blkbuf),
2380 buflen - strlen(blkbuf),
2381 "%llxL/%llxP F=%llu B=%llu/%llu",
2382 (u_longlong_t)BP_GET_LSIZE(bp),
2383 (u_longlong_t)BP_GET_PSIZE(bp),
2384 (u_longlong_t)BP_GET_FILL(bp),
2385 (u_longlong_t)bp->blk_birth,
2386 (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
2387 if (bp_freed)
2388 (void) snprintf(blkbuf + strlen(blkbuf),
2389 buflen - strlen(blkbuf), " %s", "FREE");
2390 (void) snprintf(blkbuf + strlen(blkbuf),
2391 buflen - strlen(blkbuf),
2392 " cksum=%016llx:%016llx:%016llx:%016llx",
2393 (u_longlong_t)bp->blk_cksum.zc_word[0],
2394 (u_longlong_t)bp->blk_cksum.zc_word[1],
2395 (u_longlong_t)bp->blk_cksum.zc_word[2],
2396 (u_longlong_t)bp->blk_cksum.zc_word[3]);
2397 }
2398 }
2399
2400 static void
2401 print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb,
2402 const dnode_phys_t *dnp)
2403 {
2404 char blkbuf[BP_SPRINTF_LEN];
2405 int l;
2406
2407 if (!BP_IS_EMBEDDED(bp)) {
2408 ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
2409 ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
2410 }
2411
2412 (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
2413
2414 ASSERT(zb->zb_level >= 0);
2415
2416 for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
2417 if (l == zb->zb_level) {
2418 (void) printf("L%llx", (u_longlong_t)zb->zb_level);
2419 } else {
2420 (void) printf(" ");
2421 }
2422 }
2423
2424 snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE);
2425 if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD)
2426 snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp);
2427 (void) printf("%s\n", blkbuf);
2428 }
2429
/*
 * Print this block pointer and, for non-hole indirect blocks, read the
 * block through the ARC and recurse into each child bp.  Verifies that
 * the children's fill counts sum to the parent's.  Returns the first
 * read error encountered, or 0.
 */
static int
visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
    blkptr_t *bp, const zbookmark_phys_t *zb)
{
	int err = 0;

	/* birth txg 0 means an unallocated bp; nothing to print or visit. */
	if (bp->blk_birth == 0)
		return (0);

	print_indirect(spa, bp, zb, dnp);

	if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
		arc_flags_t flags = ARC_FLAG_WAIT;
		int i;
		blkptr_t *cbp;
		/* Number of child blkptrs held by this indirect block. */
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
		arc_buf_t *buf;
		uint64_t fill = 0;
		ASSERT(!BP_IS_REDACTED(bp));

		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err)
			return (err);
		ASSERT(buf->b_data);

		/* recursively visit blocks below this */
		cbp = buf->b_data;
		for (i = 0; i < epb; i++, cbp++) {
			zbookmark_phys_t czb;

			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			err = visit_indirect(spa, dnp, cbp, &czb);
			if (err)
				break;
			fill += BP_GET_FILL(cbp);
		}
		/* Children's fill counts must add up to the parent's. */
		if (!err)
			ASSERT3U(fill, ==, BP_GET_FILL(bp));
		arc_buf_destroy(buf, &buf);
	}

	return (err);
}
2476
2477 static void
2478 dump_indirect(dnode_t *dn)
2479 {
2480 dnode_phys_t *dnp = dn->dn_phys;
2481 zbookmark_phys_t czb;
2482
2483 (void) printf("Indirect blocks:\n");
2484
2485 SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
2486 dn->dn_object, dnp->dn_nlevels - 1, 0);
2487 for (int j = 0; j < dnp->dn_nblkptr; j++) {
2488 czb.zb_blkid = j;
2489 (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
2490 &dnp->dn_blkptr[j], &czb);
2491 }
2492
2493 (void) printf("\n");
2494 }
2495
/*
 * Object-dump callback: pretty-print a dsl_dir_phys_t (DSL directory),
 * including its object references, space usage, quota/reservation, and
 * the per-category used-space breakdown.
 */
static void
dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object;
	dsl_dir_phys_t *dd = data;
	time_t crtime;
	char nice[32];

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (nice) >= NN_NUMBUF_SZ, "nice truncated");

	if (dd == NULL)
		return;

	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));

	crtime = dd->dd_creation_time;
	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
	(void) printf("\t\thead_dataset_obj = %llu\n",
	    (u_longlong_t)dd->dd_head_dataset_obj);
	(void) printf("\t\tparent_dir_obj = %llu\n",
	    (u_longlong_t)dd->dd_parent_obj);
	(void) printf("\t\torigin_obj = %llu\n",
	    (u_longlong_t)dd->dd_origin_obj);
	(void) printf("\t\tchild_dir_zapobj = %llu\n",
	    (u_longlong_t)dd->dd_child_dir_zapobj);
	zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice));
	(void) printf("\t\tused_bytes = %s\n", nice);
	zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice));
	(void) printf("\t\tcompressed_bytes = %s\n", nice);
	zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice));
	(void) printf("\t\tuncompressed_bytes = %s\n", nice);
	zdb_nicenum(dd->dd_quota, nice, sizeof (nice));
	(void) printf("\t\tquota = %s\n", nice);
	zdb_nicenum(dd->dd_reserved, nice, sizeof (nice));
	(void) printf("\t\treserved = %s\n", nice);
	(void) printf("\t\tprops_zapobj = %llu\n",
	    (u_longlong_t)dd->dd_props_zapobj);
	(void) printf("\t\tdeleg_zapobj = %llu\n",
	    (u_longlong_t)dd->dd_deleg_zapobj);
	(void) printf("\t\tflags = %llx\n",
	    (u_longlong_t)dd->dd_flags);

/* Print one entry of the used-space breakdown in human-readable form. */
#define	DO(which) \
	zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \
	    sizeof (nice)); \
	(void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
	DO(HEAD);
	DO(SNAP);
	DO(CHILD);
	DO(CHILD_RSRV);
	DO(REFRSRV);
#undef DO
	(void) printf("\t\tclones = %llu\n",
	    (u_longlong_t)dd->dd_clones);
}
2552
/*
 * Object-dump callback: pretty-print a dsl_dataset_phys_t, including
 * snapshot linkage, space accounting, identity (guid/fsid), and the
 * dataset's root block pointer.
 */
static void
dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object;
	dsl_dataset_phys_t *ds = data;
	time_t crtime;
	char used[32], compressed[32], uncompressed[32], unique[32];
	char blkbuf[BP_SPRINTF_LEN];

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (used) >= NN_NUMBUF_SZ, "used truncated");
	_Static_assert(sizeof (compressed) >= NN_NUMBUF_SZ,
	    "compressed truncated");
	_Static_assert(sizeof (uncompressed) >= NN_NUMBUF_SZ,
	    "uncompressed truncated");
	_Static_assert(sizeof (unique) >= NN_NUMBUF_SZ, "unique truncated");

	if (ds == NULL)
		return;

	ASSERT(size == sizeof (*ds));
	crtime = ds->ds_creation_time;
	zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used));
	zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed));
	zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed,
	    sizeof (uncompressed));
	zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique));
	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);

	(void) printf("\t\tdir_obj = %llu\n",
	    (u_longlong_t)ds->ds_dir_obj);
	(void) printf("\t\tprev_snap_obj = %llu\n",
	    (u_longlong_t)ds->ds_prev_snap_obj);
	(void) printf("\t\tprev_snap_txg = %llu\n",
	    (u_longlong_t)ds->ds_prev_snap_txg);
	(void) printf("\t\tnext_snap_obj = %llu\n",
	    (u_longlong_t)ds->ds_next_snap_obj);
	(void) printf("\t\tsnapnames_zapobj = %llu\n",
	    (u_longlong_t)ds->ds_snapnames_zapobj);
	(void) printf("\t\tnum_children = %llu\n",
	    (u_longlong_t)ds->ds_num_children);
	(void) printf("\t\tuserrefs_obj = %llu\n",
	    (u_longlong_t)ds->ds_userrefs_obj);
	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
	(void) printf("\t\tcreation_txg = %llu\n",
	    (u_longlong_t)ds->ds_creation_txg);
	(void) printf("\t\tdeadlist_obj = %llu\n",
	    (u_longlong_t)ds->ds_deadlist_obj);
	(void) printf("\t\tused_bytes = %s\n", used);
	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
	(void) printf("\t\tunique = %s\n", unique);
	(void) printf("\t\tfsid_guid = %llu\n",
	    (u_longlong_t)ds->ds_fsid_guid);
	(void) printf("\t\tguid = %llu\n",
	    (u_longlong_t)ds->ds_guid);
	(void) printf("\t\tflags = %llx\n",
	    (u_longlong_t)ds->ds_flags);
	(void) printf("\t\tnext_clones_obj = %llu\n",
	    (u_longlong_t)ds->ds_next_clones_obj);
	(void) printf("\t\tprops_obj = %llu\n",
	    (u_longlong_t)ds->ds_props_obj);
	(void) printf("\t\tbp = %s\n", blkbuf);
}
2617
2618 static int
2619 dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
2620 {
2621 (void) arg, (void) tx;
2622 char blkbuf[BP_SPRINTF_LEN];
2623
2624 if (bp->blk_birth != 0) {
2625 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
2626 (void) printf("\t%s\n", blkbuf);
2627 }
2628 return (0);
2629 }
2630
/*
 * Print a summary of a bptree object (dataset count and total bytes);
 * at -ddddd and above, also list every block pointer it contains.
 * No output below -ddd.
 */
static void
dump_bptree(objset_t *os, uint64_t obj, const char *name)
{
	char bytes[32];
	bptree_phys_t *bt;
	dmu_buf_t *db;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");

	if (dump_opt['d'] < 3)
		return;

	/* The bptree header lives in the object's bonus buffer. */
	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
	bt = db->db_data;
	zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes));
	(void) printf("\n    %s: %llu datasets, %s\n",
	    name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
	dmu_buf_rele(db, FTAG);

	if (dump_opt['d'] < 5)
		return;

	(void) printf("\n");

	(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
}
2658
2659 static int
2660 dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
2661 {
2662 (void) arg, (void) tx;
2663 char blkbuf[BP_SPRINTF_LEN];
2664
2665 ASSERT(bp->blk_birth != 0);
2666 snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed);
2667 (void) printf("\t%s\n", blkbuf);
2668 return (0);
2669 }
2670
/*
 * Print a summary of a bpobj: object number, local blkptr count, freed
 * count (when tracked), sub-bpobj count, and space totals.  For bpobjs
 * with subobjects, recurse into each one.  At -ddddd and above, the
 * top-level bpobj's block pointers are listed as well.  No output
 * below -ddd.
 */
static void
dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
{
	char bytes[32];
	char comp[32];
	char uncomp[32];
	uint64_t i;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");

	if (dump_opt['d'] < 3)
		return;

	zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes));
	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
		/* Comp/uncomp totals are only kept when subobjs exist. */
		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
		if (bpo->bpo_havefreed) {
			(void) printf("    %*s: object %llu, %llu local "
			    "blkptrs, %llu freed, %llu subobjs in object %llu, "
			    "%s (%s/%s comp)\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
			    bytes, comp, uncomp);
		} else {
			(void) printf("    %*s: object %llu, %llu local "
			    "blkptrs, %llu subobjs in object %llu, "
			    "%s (%s/%s comp)\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
			    bytes, comp, uncomp);
		}

		/* Recurse into each sub-bpobj, one indent level deeper. */
		for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
			uint64_t subobj;
			bpobj_t subbpo;
			int error;
			VERIFY0(dmu_read(bpo->bpo_os,
			    bpo->bpo_phys->bpo_subobjs,
			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
			if (error != 0) {
				(void) printf("ERROR %u while trying to open "
				    "subobj id %llu\n",
				    error, (u_longlong_t)subobj);
				continue;
			}
			dump_full_bpobj(&subbpo, "subobj", indent + 1);
			bpobj_close(&subbpo);
		}
	} else {
		if (bpo->bpo_havefreed) {
			(void) printf("    %*s: object %llu, %llu blkptrs, "
			    "%llu freed, %s\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
			    bytes);
		} else {
			(void) printf("    %*s: object %llu, %llu blkptrs, "
			    "%s\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    bytes);
		}
	}

	if (dump_opt['d'] < 5)
		return;


	/* Only the top-level call lists individual block pointers. */
	if (indent == 0) {
		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
		(void) printf("\n");
	}
}
2759
/*
 * Look up and print one bookmark by its full "pool/ds#name" string.
 * With print_redact, also print redaction progress and the snapshot
 * list of a redaction bookmark; with print_list (implies print_redact),
 * dump the entire redaction block list.  Returns the lookup or read
 * error, or 0.
 */
static int
dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact,
    boolean_t print_list)
{
	int err = 0;
	zfs_bookmark_phys_t prop;
	objset_t *mos = dp->dp_spa->spa_meta_objset;
	err = dsl_bookmark_lookup(dp, name, NULL, &prop);

	if (err != 0) {
		return (err);
	}

	(void) printf("\t#%s: ", strchr(name, '#') + 1);
	(void) printf("{guid: %llx creation_txg: %llu creation_time: "
	    "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid,
	    (u_longlong_t)prop.zbm_creation_txg,
	    (u_longlong_t)prop.zbm_creation_time,
	    (u_longlong_t)prop.zbm_redaction_obj);

	IMPLY(print_list, print_redact);
	/* Non-redaction bookmarks have nothing more to show. */
	if (!print_redact || prop.zbm_redaction_obj == 0)
		return (0);

	redaction_list_t *rl;
	VERIFY0(dsl_redaction_list_hold_obj(dp,
	    prop.zbm_redaction_obj, FTAG, &rl));

	redaction_list_phys_t *rlp = rl->rl_phys;
	(void) printf("\tRedacted:\n\t\tProgress: ");
	/* UINT64_MAX markers in both fields indicate a completed send. */
	if (rlp->rlp_last_object != UINT64_MAX ||
	    rlp->rlp_last_blkid != UINT64_MAX) {
		(void) printf("%llu %llu (incomplete)\n",
		    (u_longlong_t)rlp->rlp_last_object,
		    (u_longlong_t)rlp->rlp_last_blkid);
	} else {
		(void) printf("complete\n");
	}
	(void) printf("\t\tSnapshots: [");
	for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) {
		if (i > 0)
			(void) printf(", ");
		(void) printf("%0llu",
		    (u_longlong_t)rlp->rlp_snaps[i]);
	}
	(void) printf("]\n\t\tLength: %llu\n",
	    (u_longlong_t)rlp->rlp_num_entries);

	if (!print_list) {
		dsl_redaction_list_rele(rl, FTAG);
		return (0);
	}

	if (rlp->rlp_num_entries == 0) {
		dsl_redaction_list_rele(rl, FTAG);
		(void) printf("\t\tRedaction List: []\n\n");
		return (0);
	}

	/* Read the whole redaction block array out of the MOS object. */
	redact_block_phys_t *rbp_buf;
	uint64_t size;
	dmu_object_info_t doi;

	VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi));
	size = doi.doi_max_offset;
	rbp_buf = kmem_alloc(size, KM_SLEEP);

	err = dmu_read(mos, prop.zbm_redaction_obj, 0, size,
	    rbp_buf, 0);
	if (err != 0) {
		dsl_redaction_list_rele(rl, FTAG);
		kmem_free(rbp_buf, size);
		return (err);
	}

	(void) printf("\t\tRedaction List: [{object: %llx, offset: "
	    "%llx, blksz: %x, count: %llx}",
	    (u_longlong_t)rbp_buf[0].rbp_object,
	    (u_longlong_t)rbp_buf[0].rbp_blkid,
	    (uint_t)(redact_block_get_size(&rbp_buf[0])),
	    (u_longlong_t)redact_block_get_count(&rbp_buf[0]));

	for (size_t i = 1; i < rlp->rlp_num_entries; i++) {
		(void) printf(",\n\t\t{object: %llx, offset: %llx, "
		    "blksz: %x, count: %llx}",
		    (u_longlong_t)rbp_buf[i].rbp_object,
		    (u_longlong_t)rbp_buf[i].rbp_blkid,
		    (uint_t)(redact_block_get_size(&rbp_buf[i])),
		    (u_longlong_t)redact_block_get_count(&rbp_buf[i]));
	}
	dsl_redaction_list_rele(rl, FTAG);
	kmem_free(rbp_buf, size);
	(void) printf("]\n\n");
	return (0);
}
2855
2856 static void
2857 dump_bookmarks(objset_t *os, int verbosity)
2858 {
2859 zap_cursor_t zc;
2860 zap_attribute_t attr;
2861 dsl_dataset_t *ds = dmu_objset_ds(os);
2862 dsl_pool_t *dp = spa_get_dsl(os->os_spa);
2863 objset_t *mos = os->os_spa->spa_meta_objset;
2864 if (verbosity < 4)
2865 return;
2866 dsl_pool_config_enter(dp, FTAG);
2867
2868 for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj);
2869 zap_cursor_retrieve(&zc, &attr) == 0;
2870 zap_cursor_advance(&zc)) {
2871 char osname[ZFS_MAX_DATASET_NAME_LEN];
2872 char buf[ZFS_MAX_DATASET_NAME_LEN];
2873 int len;
2874 dmu_objset_name(os, osname);
2875 len = snprintf(buf, sizeof (buf), "%s#%s", osname,
2876 attr.za_name);
2877 VERIFY3S(len, <, ZFS_MAX_DATASET_NAME_LEN);
2878 (void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6);
2879 }
2880 zap_cursor_fini(&zc);
2881 dsl_pool_config_exit(dp, FTAG);
2882 }
2883
2884 static void
2885 bpobj_count_refd(bpobj_t *bpo)
2886 {
2887 mos_obj_refd(bpo->bpo_object);
2888
2889 if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
2890 mos_obj_refd(bpo->bpo_phys->bpo_subobjs);
2891 for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
2892 uint64_t subobj;
2893 bpobj_t subbpo;
2894 int error;
2895 VERIFY0(dmu_read(bpo->bpo_os,
2896 bpo->bpo_phys->bpo_subobjs,
2897 i * sizeof (subobj), sizeof (subobj), &subobj, 0));
2898 error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
2899 if (error != 0) {
2900 (void) printf("ERROR %u while trying to open "
2901 "subobj id %llu\n",
2902 error, (u_longlong_t)subobj);
2903 continue;
2904 }
2905 bpobj_count_refd(&subbpo);
2906 bpobj_close(&subbpo);
2907 }
2908 }
2909 }
2910
2911 static int
2912 dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle)
2913 {
2914 spa_t *spa = arg;
2915 uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
2916 if (dle->dle_bpobj.bpo_object != empty_bpobj)
2917 bpobj_count_refd(&dle->dle_bpobj);
2918 return (0);
2919 }
2920
2921 static int
2922 dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle)
2923 {
2924 ASSERT(arg == NULL);
2925 if (dump_opt['d'] >= 5) {
2926 char buf[128];
2927 (void) snprintf(buf, sizeof (buf),
2928 "mintxg %llu -> obj %llu",
2929 (longlong_t)dle->dle_mintxg,
2930 (longlong_t)dle->dle_bpobj.bpo_object);
2931
2932 dump_full_bpobj(&dle->dle_bpobj, buf, 0);
2933 } else {
2934 (void) printf("mintxg %llu -> obj %llu\n",
2935 (longlong_t)dle->dle_mintxg,
2936 (longlong_t)dle->dle_bpobj.bpo_object);
2937 }
2938 return (0);
2939 }
2940
/*
 * Print a deadlist/livelist summary (space totals and entry count) and,
 * at -dddd and above, each entry via dsl_deadlist_entry_dump().  Also
 * records the list's MOS objects as referenced for leak checking,
 * which happens even below the -ddd print threshold.
 */
static void
dump_blkptr_list(dsl_deadlist_t *dl, const char *name)
{
	char bytes[32];
	char comp[32];
	char uncomp[32];
	char entries[32];
	spa_t *spa = dmu_objset_spa(dl->dl_os);
	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;

	if (dl->dl_oldfmt) {
		if (dl->dl_bpobj.bpo_object != empty_bpobj)
			bpobj_count_refd(&dl->dl_bpobj);
	} else {
		mos_obj_refd(dl->dl_object);
		dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa);
	}

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");
	_Static_assert(sizeof (entries) >= NN_NUMBUF_SZ, "entries truncated");

	if (dump_opt['d'] < 3)
		return;

	/* Old-format deadlists are a single bpobj with no entry tree. */
	if (dl->dl_oldfmt) {
		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
		return;
	}

	zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
	zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
	zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries));
	(void) printf("\n    %s: %s (%s/%s comp), %s entries\n",
	    name, bytes, comp, uncomp, entries);

	if (dump_opt['d'] < 4)
		return;

	(void) putchar('\n');

	dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL);
}
2987
/*
 * Cross-check a clone's livelist space accounting against the space
 * written since its origin snapshot (per dsl_dataset_space_written).
 * Also walks the livelist to detect duplicate entries.  Returns 0 if
 * the accounting matches, 1 on a discrepancy (which is printed).
 */
static int
verify_dd_livelist(objset_t *os)
{
	uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp;
	dsl_pool_t *dp = spa_get_dsl(os->os_spa);
	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;

	ASSERT(!dmu_objset_is_snapshot(os));
	/* Only clones with an open livelist have anything to verify. */
	if (!dsl_deadlist_is_open(&dd->dd_livelist))
		return (0);

	/* Iterate through the livelist to check for duplicates */
	dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight,
	    NULL);

	dsl_pool_config_enter(dp, FTAG);
	dsl_deadlist_space(&dd->dd_livelist, &ll_used,
	    &ll_comp, &ll_uncomp);

	dsl_dataset_t *origin_ds;
	ASSERT(dsl_pool_config_held(dp));
	VERIFY0(dsl_dataset_hold_obj(dp,
	    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds));
	VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset,
	    &used, &comp, &uncomp));
	dsl_dataset_rele(origin_ds, FTAG);
	dsl_pool_config_exit(dp, FTAG);
	/*
	 * It's possible that the dataset's uncomp space is larger than the
	 * livelist's because livelists do not track embedded block pointers
	 */
	if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) {
		char nice_used[32], nice_comp[32], nice_uncomp[32];
		(void) printf("Discrepancy in space accounting:\n");
		zdb_nicenum(used, nice_used, sizeof (nice_used));
		zdb_nicenum(comp, nice_comp, sizeof (nice_comp));
		zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp));
		(void) printf("dir: used %s, comp %s, uncomp %s\n",
		    nice_used, nice_comp, nice_uncomp);
		zdb_nicenum(ll_used, nice_used, sizeof (nice_used));
		zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp));
		zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp));
		(void) printf("livelist: used %s, comp %s, uncomp %s\n",
		    nice_used, nice_comp, nice_uncomp);
		return (1);
	}
	return (0);
}
3036
3037 static char *key_material = NULL;
3038
3039 static boolean_t
3040 zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out)
3041 {
3042 uint64_t keyformat, salt, iters;
3043 int i;
3044 unsigned char c;
3045
3046 VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
3047 zfs_prop_to_name(ZFS_PROP_KEYFORMAT), sizeof (uint64_t),
3048 1, &keyformat));
3049
3050 switch (keyformat) {
3051 case ZFS_KEYFORMAT_HEX:
3052 for (i = 0; i < WRAPPING_KEY_LEN * 2; i += 2) {
3053 if (!isxdigit(key_material[i]) ||
3054 !isxdigit(key_material[i+1]))
3055 return (B_FALSE);
3056 if (sscanf(&key_material[i], "%02hhx", &c) != 1)
3057 return (B_FALSE);
3058 key_out[i / 2] = c;
3059 }
3060 break;
3061
3062 case ZFS_KEYFORMAT_PASSPHRASE:
3063 VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset,
3064 dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT),
3065 sizeof (uint64_t), 1, &salt));
3066 VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset,
3067 dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS),
3068 sizeof (uint64_t), 1, &iters));
3069
3070 if (PKCS5_PBKDF2_HMAC_SHA1(key_material, strlen(key_material),
3071 ((uint8_t *)&salt), sizeof (uint64_t), iters,
3072 WRAPPING_KEY_LEN, key_out) != 1)
3073 return (B_FALSE);
3074
3075 break;
3076
3077 default:
3078 fatal("no support for key format %u\n",
3079 (unsigned int) keyformat);
3080 }
3081
3082 return (B_TRUE);
3083 }
3084
/* Name of the encryption root whose key was loaded, for zdb_unload_key(). */
static char encroot[ZFS_MAX_DATASET_NAME_LEN];
/* Set once a wrapping key has been loaded into the spa keystore. */
static boolean_t key_loaded = B_FALSE;

/*
 * Derive the wrapping key for the objset's encryption root from the
 * user-supplied key material and load it into the spa keystore so that
 * encrypted data can be read.  Fatal on failure; on success records the
 * encryption root name in encroot and sets key_loaded.
 */
static void
zdb_load_key(objset_t *os)
{
	dsl_pool_t *dp;
	dsl_dir_t *dd, *rdd;
	uint8_t key[WRAPPING_KEY_LEN];
	uint64_t rddobj;
	int err;

	dp = spa_get_dsl(os->os_spa);
	dd = os->os_dsl_dataset->ds_dir;

	dsl_pool_config_enter(dp, FTAG);
	/* Resolve the encryption root dir and record its name in encroot. */
	VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
	    DSL_CRYPTO_KEY_ROOT_DDOBJ, sizeof (uint64_t), 1, &rddobj));
	VERIFY0(dsl_dir_hold_obj(dd->dd_pool, rddobj, NULL, FTAG, &rdd));
	dsl_dir_name(rdd, encroot);
	dsl_dir_rele(rdd, FTAG);

	if (!zdb_derive_key(dd, key))
		fatal("couldn't derive encryption key");

	dsl_pool_config_exit(dp, FTAG);

	ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_UNAVAILABLE);

	dsl_crypto_params_t *dcp;
	nvlist_t *crypto_args;

	/* Wrap the raw key in the nvlist/params form the keystore expects. */
	crypto_args = fnvlist_alloc();
	fnvlist_add_uint8_array(crypto_args, "wkeydata",
	    (uint8_t *)key, WRAPPING_KEY_LEN);
	VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
	    NULL, crypto_args, &dcp));
	err = spa_keystore_load_wkey(encroot, dcp, B_FALSE);

	/*
	 * NOTE(review): the second arg appears to free the embedded key
	 * material only on failure (keystore owns it on success) — confirm
	 * against dsl_crypto_params_free().
	 */
	dsl_crypto_params_free(dcp, (err != 0));
	fnvlist_free(crypto_args);

	if (err != 0)
		fatal(
		    "couldn't load encryption key for %s: %s",
		    encroot, err == ZFS_ERR_CRYPTO_NOTSUP ?
		    "crypto params not supported" : strerror(err));

	ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_AVAILABLE);

	printf("Unlocked encryption root: %s\n", encroot);
	key_loaded = B_TRUE;
}
3138
3139 static void
3140 zdb_unload_key(void)
3141 {
3142 if (!key_loaded)
3143 return;
3144
3145 VERIFY0(spa_keystore_unload_wkey(encroot));
3146 key_loaded = B_FALSE;
3147 }
3148
/* FUID index/domain tables, loaded on demand by dump_uidgid(). */
static avl_tree_t idx_tree;
static avl_tree_t domain_tree;
static boolean_t fuid_table_loaded;
/* The objset currently opened via open_objset(), and its SA attr table. */
static objset_t *sa_os = NULL;
static sa_attr_type_t *sa_attr_table = NULL;

/*
 * Hold the objset at 'path' for dumping, optionally loading its
 * encryption key first (-K), and set up its system-attribute table if
 * it is a ZPL objset.  On success stores the objset in *osp and in the
 * file-global sa_os; only one objset may be open at a time.  Returns 0
 * or an errno; on sa_setup() failure the objset is released and *osp is
 * set to NULL.
 */
static int
open_objset(const char *path, const void *tag, objset_t **osp)
{
	int err;
	uint64_t sa_attrs = 0;
	uint64_t version = 0;

	VERIFY3P(sa_os, ==, NULL);

	/*
	 * We can't own an objset if it's redacted. Therefore, we do this
	 * dance: hold the objset, then acquire a long hold on its dataset, then
	 * release the pool (which is held as part of holding the objset).
	 */

	if (dump_opt['K']) {
		/* decryption requested, try to load keys */
		err = dmu_objset_hold(path, tag, osp);
		if (err != 0) {
			(void) fprintf(stderr, "failed to hold dataset "
			    "'%s': %s\n",
			    path, strerror(err));
			return (err);
		}
		dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
		dsl_pool_rele(dmu_objset_pool(*osp), tag);

		/* succeeds or dies */
		zdb_load_key(*osp);

		/* release it all */
		dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
		dsl_dataset_rele(dmu_objset_ds(*osp), tag);
	}

	/* Re-hold, decrypting if a key is now loaded. */
	int ds_hold_flags = key_loaded ? DS_HOLD_FLAG_DECRYPT : 0;

	err = dmu_objset_hold_flags(path, ds_hold_flags, tag, osp);
	if (err != 0) {
		(void) fprintf(stderr, "failed to hold dataset '%s': %s\n",
		    path, strerror(err));
		return (err);
	}
	dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
	dsl_pool_rele(dmu_objset_pool(*osp), tag);

	/* SA setup only makes sense for readable (decrypted) ZPL objsets. */
	if (dmu_objset_type(*osp) == DMU_OST_ZFS &&
	    (key_loaded || !(*osp)->os_encrypted)) {
		(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
		    8, 1, &version);
		if (version >= ZPL_VERSION_SA) {
			(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
			    8, 1, &sa_attrs);
		}
		err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
		    &sa_attr_table);
		if (err != 0) {
			(void) fprintf(stderr, "sa_setup failed: %s\n",
			    strerror(err));
			dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
			dsl_dataset_rele_flags(dmu_objset_ds(*osp),
			    ds_hold_flags, tag);
			*osp = NULL;
		}
	}
	sa_os = *osp;

	return (err);
}
3224
/*
 * Release the objset previously opened with open_objset(): tear down its
 * SA state, drop the long hold and dataset hold (decrypting hold if a key
 * was loaded), clear the file-global SA bookkeeping, and unload any
 * encryption key that zdb_load_key() loaded.
 */
static void
close_objset(objset_t *os, const void *tag)
{
	VERIFY3P(os, ==, sa_os);
	if (os->os_sa != NULL)
		sa_tear_down(os);
	dsl_dataset_long_rele(dmu_objset_ds(os), tag);
	dsl_dataset_rele_flags(dmu_objset_ds(os),
	    key_loaded ? DS_HOLD_FLAG_DECRYPT : 0, tag);
	sa_attr_table = NULL;
	sa_os = NULL;

	zdb_unload_key();
}
3239
3240 static void
3241 fuid_table_destroy(void)
3242 {
3243 if (fuid_table_loaded) {
3244 zfs_fuid_table_destroy(&idx_tree, &domain_tree);
3245 fuid_table_loaded = B_FALSE;
3246 }
3247 }
3248
3249 /*
3250 * print uid or gid information.
3251 * For normal POSIX id just the id is printed in decimal format.
3252 * For CIFS files with FUID the fuid is printed in hex followed by
3253 * the domain-rid string.
3254 */
3255 static void
3256 print_idstr(uint64_t id, const char *id_type)
3257 {
3258 if (FUID_INDEX(id)) {
3259 const char *domain =
3260 zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
3261 (void) printf("\t%s %llx [%s-%d]\n", id_type,
3262 (u_longlong_t)id, domain, (int)FUID_RID(id));
3263 } else {
3264 (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id);
3265 }
3266
3267 }
3268
3269 static void
3270 dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
3271 {
3272 uint32_t uid_idx, gid_idx;
3273
3274 uid_idx = FUID_INDEX(uid);
3275 gid_idx = FUID_INDEX(gid);
3276
3277 /* Load domain table, if not already loaded */
3278 if (!fuid_table_loaded && (uid_idx || gid_idx)) {
3279 uint64_t fuid_obj;
3280
3281 /* first find the fuid object. It lives in the master node */
3282 VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
3283 8, 1, &fuid_obj) == 0);
3284 zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
3285 (void) zfs_fuid_table_load(os, fuid_obj,
3286 &idx_tree, &domain_tree);
3287 fuid_table_loaded = B_TRUE;
3288 }
3289
3290 print_idstr(uid, "uid");
3291 print_idstr(gid, "gid");
3292 }
3293
3294 static void
3295 dump_znode_sa_xattr(sa_handle_t *hdl)
3296 {
3297 nvlist_t *sa_xattr;
3298 nvpair_t *elem = NULL;
3299 int sa_xattr_size = 0;
3300 int sa_xattr_entries = 0;
3301 int error;
3302 char *sa_xattr_packed;
3303
3304 error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size);
3305 if (error || sa_xattr_size == 0)
3306 return;
3307
3308 sa_xattr_packed = malloc(sa_xattr_size);
3309 if (sa_xattr_packed == NULL)
3310 return;
3311
3312 error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR],
3313 sa_xattr_packed, sa_xattr_size);
3314 if (error) {
3315 free(sa_xattr_packed);
3316 return;
3317 }
3318
3319 error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0);
3320 if (error) {
3321 free(sa_xattr_packed);
3322 return;
3323 }
3324
3325 while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL)
3326 sa_xattr_entries++;
3327
3328 (void) printf("\tSA xattrs: %d bytes, %d entries\n\n",
3329 sa_xattr_size, sa_xattr_entries);
3330 while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) {
3331 boolean_t can_print = !dump_opt['P'];
3332 uchar_t *value;
3333 uint_t cnt, idx;
3334
3335 (void) printf("\t\t%s = ", nvpair_name(elem));
3336 nvpair_value_byte_array(elem, &value, &cnt);
3337
3338 for (idx = 0; idx < cnt; ++idx) {
3339 if (!isprint(value[idx])) {
3340 can_print = B_FALSE;
3341 break;
3342 }
3343 }
3344
3345 for (idx = 0; idx < cnt; ++idx) {
3346 if (can_print)
3347 (void) putchar(value[idx]);
3348 else
3349 (void) printf("\\%3.3o", value[idx]);
3350 }
3351 (void) putchar('\n');
3352 }
3353
3354 nvlist_free(sa_xattr);
3355 free(sa_xattr_packed);
3356 }
3357
3358 static void
3359 dump_znode_symlink(sa_handle_t *hdl)
3360 {
3361 int sa_symlink_size = 0;
3362 char linktarget[MAXPATHLEN];
3363 int error;
3364
3365 error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size);
3366 if (error || sa_symlink_size == 0) {
3367 return;
3368 }
3369 if (sa_symlink_size >= sizeof (linktarget)) {
3370 (void) printf("symlink size %d is too large\n",
3371 sa_symlink_size);
3372 return;
3373 }
3374 linktarget[sa_symlink_size] = '\0';
3375 if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK],
3376 &linktarget, sa_symlink_size) == 0)
3377 (void) printf("\ttarget %s\n", linktarget);
3378 }
3379
/*
 * Object viewer for ZFS znodes / SA objects: print the file's path (at
 * -d verbosity > 4), ownership, timestamps, mode, size, link count, and
 * optional attributes (projid, xattr dir, rdev, SA xattrs).  Requires
 * that open_objset() established sa_os/sa_attr_table for this objset.
 * data/size are unused (bonus is read via the SA layer instead).
 */
static void
dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */
	sa_handle_t *hdl;
	uint64_t xattr, rdev, gen;
	uint64_t uid, gid, mode, fsize, parent, links;
	uint64_t pflags;
	uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
	time_t z_crtime, z_atime, z_mtime, z_ctime;
	sa_bulk_attr_t bulk[12];
	int idx = 0;
	int error;

	VERIFY3P(os, ==, sa_os);
	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
		(void) printf("Failed to get handle for SA znode\n");
		return;
	}

	/* Gather the twelve fixed attributes in a single bulk lookup. */
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
	    &links, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
	    &mode, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
	    NULL, &parent, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
	    &fsize, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
	    acctm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
	    modtm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
	    crtm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
	    chgtm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
	    &pflags, 8);

	if (sa_bulk_lookup(hdl, bulk, idx)) {
		(void) sa_handle_destroy(hdl);
		return;
	}

	/* Timestamps are [seconds, nanoseconds]; only seconds are shown. */
	z_crtime = (time_t)crtm[0];
	z_atime = (time_t)acctm[0];
	z_mtime = (time_t)modtm[0];
	z_ctime = (time_t)chgtm[0];

	if (dump_opt['d'] > 4) {
		error = zfs_obj_to_path(os, object, path, sizeof (path));
		if (error == ESTALE) {
			(void) snprintf(path, sizeof (path), "on delete queue");
		} else if (error != 0) {
			/* Object with no path: count it as possibly leaked. */
			leaked_objects++;
			(void) snprintf(path, sizeof (path),
			    "path not found, possibly leaked");
		}
		(void) printf("\tpath %s\n", path);
	}

	if (S_ISLNK(mode))
		dump_znode_symlink(hdl);
	dump_uidgid(os, uid, gid);
	(void) printf("\tatime %s", ctime(&z_atime));
	(void) printf("\tmtime %s", ctime(&z_mtime));
	(void) printf("\tctime %s", ctime(&z_ctime));
	(void) printf("\tcrtime %s", ctime(&z_crtime));
	(void) printf("\tgen %llu\n", (u_longlong_t)gen);
	(void) printf("\tmode %llo\n", (u_longlong_t)mode);
	(void) printf("\tsize %llu\n", (u_longlong_t)fsize);
	(void) printf("\tparent %llu\n", (u_longlong_t)parent);
	(void) printf("\tlinks %llu\n", (u_longlong_t)links);
	(void) printf("\tpflags %llx\n", (u_longlong_t)pflags);
	if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) {
		uint64_t projid;

		if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid,
		    sizeof (uint64_t)) == 0)
			(void) printf("\tprojid %llu\n", (u_longlong_t)projid);
	}
	if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
	    sizeof (uint64_t)) == 0)
		(void) printf("\txattr %llu\n", (u_longlong_t)xattr);
	if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
	    sizeof (uint64_t)) == 0)
		(void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev);
	dump_znode_sa_xattr(hdl);
	sa_handle_destroy(hdl);
}
3474
/* Object viewer stub for ACL objects: intentionally prints nothing. */
static void
dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) data, (void) size;
}
3480
/* Object viewer stub for DMU objset objects: intentionally prints nothing. */
static void
dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) data, (void) size;
}
3486
/*
 * Per-DMU-object-type dump callbacks, indexed via ZDB_OT_TYPE(); the
 * extra final entry catches any type outside the known range.
 */
static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
	dump_none, /* unallocated */
	dump_zap, /* object directory */
	dump_uint64, /* object array */
	dump_none, /* packed nvlist */
	dump_packed_nvlist, /* packed nvlist size */
	dump_none, /* bpobj */
	dump_bpobj, /* bpobj header */
	dump_none, /* SPA space map header */
	dump_none, /* SPA space map */
	dump_none, /* ZIL intent log */
	dump_dnode, /* DMU dnode */
	dump_dmu_objset, /* DMU objset */
	dump_dsl_dir, /* DSL directory */
	dump_zap, /* DSL directory child map */
	dump_zap, /* DSL dataset snap map */
	dump_zap, /* DSL props */
	dump_dsl_dataset, /* DSL dataset */
	dump_znode, /* ZFS znode */
	dump_acl, /* ZFS V0 ACL */
	dump_uint8, /* ZFS plain file */
	dump_zpldir, /* ZFS directory */
	dump_zap, /* ZFS master node */
	dump_zap, /* ZFS delete queue */
	dump_uint8, /* zvol object */
	dump_zap, /* zvol prop */
	dump_uint8, /* other uint8[] */
	dump_uint64, /* other uint64[] */
	dump_zap, /* other ZAP */
	dump_zap, /* persistent error log */
	dump_uint8, /* SPA history */
	dump_history_offsets, /* SPA history offsets */
	dump_zap, /* Pool properties */
	dump_zap, /* DSL permissions */
	dump_acl, /* ZFS ACL */
	dump_uint8, /* ZFS SYSACL */
	dump_none, /* FUID nvlist */
	dump_packed_nvlist, /* FUID nvlist size */
	dump_zap, /* DSL dataset next clones */
	dump_zap, /* DSL scrub queue */
	dump_zap, /* ZFS user/group/project used */
	dump_zap, /* ZFS user/group/project quota */
	dump_zap, /* snapshot refcount tags */
	dump_ddt_zap, /* DDT ZAP object */
	dump_zap, /* DDT statistics */
	dump_znode, /* SA object */
	dump_zap, /* SA Master Node */
	dump_sa_attrs, /* SA attribute registration */
	dump_sa_layouts, /* SA attribute layouts */
	dump_zap, /* DSL scrub translations */
	dump_none, /* fake dedup BP */
	dump_zap, /* deadlist */
	dump_none, /* deadlist hdr */
	dump_zap, /* dsl clones */
	dump_bpobj_subobjs, /* bpobj subobjs */
	dump_unknown, /* Unknown type, must be last */
};
3544
3545 static boolean_t
3546 match_object_type(dmu_object_type_t obj_type, uint64_t flags)
3547 {
3548 boolean_t match = B_TRUE;
3549
3550 switch (obj_type) {
3551 case DMU_OT_DIRECTORY_CONTENTS:
3552 if (!(flags & ZOR_FLAG_DIRECTORY))
3553 match = B_FALSE;
3554 break;
3555 case DMU_OT_PLAIN_FILE_CONTENTS:
3556 if (!(flags & ZOR_FLAG_PLAIN_FILE))
3557 match = B_FALSE;
3558 break;
3559 case DMU_OT_SPACE_MAP:
3560 if (!(flags & ZOR_FLAG_SPACE_MAP))
3561 match = B_FALSE;
3562 break;
3563 default:
3564 if (strcmp(zdb_ot_name(obj_type), "zap") == 0) {
3565 if (!(flags & ZOR_FLAG_ZAP))
3566 match = B_FALSE;
3567 break;
3568 }
3569
3570 /*
3571 * If all bits except some of the supported flags are
3572 * set, the user combined the all-types flag (A) with
3573 * a negated flag to exclude some types (e.g. A-f to
3574 * show all object types except plain files).
3575 */
3576 if ((flags | ZOR_SUPPORTED_FLAGS) != ZOR_FLAG_ALL_TYPES)
3577 match = B_FALSE;
3578
3579 break;
3580 }
3581
3582 return (match);
3583 }
3584
/*
 * Print one object of the objset: always a summary line; at verbosity
 * >= 4 the dnode flags, bonus contents, and object contents; at
 * verbosity >= 5 the spill block, indirect blocks, and allocated
 * segment map.  Object 0 denotes the meta-dnode.  If dnode_slots_used
 * is non-NULL it receives the number of dnode slots the object
 * consumes.  flags (ZOR_FLAG_*) filters by object type; 0 or
 * ZOR_FLAG_ALL_TYPES shows everything.
 */
static void
dump_object(objset_t *os, uint64_t object, int verbosity,
    boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags)
{
	dmu_buf_t *db = NULL;
	dmu_object_info_t doi;
	dnode_t *dn;
	boolean_t dnode_held = B_FALSE;
	void *bonus = NULL;
	size_t bsize = 0;
	char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32];
	char bonus_size[32];
	char aux[50];
	int error;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (iblk) >= NN_NUMBUF_SZ, "iblk truncated");
	_Static_assert(sizeof (dblk) >= NN_NUMBUF_SZ, "dblk truncated");
	_Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ, "lsize truncated");
	_Static_assert(sizeof (asize) >= NN_NUMBUF_SZ, "asize truncated");
	_Static_assert(sizeof (bonus_size) >= NN_NUMBUF_SZ,
	    "bonus_size truncated");

	if (*print_header) {
		(void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n",
		    "Object", "lvl", "iblk", "dblk", "dsize", "dnsize",
		    "lsize", "%full", "type");
		*print_header = 0;
	}

	if (object == 0) {
		dn = DMU_META_DNODE(os);
		dmu_object_info_from_dnode(dn, &doi);
	} else {
		/*
		 * Encrypted datasets will have sensitive bonus buffers
		 * encrypted. Therefore we cannot hold the bonus buffer and
		 * must hold the dnode itself instead.
		 */
		error = dmu_object_info(os, object, &doi);
		if (error)
			fatal("dmu_object_info() failed, errno %u", error);

		if (!key_loaded && os->os_encrypted &&
		    DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) {
			error = dnode_hold(os, object, FTAG, &dn);
			if (error)
				fatal("dnode_hold() failed, errno %u", error);
			dnode_held = B_TRUE;
		} else {
			error = dmu_bonus_hold(os, object, FTAG, &db);
			if (error)
				fatal("dmu_bonus_hold(%llu) failed, errno %u",
				    object, error);
			bonus = db->db_data;
			bsize = db->db_size;
			dn = DB_DNODE((dmu_buf_impl_t *)db);
		}
	}

	/*
	 * Default to showing all object types if no flags were specified.
	 */
	if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES &&
	    !match_object_type(doi.doi_type, flags))
		goto out;

	if (dnode_slots_used)
		*dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;

	zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
	zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
	zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
	zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));
	zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));
	zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize));
	/*
	 * %full = filled blocks / logical blocks; the meta-dnode's fill
	 * count is in dnodes, so scale by DNODES_PER_BLOCK for object 0.
	 */
	(void) snprintf(fill, sizeof (fill), "%6.2f", 100.0 *
	    doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ?
	    DNODES_PER_BLOCK : 1) / doi.doi_max_offset);

	aux[0] = '\0';

	/* Annotate non-inherited (or, verbosely, all) checksum settings. */
	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
		    " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum));
	}

	/* Likewise for compression, resolving inherited leveled settings. */
	if (doi.doi_compress == ZIO_COMPRESS_INHERIT &&
	    ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) {
		const char *compname = NULL;
		if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION,
		    ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel),
		    &compname) == 0) {
			(void) snprintf(aux + strlen(aux),
			    sizeof (aux) - strlen(aux), " (Z=inherit=%s)",
			    compname);
		} else {
			(void) snprintf(aux + strlen(aux),
			    sizeof (aux) - strlen(aux),
			    " (Z=inherit=%s-unknown)",
			    ZDB_COMPRESS_NAME(os->os_compress));
		}
	} else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) {
		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
		    " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress));
	} else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
		    " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress));
	}

	(void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n",
	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
	    asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux);

	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
		(void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n",
		    "", "", "", "", "", "", bonus_size, "bonus",
		    zdb_ot_name(doi.doi_bonus_type));
	}

	if (verbosity >= 4) {
		(void) printf("\tdnode flags: %s%s%s%s\n",
		    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
		    "USED_BYTES " : "",
		    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
		    "USERUSED_ACCOUNTED " : "",
		    (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ?
		    "USEROBJUSED_ACCOUNTED " : "",
		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
		    "SPILL_BLKPTR" : "");
		(void) printf("\tdnode maxblkid: %llu\n",
		    (longlong_t)dn->dn_phys->dn_maxblkid);

		/* dnode_held means the bonus buffer could not be decrypted. */
		if (!dnode_held) {
			object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os,
			    object, bonus, bsize);
		} else {
			(void) printf("\t\t(bonus encrypted)\n");
		}

		if (key_loaded ||
		    (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type))) {
			object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object,
			    NULL, 0);
		} else {
			(void) printf("\t\t(object encrypted)\n");
		}

		*print_header = B_TRUE;
	}

	if (verbosity >= 5) {
		if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
			char blkbuf[BP_SPRINTF_LEN];
			snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
			    DN_SPILL_BLKPTR(dn->dn_phys), B_FALSE);
			(void) printf("\nSpill block: %s\n", blkbuf);
		}
		dump_indirect(dn);
	}

	if (verbosity >= 5) {
		/*
		 * Report the list of segments that comprise the object.
		 */
		uint64_t start = 0;
		uint64_t end;
		uint64_t blkfill = 1;
		int minlvl = 1;

		if (dn->dn_type == DMU_OT_DNODE) {
			minlvl = 0;
			blkfill = DNODES_PER_BLOCK;
		}

		for (;;) {
			char segsize[32];
			/* make sure nicenum has enough space */
			_Static_assert(sizeof (segsize) >= NN_NUMBUF_SZ,
			    "segsize truncated");
			/* Advance to the next allocated region... */
			error = dnode_next_offset(dn,
			    0, &start, minlvl, blkfill, 0);
			if (error)
				break;
			end = start;
			/* ...and find the hole that terminates it. */
			error = dnode_next_offset(dn,
			    DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
			zdb_nicenum(end - start, segsize, sizeof (segsize));
			(void) printf("\t\tsegment [%016llx, %016llx)"
			    " size %5s\n", (u_longlong_t)start,
			    (u_longlong_t)end, segsize);
			if (error)
				break;
			start = end;
		}
	}

out:
	if (db != NULL)
		dmu_buf_rele(db, FTAG);
	if (dnode_held)
		dnode_rele(dn, FTAG);
}
3788
/*
 * Record all MOS objects referenced by a dsl_dir for the MOS
 * leak-detection accounting.
 */
static void
count_dir_mos_objects(dsl_dir_t *dd)
{
	mos_obj_refd(dd->dd_object);
	mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj);
	mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj);
	mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj);
	mos_obj_refd(dsl_dir_phys(dd)->dd_clones);

	/*
	 * The dd_crypto_obj can be referenced by multiple dsl_dir's.
	 * Ignore the references after the first one.
	 */
	mos_obj_refd_multiple(dd->dd_crypto_obj);
}
3804
/*
 * Record all MOS objects referenced by a dsl_dataset for the MOS
 * leak-detection accounting.  The containing dsl_dir is counted only
 * for head datasets (snapshots share their head's dir).
 */
static void
count_ds_mos_objects(dsl_dataset_t *ds)
{
	mos_obj_refd(ds->ds_object);
	mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj);
	mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj);
	mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj);
	mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj);
	mos_obj_refd(ds->ds_bookmarks_obj);

	if (!dsl_dataset_is_snapshot(ds)) {
		count_dir_mos_objects(ds->ds_dir);
	}
}
3819
/* Human-readable names indexed by dmu_objset_type_t (see dump_objset()). */
static const char *const objset_types[DMU_OST_NUMTYPES] = {
	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
3822
3823 /*
3824 * Parse a string denoting a range of object IDs of the form
3825 * <start>[:<end>[:flags]], and store the results in zor.
3826 * Return 0 on success. On error, return 1 and update the msg
3827 * pointer to point to a descriptive error message.
3828 */
static int
parse_object_range(char *range, zopt_object_range_t *zor, const char **msg)
{
	uint64_t flags = 0;
	char *p, *s, *dup, *flagstr, *tmp = NULL;
	size_t len;
	int i;
	int rc = 0;

	/* No colon: a single object ID, no type flags. */
	if (strchr(range, ':') == NULL) {
		zor->zor_obj_start = strtoull(range, &p, 0);
		if (*p != '\0') {
			*msg = "Invalid characters in object ID";
			rc = 1;
		}
		zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start);
		zor->zor_obj_end = zor->zor_obj_start;
		return (rc);
	}

	if (strchr(range, ':') == range) {
		*msg = "Invalid leading colon";
		rc = 1;
		return (rc);
	}

	len = strlen(range);
	if (range[len - 1] == ':') {
		*msg = "Invalid trailing colon";
		rc = 1;
		return (rc);
	}

	/* Tokenize a private copy; strtok_r mutates its input. */
	dup = strdup(range);
	s = strtok_r(dup, ":", &tmp);
	zor->zor_obj_start = strtoull(s, &p, 0);

	if (*p != '\0') {
		*msg = "Invalid characters in start object ID";
		rc = 1;
		goto out;
	}

	s = strtok_r(NULL, ":", &tmp);
	zor->zor_obj_end = strtoull(s, &p, 0);

	if (*p != '\0') {
		*msg = "Invalid characters in end object ID";
		rc = 1;
		goto out;
	}

	if (zor->zor_obj_start > zor->zor_obj_end) {
		*msg = "Start object ID may not exceed end object ID";
		rc = 1;
		goto out;
	}

	/* Optional third field: type-selection flags; default is all. */
	s = strtok_r(NULL, ":", &tmp);
	if (s == NULL) {
		zor->zor_flags = ZOR_FLAG_ALL_TYPES;
		goto out;
	} else if (strtok_r(NULL, ":", &tmp) != NULL) {
		*msg = "Invalid colon-delimited field after flags";
		rc = 1;
		goto out;
	}

	/* Accumulate flag bits; '-' negates the following flag letter. */
	flagstr = s;
	for (i = 0; flagstr[i]; i++) {
		int bit;
		boolean_t negation = (flagstr[i] == '-');

		if (negation) {
			i++;
			if (flagstr[i] == '\0') {
				*msg = "Invalid trailing negation operator";
				rc = 1;
				goto out;
			}
		}
		bit = flagbits[(uchar_t)flagstr[i]];
		if (bit == 0) {
			*msg = "Invalid flag";
			rc = 1;
			goto out;
		}
		if (negation)
			flags &= ~bit;
		else
			flags |= bit;
	}
	zor->zor_flags = flags;

	zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start);
	zor->zor_obj_end = ZDB_MAP_OBJECT_ID(zor->zor_obj_end);

out:
	free(dup);
	return (rc);
}
3930
/*
 * Print the header line for an objset and then dump its contents: either
 * the explicitly requested object ranges (zopt_object_ranges), or — at
 * sufficient verbosity — the intent log, deadlists/livelists, bookmarks,
 * and every object, followed by dnode slot usage statistics.
 */
static void
dump_objset(objset_t *os)
{
	dmu_objset_stats_t dds = { 0 };
	uint64_t object, object_count;
	uint64_t refdbytes, usedobjs, scratch;
	char numbuf[32];
	char blkbuf[BP_SPRINTF_LEN + 20];
	char osname[ZFS_MAX_DATASET_NAME_LEN];
	const char *type = "UNKNOWN";
	int verbosity = dump_opt['d'];
	boolean_t print_header;
	unsigned i;
	int error;
	uint64_t total_slots_used = 0;
	uint64_t max_slot_used = 0;
	uint64_t dnode_slots;
	uint64_t obj_start;
	uint64_t obj_end;
	uint64_t flags;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (numbuf) >= NN_NUMBUF_SZ, "numbuf truncated");

	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
	dmu_objset_fast_stat(os, &dds);
	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);

	print_header = B_TRUE;

	if (dds.dds_type < DMU_OST_NUMTYPES)
		type = objset_types[dds.dds_type];

	/* The MOS has no dataset; synthesize its stats from the root bp. */
	if (dds.dds_type == DMU_OST_META) {
		dds.dds_creation_txg = TXG_INITIAL;
		usedobjs = BP_GET_FILL(os->os_rootbp);
		refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
		    dd_used_bytes;
	} else {
		dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
	}

	ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));

	zdb_nicenum(refdbytes, numbuf, sizeof (numbuf));

	if (verbosity >= 4) {
		(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
		(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
		    sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
	} else {
		blkbuf[0] = '\0';
	}

	dmu_objset_name(os, osname);

	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
	    "%s, %llu objects%s%s\n",
	    osname, type, (u_longlong_t)dmu_objset_id(os),
	    (u_longlong_t)dds.dds_creation_txg,
	    numbuf, (u_longlong_t)usedobjs, blkbuf,
	    (dds.dds_inconsistent) ? " (inconsistent)" : "");

	/* Explicit object ranges requested: dump only those and return. */
	for (i = 0; i < zopt_object_args; i++) {
		obj_start = zopt_object_ranges[i].zor_obj_start;
		obj_end = zopt_object_ranges[i].zor_obj_end;
		flags = zopt_object_ranges[i].zor_flags;

		object = obj_start;
		if (object == 0 || obj_start == obj_end)
			dump_object(os, object, verbosity, &print_header, NULL,
			    flags);
		else
			object--;

		while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) &&
		    object <= obj_end) {
			dump_object(os, object, verbosity, &print_header, NULL,
			    flags);
		}
	}

	if (zopt_object_args > 0) {
		(void) printf("\n");
		return;
	}

	if (dump_opt['i'] != 0 || verbosity >= 2)
		dump_intent_log(dmu_objset_zil(os));

	if (dmu_objset_ds(os) != NULL) {
		dsl_dataset_t *ds = dmu_objset_ds(os);
		dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
		if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
		    !dmu_objset_is_snapshot(os)) {
			dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist");
			if (verify_dd_livelist(os) != 0)
				fatal("livelist is incorrect");
		}

		if (dsl_dataset_remap_deadlist_exists(ds)) {
			(void) printf("ds_remap_deadlist:\n");
			dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist");
		}
		count_ds_mos_objects(ds);
	}

	if (dmu_objset_ds(os) != NULL)
		dump_bookmarks(os, verbosity);

	if (verbosity < 2)
		return;

	if (BP_IS_HOLE(os->os_rootbp))
		return;

	/* Object 0 (meta-dnode) plus the pseudo quota-accounting objects. */
	dump_object(os, 0, verbosity, &print_header, NULL, 0);
	object_count = 0;
	if (DMU_USERUSED_DNODE(os) != NULL &&
	    DMU_USERUSED_DNODE(os)->dn_type != 0) {
		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,
		    NULL, 0);
		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,
		    NULL, 0);
	}

	if (DMU_PROJECTUSED_DNODE(os) != NULL &&
	    DMU_PROJECTUSED_DNODE(os)->dn_type != 0)
		dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity,
		    &print_header, NULL, 0);

	/* Walk every allocated object, accumulating dnode slot usage. */
	object = 0;
	while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
		dump_object(os, object, verbosity, &print_header, &dnode_slots,
		    0);
		object_count++;
		total_slots_used += dnode_slots;
		max_slot_used = object + dnode_slots - 1;
	}

	(void) printf("\n");

	(void) printf(" Dnode slots:\n");
	(void) printf("\tTotal used: %10llu\n",
	    (u_longlong_t)total_slots_used);
	(void) printf("\tMax used: %10llu\n",
	    (u_longlong_t)max_slot_used);
	(void) printf("\tPercent empty: %10lf\n",
	    (double)(max_slot_used - total_slots_used)*100 /
	    (double)max_slot_used);
	(void) printf("\n");

	/* ESRCH is the normal end-of-objset indication. */
	if (error != ESRCH) {
		(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
		abort();
	}

	ASSERT3U(object_count, ==, usedobjs);

	if (leaked_objects != 0) {
		(void) printf("%d potentially leaked objects detected\n",
		    leaked_objects);
		leaked_objects = 0;
	}
}
4096
/*
 * Print the fields of an uberblock, bracketed by optional header/footer
 * strings.  MMP fields are shown only when the MMP magic is valid, and
 * the root block pointer only at -u verbosity >= 4.
 */
static void
dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
{
	time_t timestamp = ub->ub_timestamp;

	(void) printf("%s", header ? header : "");
	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
	(void) printf("\ttimestamp = %llu UTC = %s",
	    (u_longlong_t)ub->ub_timestamp, ctime(&timestamp));

	(void) printf("\tmmp_magic = %016llx\n",
	    (u_longlong_t)ub->ub_mmp_magic);
	if (MMP_VALID(ub)) {
		(void) printf("\tmmp_delay = %0llu\n",
		    (u_longlong_t)ub->ub_mmp_delay);
		if (MMP_SEQ_VALID(ub))
			(void) printf("\tmmp_seq = %u\n",
			    (unsigned int) MMP_SEQ(ub));
		if (MMP_FAIL_INT_VALID(ub))
			(void) printf("\tmmp_fail = %u\n",
			    (unsigned int) MMP_FAIL_INT(ub));
		if (MMP_INTERVAL_VALID(ub))
			(void) printf("\tmmp_write = %u\n",
			    (unsigned int) MMP_INTERVAL(ub));
		/* After MMP_* to make summarize_uberblock_mmp cleaner */
		(void) printf("\tmmp_valid = %x\n",
		    (unsigned int) ub->ub_mmp_config & 0xFF);
	}

	if (dump_opt['u'] >= 4) {
		char blkbuf[BP_SPRINTF_LEN];
		snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
		(void) printf("\trootbp = %s\n", blkbuf);
	}
	(void) printf("\tcheckpoint_txg = %llu\n",
	    (u_longlong_t)ub->ub_checkpoint_txg);
	(void) printf("%s", footer ? footer : "");
}
4138
4139 static void
4140 dump_config(spa_t *spa)
4141 {
4142 dmu_buf_t *db;
4143 size_t nvsize = 0;
4144 int error = 0;
4145
4146
4147 error = dmu_bonus_hold(spa->spa_meta_objset,
4148 spa->spa_config_object, FTAG, &db);
4149
4150 if (error == 0) {
4151 nvsize = *(uint64_t *)db->db_data;
4152 dmu_buf_rele(db, FTAG);
4153
4154 (void) printf("\nMOS Configuration:\n");
4155 dump_packed_nvlist(spa->spa_meta_objset,
4156 spa->spa_config_object, (void *)&nvsize, 1);
4157 } else {
4158 (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d",
4159 (u_longlong_t)spa->spa_config_object, error);
4160 }
4161 }
4162
4163 static void
4164 dump_cachefile(const char *cachefile)
4165 {
4166 int fd;
4167 struct stat64 statbuf;
4168 char *buf;
4169 nvlist_t *config;
4170
4171 if ((fd = open64(cachefile, O_RDONLY)) < 0) {
4172 (void) printf("cannot open '%s': %s\n", cachefile,
4173 strerror(errno));
4174 exit(1);
4175 }
4176
4177 if (fstat64(fd, &statbuf) != 0) {
4178 (void) printf("failed to stat '%s': %s\n", cachefile,
4179 strerror(errno));
4180 exit(1);
4181 }
4182
4183 if ((buf = malloc(statbuf.st_size)) == NULL) {
4184 (void) fprintf(stderr, "failed to allocate %llu bytes\n",
4185 (u_longlong_t)statbuf.st_size);
4186 exit(1);
4187 }
4188
4189 if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
4190 (void) fprintf(stderr, "failed to read %llu bytes\n",
4191 (u_longlong_t)statbuf.st_size);
4192 exit(1);
4193 }
4194
4195 (void) close(fd);
4196
4197 if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
4198 (void) fprintf(stderr, "failed to unpack nvlist\n");
4199 exit(1);
4200 }
4201
4202 free(buf);
4203
4204 dump_nvlist(config, 0);
4205
4206 nvlist_free(config);
4207 }
4208
4209 /*
4210 * ZFS label nvlist stats
4211 */
4212 typedef struct zdb_nvl_stats {
4213 int zns_list_count;
4214 int zns_leaf_count;
4215 size_t zns_leaf_largest;
4216 size_t zns_leaf_total;
4217 nvlist_t *zns_string;
4218 nvlist_t *zns_uint64;
4219 nvlist_t *zns_boolean;
4220 } zdb_nvl_stats_t;
4221
4222 static void
4223 collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats)
4224 {
4225 nvlist_t *list, **array;
4226 nvpair_t *nvp = NULL;
4227 const char *name;
4228 uint_t i, items;
4229
4230 stats->zns_list_count++;
4231
4232 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4233 name = nvpair_name(nvp);
4234
4235 switch (nvpair_type(nvp)) {
4236 case DATA_TYPE_STRING:
4237 fnvlist_add_string(stats->zns_string, name,
4238 fnvpair_value_string(nvp));
4239 break;
4240 case DATA_TYPE_UINT64:
4241 fnvlist_add_uint64(stats->zns_uint64, name,
4242 fnvpair_value_uint64(nvp));
4243 break;
4244 case DATA_TYPE_BOOLEAN:
4245 fnvlist_add_boolean(stats->zns_boolean, name);
4246 break;
4247 case DATA_TYPE_NVLIST:
4248 if (nvpair_value_nvlist(nvp, &list) == 0)
4249 collect_nvlist_stats(list, stats);
4250 break;
4251 case DATA_TYPE_NVLIST_ARRAY:
4252 if (nvpair_value_nvlist_array(nvp, &array, &items) != 0)
4253 break;
4254
4255 for (i = 0; i < items; i++) {
4256 collect_nvlist_stats(array[i], stats);
4257
4258 /* collect stats on leaf vdev */
4259 if (strcmp(name, "children") == 0) {
4260 size_t size;
4261
4262 (void) nvlist_size(array[i], &size,
4263 NV_ENCODE_XDR);
4264 stats->zns_leaf_total += size;
4265 if (size > stats->zns_leaf_largest)
4266 stats->zns_leaf_largest = size;
4267 stats->zns_leaf_count++;
4268 }
4269 }
4270 break;
4271 default:
4272 (void) printf("skip type %d!\n", (int)nvpair_type(nvp));
4273 }
4274 }
4275 }
4276
/*
 * Print a breakdown of how the 'cap' bytes of label config space are
 * consumed by nvlist 'nvl': bytes attributable to integers, strings,
 * booleans, and nvlist framing overhead, plus leaf vdev statistics.
 */
static void
dump_nvlist_stats(nvlist_t *nvl, size_t cap)
{
	zdb_nvl_stats_t stats = { 0 };
	size_t size, sum = 0, total;
	size_t noise;

	/* requires nvlist with non-unique names for stat collection */
	VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0));
	VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0));
	VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0));
	/* XDR size of an empty nvlist; subtracted below as framing noise */
	VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR));

	(void) printf("\n\nZFS Label NVList Config Stats:\n");

	VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR));
	(void) printf(" %d bytes used, %d bytes free (using %4.1f%%)\n\n",
	    (int)total, (int)(cap - total), 100.0 * total / cap);

	collect_nvlist_stats(nvl, &stats);

	VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR));
	size -= noise;
	sum += size;
	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:",
	    (int)fnvlist_num_pairs(stats.zns_uint64),
	    (int)size, 100.0 * size / total);

	VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR));
	size -= noise;
	sum += size;
	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "strings:",
	    (int)fnvlist_num_pairs(stats.zns_string),
	    (int)size, 100.0 * size / total);

	VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR));
	size -= noise;
	sum += size;
	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:",
	    (int)fnvlist_num_pairs(stats.zns_boolean),
	    (int)size, 100.0 * size / total);

	size = total - sum;	/* treat remainder as nvlist overhead */
	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:",
	    stats.zns_list_count, (int)size, 100.0 * size / total);

	if (stats.zns_leaf_count > 0) {
		size_t average = stats.zns_leaf_total / stats.zns_leaf_count;

		(void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:",
		    stats.zns_leaf_count, (int)average);
		(void) printf("%24d bytes largest\n",
		    (int)stats.zns_leaf_largest);

		/* estimate how many more leaves fit in the remaining space */
		if (dump_opt['l'] >= 3 && average > 0)
			(void) printf(" space for %d additional leaf vdevs\n",
			    (int)((cap - total) / average));
	}
	(void) printf("\n");

	nvlist_free(stats.zns_string);
	nvlist_free(stats.zns_uint64);
	nvlist_free(stats.zns_boolean);
}
4341
/*
 * One unique checksum value observed while scanning a device's labels,
 * with a per-label bitmap recording which labels contained it.  Kept in
 * an AVL tree to deduplicate identical configs/uberblocks across labels.
 */
typedef struct cksum_record {
	zio_cksum_t cksum;
	boolean_t labels[VDEV_LABELS];	/* labels carrying this checksum */
	avl_node_t link;
} cksum_record_t;
4347
4348 static int
4349 cksum_record_compare(const void *x1, const void *x2)
4350 {
4351 const cksum_record_t *l = (cksum_record_t *)x1;
4352 const cksum_record_t *r = (cksum_record_t *)x2;
4353 int arraysize = ARRAY_SIZE(l->cksum.zc_word);
4354 int difference = 0;
4355
4356 for (int i = 0; i < arraysize; i++) {
4357 difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]);
4358 if (difference)
4359 break;
4360 }
4361
4362 return (difference);
4363 }
4364
4365 static cksum_record_t *
4366 cksum_record_alloc(zio_cksum_t *cksum, int l)
4367 {
4368 cksum_record_t *rec;
4369
4370 rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL);
4371 rec->cksum = *cksum;
4372 rec->labels[l] = B_TRUE;
4373
4374 return (rec);
4375 }
4376
4377 static cksum_record_t *
4378 cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum)
4379 {
4380 cksum_record_t lookup = { .cksum = *cksum };
4381 avl_index_t where;
4382
4383 return (avl_find(tree, &lookup, &where));
4384 }
4385
4386 static cksum_record_t *
4387 cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l)
4388 {
4389 cksum_record_t *rec;
4390
4391 rec = cksum_record_lookup(tree, cksum);
4392 if (rec) {
4393 rec->labels[l] = B_TRUE;
4394 } else {
4395 rec = cksum_record_alloc(cksum, l);
4396 avl_add(tree, rec);
4397 }
4398
4399 return (rec);
4400 }
4401
4402 static int
4403 first_label(cksum_record_t *rec)
4404 {
4405 for (int i = 0; i < VDEV_LABELS; i++)
4406 if (rec->labels[i])
4407 return (i);
4408
4409 return (-1);
4410 }
4411
4412 static void
4413 print_label_numbers(const char *prefix, const cksum_record_t *rec)
4414 {
4415 fputs(prefix, stdout);
4416 for (int i = 0; i < VDEV_LABELS; i++)
4417 if (rec->labels[i] == B_TRUE)
4418 printf("%d ", i);
4419 putchar('\n');
4420 }
4421
/* Maximum number of uberblocks one label's uberblock ring can hold. */
#define MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT)

/* State kept per vdev label while scanning a device in dump_label(). */
typedef struct zdb_label {
	vdev_label_t label;	/* raw label bytes as read from disk */
	uint64_t label_offset;	/* byte offset of this label on the device */
	nvlist_t *config_nv;	/* unpacked config nvlist, or NULL on failure */
	cksum_record_t *config;	/* dedup record for the config checksum */
	cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT];
	boolean_t header_printed;	/* LABEL banner already emitted */
	boolean_t read_failed;	/* pread of the label failed */
	boolean_t cksum_valid;	/* embedded vdev_phys checksum verified */
} zdb_label_t;
4434
4435 static void
4436 print_label_header(zdb_label_t *label, int l)
4437 {
4438
4439 if (dump_opt['q'])
4440 return;
4441
4442 if (label->header_printed == B_TRUE)
4443 return;
4444
4445 (void) printf("------------------------------------\n");
4446 (void) printf("LABEL %d %s\n", l,
4447 label->cksum_valid ? "" : "(Bad label cksum)");
4448 (void) printf("------------------------------------\n");
4449
4450 label->header_printed = B_TRUE;
4451 }
4452
/* Emit the banner preceding the L2ARC device header dump. */
static void
print_l2arc_header(void)
{
	(void) printf(
	    "------------------------------------\n"
	    "L2ARC device header\n"
	    "------------------------------------\n");
}
4460
/* Emit the banner preceding the L2ARC log block dump. */
static void
print_l2arc_log_blocks(void)
{
	(void) printf(
	    "------------------------------------\n"
	    "L2ARC device log blocks\n"
	    "------------------------------------\n");
}
4468
4469 static void
4470 dump_l2arc_log_entries(uint64_t log_entries,
4471 l2arc_log_ent_phys_t *le, uint64_t i)
4472 {
4473 for (int j = 0; j < log_entries; j++) {
4474 dva_t dva = le[j].le_dva;
4475 (void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, "
4476 "vdev: %llu, offset: %llu\n",
4477 (u_longlong_t)i, j + 1,
4478 (u_longlong_t)DVA_GET_ASIZE(&dva),
4479 (u_longlong_t)DVA_GET_VDEV(&dva),
4480 (u_longlong_t)DVA_GET_OFFSET(&dva));
4481 (void) printf("|\t\t\t\tbirth: %llu\n",
4482 (u_longlong_t)le[j].le_birth);
4483 (void) printf("|\t\t\t\tlsize: %llu\n",
4484 (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop));
4485 (void) printf("|\t\t\t\tpsize: %llu\n",
4486 (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop));
4487 (void) printf("|\t\t\t\tcompr: %llu\n",
4488 (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop));
4489 (void) printf("|\t\t\t\tcomplevel: %llu\n",
4490 (u_longlong_t)(&le[j])->le_complevel);
4491 (void) printf("|\t\t\t\ttype: %llu\n",
4492 (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop));
4493 (void) printf("|\t\t\t\tprotected: %llu\n",
4494 (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop));
4495 (void) printf("|\t\t\t\tprefetch: %llu\n",
4496 (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop));
4497 (void) printf("|\t\t\t\taddress: %llu\n",
4498 (u_longlong_t)le[j].le_daddr);
4499 (void) printf("|\t\t\t\tARC state: %llu\n",
4500 (u_longlong_t)L2BLK_GET_STATE((&le[j])->le_prop));
4501 (void) printf("|\n");
4502 }
4503 (void) printf("\n");
4504 }
4505
/*
 * Print the fields of one L2ARC log block pointer.
 */
static void
dump_l2arc_log_blkptr(const l2arc_log_blkptr_t *lbps)
{
	(void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps->lbp_daddr);
	(void) printf("|\t\tpayload_asize: %llu\n",
	    (u_longlong_t)lbps->lbp_payload_asize);
	(void) printf("|\t\tpayload_start: %llu\n",
	    (u_longlong_t)lbps->lbp_payload_start);
	(void) printf("|\t\tlsize: %llu\n",
	    (u_longlong_t)L2BLK_GET_LSIZE(lbps->lbp_prop));
	(void) printf("|\t\tasize: %llu\n",
	    (u_longlong_t)L2BLK_GET_PSIZE(lbps->lbp_prop));
	(void) printf("|\t\tcompralgo: %llu\n",
	    (u_longlong_t)L2BLK_GET_COMPRESS(lbps->lbp_prop));
	(void) printf("|\t\tcksumalgo: %llu\n",
	    (u_longlong_t)L2BLK_GET_CHECKSUM(lbps->lbp_prop));
	(void) printf("|\n\n");
}
4524
/*
 * Walk the chain of L2ARC log blocks starting at the device header's
 * first log block pointer, emulating l2arc_rebuild().  Each block is
 * read from 'fd', checksummed, decompressed if necessary, byteswapped
 * if written by the other endianness, and optionally printed.  Counts
 * of valid blocks and their aligned sizes are accumulated into
 * 'rebuild' for the caller to compare against the on-disk header.
 */
static void
dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,
    l2arc_dev_hdr_phys_t *rebuild)
{
	l2arc_log_blk_phys_t this_lb;
	uint64_t asize;
	l2arc_log_blkptr_t lbps[2];
	abd_t *abd;
	zio_cksum_t cksum;
	int failed = 0;
	l2arc_dev_t dev;

	if (!dump_opt['q'])
		print_l2arc_log_blocks();
	memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps));

	/* minimal fake l2arc_dev_t, filled with just the fields used below */
	dev.l2ad_evict = l2dhdr->dh_evict;
	dev.l2ad_start = l2dhdr->dh_start;
	dev.l2ad_end = l2dhdr->dh_end;

	if (l2dhdr->dh_start_lbps[0].lbp_daddr == 0) {
		/* no log blocks to read */
		if (!dump_opt['q']) {
			(void) printf("No log blocks to read\n");
			(void) printf("\n");
		}
		return;
	} else {
		dev.l2ad_hand = lbps[0].lbp_daddr +
		    L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
	}

	dev.l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);

	for (;;) {
		if (!l2arc_log_blkptr_valid(&dev, &lbps[0]))
			break;

		/* L2BLK_GET_PSIZE returns aligned size for log blocks */
		asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
		if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) {
			if (!dump_opt['q']) {
				(void) printf("Error while reading next log "
				    "block\n\n");
			}
			break;
		}

		fletcher_4_native_varsize(&this_lb, asize, &cksum);
		if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) {
			failed++;
			if (!dump_opt['q']) {
				(void) printf("Invalid cksum\n");
				dump_l2arc_log_blkptr(&lbps[0]);
			}
			break;
		}

		switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) {
		case ZIO_COMPRESS_OFF:
			break;
		default:
			/* decompress the log block in place */
			abd = abd_alloc_for_io(asize, B_TRUE);
			abd_copy_from_buf_off(abd, &this_lb, 0, asize);
			if (zio_decompress_data(L2BLK_GET_COMPRESS(
			    (&lbps[0])->lbp_prop), abd, &this_lb,
			    asize, sizeof (this_lb), NULL) != 0) {
				(void) printf("L2ARC block decompression "
				    "failed\n");
				abd_free(abd);
				goto out;
			}
			abd_free(abd);
			break;
		}

		if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
			byteswap_uint64_array(&this_lb, sizeof (this_lb));
		if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) {
			if (!dump_opt['q'])
				(void) printf("Invalid log block magic\n\n");
			break;
		}

		rebuild->dh_lb_count++;
		rebuild->dh_lb_asize += asize;
		if (dump_opt['l'] > 1 && !dump_opt['q']) {
			(void) printf("lb[%4llu]\tmagic: %llu\n",
			    (u_longlong_t)rebuild->dh_lb_count,
			    (u_longlong_t)this_lb.lb_magic);
			dump_l2arc_log_blkptr(&lbps[0]);
		}

		if (dump_opt['l'] > 2 && !dump_opt['q'])
			dump_l2arc_log_entries(l2dhdr->dh_log_entries,
			    this_lb.lb_entries,
			    rebuild->dh_lb_count);

		/* stop once the chain overlaps the eviction hand */
		if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
		    lbps[0].lbp_payload_start, dev.l2ad_evict) &&
		    !dev.l2ad_first)
			break;

		/* follow the back-pointer to the previous log block */
		lbps[0] = lbps[1];
		lbps[1] = this_lb.lb_prev_lbp;
	}
out:
	if (!dump_opt['q']) {
		(void) printf("log_blk_count:\t %llu with valid cksum\n",
		    (u_longlong_t)rebuild->dh_lb_count);
		(void) printf("\t\t %d with invalid cksum\n", failed);
		(void) printf("log_blk_asize:\t %llu\n\n",
		    (u_longlong_t)rebuild->dh_lb_asize);
	}
}
4640
/*
 * Read and print the L2ARC device header located just past the vdev
 * labels, then walk its chain of log blocks.  Returns 1 if the header
 * accounts for more log blocks/bytes than a rebuild would find (which
 * indicates a leak in log block accounting), otherwise 0.
 */
static int
dump_l2arc_header(int fd)
{
	l2arc_dev_hdr_phys_t l2dhdr = {0}, rebuild = {0};
	int error = B_FALSE;

	if (pread64(fd, &l2dhdr, sizeof (l2dhdr),
	    VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) {
		error = B_TRUE;
	} else {
		/* the header may have been written by either endianness */
		if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
			byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr));

		if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC)
			error = B_TRUE;
	}

	if (error) {
		(void) printf("L2ARC device header not found\n\n");
		/* Do not return an error here for backward compatibility */
		return (0);
	} else if (!dump_opt['q']) {
		print_l2arc_header();

		(void) printf(" magic: %llu\n",
		    (u_longlong_t)l2dhdr.dh_magic);
		(void) printf(" version: %llu\n",
		    (u_longlong_t)l2dhdr.dh_version);
		(void) printf(" pool_guid: %llu\n",
		    (u_longlong_t)l2dhdr.dh_spa_guid);
		(void) printf(" flags: %llu\n",
		    (u_longlong_t)l2dhdr.dh_flags);
		(void) printf(" start_lbps[0]: %llu\n",
		    (u_longlong_t)
		    l2dhdr.dh_start_lbps[0].lbp_daddr);
		(void) printf(" start_lbps[1]: %llu\n",
		    (u_longlong_t)
		    l2dhdr.dh_start_lbps[1].lbp_daddr);
		(void) printf(" log_blk_ent: %llu\n",
		    (u_longlong_t)l2dhdr.dh_log_entries);
		(void) printf(" start: %llu\n",
		    (u_longlong_t)l2dhdr.dh_start);
		(void) printf(" end: %llu\n",
		    (u_longlong_t)l2dhdr.dh_end);
		(void) printf(" evict: %llu\n",
		    (u_longlong_t)l2dhdr.dh_evict);
		(void) printf(" lb_asize_refcount: %llu\n",
		    (u_longlong_t)l2dhdr.dh_lb_asize);
		(void) printf(" lb_count_refcount: %llu\n",
		    (u_longlong_t)l2dhdr.dh_lb_count);
		(void) printf(" trim_action_time: %llu\n",
		    (u_longlong_t)l2dhdr.dh_trim_action_time);
		(void) printf(" trim_state: %llu\n\n",
		    (u_longlong_t)l2dhdr.dh_trim_state);
	}

	dump_l2arc_log_blocks(fd, &l2dhdr, &rebuild);
	/*
	 * The total aligned size of log blocks and the number of log blocks
	 * reported in the header of the device may be less than what zdb
	 * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild().
	 * This happens because dump_l2arc_log_blocks() lacks the memory
	 * pressure valve that l2arc_rebuild() has. Thus, if we are on a system
	 * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize
	 * and dh_lb_count will be lower to begin with than what exists on the
	 * device. This is normal and zdb should not exit with an error. The
	 * opposite case should never happen though, the values reported in the
	 * header should never be higher than what dump_l2arc_log_blocks() and
	 * l2arc_rebuild() report. If this happens there is a leak in the
	 * accounting of log blocks.
	 */
	if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize ||
	    l2dhdr.dh_lb_count > rebuild.dh_lb_count)
		return (1);

	return (0);
}
4718
4719 static void
4720 dump_config_from_label(zdb_label_t *label, size_t buflen, int l)
4721 {
4722 if (dump_opt['q'])
4723 return;
4724
4725 if ((dump_opt['l'] < 3) && (first_label(label->config) != l))
4726 return;
4727
4728 print_label_header(label, l);
4729 dump_nvlist(label->config_nv, 4);
4730 print_label_numbers(" labels = ", label->config);
4731
4732 if (dump_opt['l'] >= 2)
4733 dump_nvlist_stats(label->config_nv, buflen);
4734 }
4735
/* Size of the per-uberblock banner written into 'header' below. */
#define ZDB_MAX_UB_HEADER_SIZE 32

/*
 * Print the uberblocks recorded for one label.  A stack vdev_t is
 * populated with only the fields (ashift, top pointer) consulted by the
 * VDEV_UBERBLOCK_* macros; the rest is intentionally left unset.
 */
static void
dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num)
{
	vdev_t vd;
	char header[ZDB_MAX_UB_HEADER_SIZE];

	vd.vdev_ashift = ashift;
	vd.vdev_top = &vd;

	for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {
		uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);
		uberblock_t *ub = (void *)((char *)&label->label + uoff);
		cksum_record_t *rec = label->uberblocks[i];

		/* a NULL record means this slot failed uberblock_verify() */
		if (rec == NULL) {
			if (dump_opt['u'] >= 2) {
				print_label_header(label, label_num);
				(void) printf(" Uberblock[%d] invalid\n", i);
			}
			continue;
		}

		/* below verbosity 3, print each unique uberblock only once */
		if ((dump_opt['u'] < 3) && (first_label(rec) != label_num))
			continue;

		/* skip MMP-only uberblock slots below verbosity 4 */
		if ((dump_opt['u'] < 4) &&
		    (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay &&
		    (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL))
			continue;

		print_label_header(label, label_num);
		(void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
		    " Uberblock[%d]\n", i);
		dump_uberblock(ub, header, "");
		print_label_numbers(" labels = ", rec);
	}
}
4776
/* Running human-readable path, built up for error/verbose messages. */
static char curpath[PATH_MAX];

/*
 * Iterate through the path components, recursively passing
 * current one's obj and remaining path until we find the obj
 * for the last one.
 *
 * 'name' is the remaining path relative to directory object 'obj'; it
 * is modified in place ('/' separators are overwritten with NULs).  If
 * 'retobj' is non-NULL the final object number is stored there instead
 * of dumping the object.  Returns 0 on success or an errno value.
 */
static int
dump_path_impl(objset_t *os, uint64_t obj, char *name, uint64_t *retobj)
{
	int err;
	boolean_t header = B_TRUE;
	uint64_t child_obj;
	char *s;
	dmu_buf_t *db;
	dmu_object_info_t doi;

	/* split off the first path component */
	if ((s = strchr(name, '/')) != NULL)
		*s = '\0';
	err = zap_lookup(os, obj, name, 8, 1, &child_obj);

	(void) strlcat(curpath, name, sizeof (curpath));

	if (err != 0) {
		(void) fprintf(stderr, "failed to lookup %s: %s\n",
		    curpath, strerror(err));
		return (err);
	}

	/* the ZAP value encodes type bits along with the object number */
	child_obj = ZFS_DIRENT_OBJ(child_obj);
	err = sa_buf_hold(os, child_obj, FTAG, &db);
	if (err != 0) {
		(void) fprintf(stderr,
		    "failed to get SA dbuf for obj %llu: %s\n",
		    (u_longlong_t)child_obj, strerror(err));
		return (EINVAL);
	}
	dmu_object_info_from_db(db, &doi);
	sa_buf_rele(db, FTAG);

	if (doi.doi_bonus_type != DMU_OT_SA &&
	    doi.doi_bonus_type != DMU_OT_ZNODE) {
		(void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",
		    doi.doi_bonus_type, (u_longlong_t)child_obj);
		return (EINVAL);
	}

	if (dump_opt['v'] > 6) {
		(void) printf("obj=%llu %s type=%d bonustype=%d\n",
		    (u_longlong_t)child_obj, curpath, doi.doi_type,
		    doi.doi_bonus_type);
	}

	(void) strlcat(curpath, "/", sizeof (curpath));

	switch (doi.doi_type) {
	case DMU_OT_DIRECTORY_CONTENTS:
		/* recurse on the remainder of the path, if any */
		if (s != NULL && *(s + 1) != '\0')
			return (dump_path_impl(os, child_obj, s + 1, retobj));
		zfs_fallthrough;
	case DMU_OT_PLAIN_FILE_CONTENTS:
		if (retobj != NULL) {
			*retobj = child_obj;
		} else {
			dump_object(os, child_obj, dump_opt['v'], &header,
			    NULL, 0);
		}
		return (0);
	default:
		(void) fprintf(stderr, "object %llu has non-file/directory "
		    "type %d\n", (u_longlong_t)obj, doi.doi_type);
		break;
	}

	return (EINVAL);
}
4853
4854 /*
4855 * Dump the blocks for the object specified by path inside the dataset.
4856 */
4857 static int
4858 dump_path(char *ds, char *path, uint64_t *retobj)
4859 {
4860 int err;
4861 objset_t *os;
4862 uint64_t root_obj;
4863
4864 err = open_objset(ds, FTAG, &os);
4865 if (err != 0)
4866 return (err);
4867
4868 err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj);
4869 if (err != 0) {
4870 (void) fprintf(stderr, "can't lookup root znode: %s\n",
4871 strerror(err));
4872 close_objset(os, FTAG);
4873 return (EINVAL);
4874 }
4875
4876 (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds);
4877
4878 err = dump_path_impl(os, root_obj, path, retobj);
4879
4880 close_objset(os, FTAG);
4881 return (err);
4882 }
4883
4884 static int
4885 dump_backup_bytes(objset_t *os, void *buf, int len, void *arg)
4886 {
4887 const char *p = (const char *)buf;
4888 ssize_t nwritten;
4889
4890 (void) os;
4891 (void) arg;
4892
4893 /* Write the data out, handling short writes and signals. */
4894 while ((nwritten = write(STDOUT_FILENO, p, len)) < len) {
4895 if (nwritten < 0) {
4896 if (errno == EINTR)
4897 continue;
4898 return (errno);
4899 }
4900 p += nwritten;
4901 len -= nwritten;
4902 }
4903
4904 return (0);
4905 }
4906
4907 static void
4908 dump_backup(const char *pool, uint64_t objset_id, const char *flagstr)
4909 {
4910 boolean_t embed = B_FALSE;
4911 boolean_t large_block = B_FALSE;
4912 boolean_t compress = B_FALSE;
4913 boolean_t raw = B_FALSE;
4914
4915 const char *c;
4916 for (c = flagstr; c != NULL && *c != '\0'; c++) {
4917 switch (*c) {
4918 case 'e':
4919 embed = B_TRUE;
4920 break;
4921 case 'L':
4922 large_block = B_TRUE;
4923 break;
4924 case 'c':
4925 compress = B_TRUE;
4926 break;
4927 case 'w':
4928 raw = B_TRUE;
4929 break;
4930 default:
4931 fprintf(stderr, "dump_backup: invalid flag "
4932 "'%c'\n", *c);
4933 return;
4934 }
4935 }
4936
4937 if (isatty(STDOUT_FILENO)) {
4938 fprintf(stderr, "dump_backup: stream cannot be written "
4939 "to a terminal\n");
4940 return;
4941 }
4942
4943 offset_t off = 0;
4944 dmu_send_outparams_t out = {
4945 .dso_outfunc = dump_backup_bytes,
4946 .dso_dryrun = B_FALSE,
4947 };
4948
4949 int err = dmu_send_obj(pool, objset_id, /* fromsnap */0, embed,
4950 large_block, compress, raw, /* saved */ B_FALSE, STDOUT_FILENO,
4951 &off, &out);
4952 if (err != 0) {
4953 fprintf(stderr, "dump_backup: dmu_send_obj: %s\n",
4954 strerror(err));
4955 return;
4956 }
4957 }
4958
4959 static int
4960 zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile)
4961 {
4962 int err = 0;
4963 uint64_t size, readsize, oursize, offset;
4964 ssize_t writesize;
4965 sa_handle_t *hdl;
4966
4967 (void) printf("Copying object %" PRIu64 " to file %s\n", srcobj,
4968 destfile);
4969
4970 VERIFY3P(os, ==, sa_os);
4971 if ((err = sa_handle_get(os, srcobj, NULL, SA_HDL_PRIVATE, &hdl))) {
4972 (void) printf("Failed to get handle for SA znode\n");
4973 return (err);
4974 }
4975 if ((err = sa_lookup(hdl, sa_attr_table[ZPL_SIZE], &size, 8))) {
4976 (void) sa_handle_destroy(hdl);
4977 return (err);
4978 }
4979 (void) sa_handle_destroy(hdl);
4980
4981 (void) printf("Object %" PRIu64 " is %" PRIu64 " bytes\n", srcobj,
4982 size);
4983 if (size == 0) {
4984 return (EINVAL);
4985 }
4986
4987 int fd = open(destfile, O_WRONLY | O_CREAT | O_TRUNC, 0644);
4988 if (fd == -1)
4989 return (errno);
4990 /*
4991 * We cap the size at 1 mebibyte here to prevent
4992 * allocation failures and nigh-infinite printing if the
4993 * object is extremely large.
4994 */
4995 oursize = MIN(size, 1 << 20);
4996 offset = 0;
4997 char *buf = kmem_alloc(oursize, KM_NOSLEEP);
4998 if (buf == NULL) {
4999 (void) close(fd);
5000 return (ENOMEM);
5001 }
5002
5003 while (offset < size) {
5004 readsize = MIN(size - offset, 1 << 20);
5005 err = dmu_read(os, srcobj, offset, readsize, buf, 0);
5006 if (err != 0) {
5007 (void) printf("got error %u from dmu_read\n", err);
5008 kmem_free(buf, oursize);
5009 (void) close(fd);
5010 return (err);
5011 }
5012 if (dump_opt['v'] > 3) {
5013 (void) printf("Read offset=%" PRIu64 " size=%" PRIu64
5014 " error=%d\n", offset, readsize, err);
5015 }
5016
5017 writesize = write(fd, buf, readsize);
5018 if (writesize < 0) {
5019 err = errno;
5020 break;
5021 } else if (writesize != readsize) {
5022 /* Incomplete write */
5023 (void) fprintf(stderr, "Short write, only wrote %llu of"
5024 " %" PRIu64 " bytes, exiting...\n",
5025 (u_longlong_t)writesize, readsize);
5026 break;
5027 }
5028
5029 offset += readsize;
5030 }
5031
5032 (void) close(fd);
5033
5034 if (buf != NULL)
5035 kmem_free(buf, oursize);
5036
5037 return (err);
5038 }
5039
/*
 * Verify the embedded checksum of the vdev_phys portion of a label that
 * was read from byte 'offset' on disk.  Handles labels written by hosts
 * of either endianness.  Returns B_TRUE if the checksum matches.
 */
static boolean_t
label_cksum_valid(vdev_label_t *label, uint64_t offset)
{
	zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL];
	zio_cksum_t expected_cksum;
	zio_cksum_t actual_cksum;
	zio_cksum_t verifier;
	zio_eck_t *eck;
	int byteswap;

	void *data = (char *)label + offsetof(vdev_label_t, vl_vdev_phys);
	/* the embedded checksum trailer sits at the end of vdev_phys */
	eck = (zio_eck_t *)((char *)(data) + VDEV_PHYS_SIZE) - 1;

	/* the verifier is salted with the region's on-disk offset */
	offset += offsetof(vdev_label_t, vl_vdev_phys);
	ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0);

	byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
	if (byteswap)
		byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

	/*
	 * Temporarily replace the stored checksum with the verifier,
	 * matching the state the buffer was in when first checksummed.
	 */
	expected_cksum = eck->zec_cksum;
	eck->zec_cksum = verifier;

	abd_t *abd = abd_get_from_buf(data, VDEV_PHYS_SIZE);
	ci->ci_func[byteswap](abd, VDEV_PHYS_SIZE, NULL, &actual_cksum);
	abd_free(abd);

	if (byteswap)
		byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t));

	if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
		return (B_TRUE);

	return (B_FALSE);
}
5075
/*
 * Read and display all four vdev labels (and, for cache devices, the
 * L2ARC header) from device or file 'dev'.  Configs and uberblocks that
 * are identical across labels are deduplicated by checksum so each
 * unique instance is printed once, annotated with the labels that
 * contain it.
 *
 * Returns 0 on success, 1 if any label was unreadable or unparseable,
 * and 2 if no valid config was found at all.
 */
static int
dump_label(const char *dev)
{
	char path[MAXPATHLEN];
	zdb_label_t labels[VDEV_LABELS] = {{{{0}}}};
	uint64_t psize, ashift, l2cache;
	struct stat64 statbuf;
	boolean_t config_found = B_FALSE;
	boolean_t error = B_FALSE;
	boolean_t read_l2arc_header = B_FALSE;
	avl_tree_t config_tree;
	avl_tree_t uberblock_tree;
	void *node, *cookie;
	int fd;

	/*
	 * Check if we were given absolute path and use it as is.
	 * Otherwise if the provided vdev name doesn't point to a file,
	 * try prepending expected disk paths and partition numbers.
	 */
	(void) strlcpy(path, dev, sizeof (path));
	if (dev[0] != '/' && stat64(path, &statbuf) != 0) {
		int error;

		error = zfs_resolve_shortname(dev, path, MAXPATHLEN);
		if (error == 0 && zfs_dev_is_whole_disk(path)) {
			if (zfs_append_partition(path, MAXPATHLEN) == -1)
				error = ENOENT;
		}

		if (error || (stat64(path, &statbuf) != 0)) {
			(void) printf("failed to find device %s, try "
			    "specifying absolute path instead\n", dev);
			return (1);
		}
	}

	if ((fd = open64(path, O_RDONLY)) < 0) {
		(void) printf("cannot open '%s': %s\n", path, strerror(errno));
		exit(1);
	}

	if (fstat64_blk(fd, &statbuf) != 0) {
		(void) printf("failed to stat '%s': %s\n", path,
		    strerror(errno));
		(void) close(fd);
		exit(1);
	}

	/* drop cached data so we read what is actually on the media */
	if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0)
		(void) printf("failed to invalidate cache '%s' : %s\n", path,
		    strerror(errno));

	avl_create(&config_tree, cksum_record_compare,
	    sizeof (cksum_record_t), offsetof(cksum_record_t, link));
	avl_create(&uberblock_tree, cksum_record_compare,
	    sizeof (cksum_record_t), offsetof(cksum_record_t, link));

	psize = statbuf.st_size;
	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
	ashift = SPA_MINBLOCKSHIFT;

	/*
	 * 1. Read the label from disk
	 * 2. Verify label cksum
	 * 3. Unpack the configuration and insert in config tree.
	 * 4. Traverse all uberblocks and insert in uberblock tree.
	 */
	for (int l = 0; l < VDEV_LABELS; l++) {
		zdb_label_t *label = &labels[l];
		char *buf = label->label.vl_vdev_phys.vp_nvlist;
		size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);
		nvlist_t *config;
		cksum_record_t *rec;
		zio_cksum_t cksum;
		vdev_t vd;

		label->label_offset = vdev_label_offset(psize, l, 0);

		if (pread64(fd, &label->label, sizeof (label->label),
		    label->label_offset) != sizeof (label->label)) {
			if (!dump_opt['q'])
				(void) printf("failed to read label %d\n", l);
			label->read_failed = B_TRUE;
			error = B_TRUE;
			continue;
		}

		label->read_failed = B_FALSE;
		label->cksum_valid = label_cksum_valid(&label->label,
		    label->label_offset);

		if (nvlist_unpack(buf, buflen, &config, 0) == 0) {
			nvlist_t *vdev_tree = NULL;
			size_t size;

			/* pick up the real ashift from the config, if any */
			if ((nvlist_lookup_nvlist(config,
			    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
			    (nvlist_lookup_uint64(vdev_tree,
			    ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
				ashift = SPA_MINBLOCKSHIFT;

			if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0)
				size = buflen;

			/* If the device is a cache device clear the header. */
			if (!read_l2arc_header) {
				if (nvlist_lookup_uint64(config,
				    ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 &&
				    l2cache == POOL_STATE_L2CACHE) {
					read_l2arc_header = B_TRUE;
				}
			}

			fletcher_4_native_varsize(buf, size, &cksum);
			rec = cksum_record_insert(&config_tree, &cksum, l);

			label->config = rec;
			label->config_nv = config;
			config_found = B_TRUE;
		} else {
			error = B_TRUE;
		}

		/* fake vdev with just what VDEV_UBERBLOCK_* macros consult */
		vd.vdev_ashift = ashift;
		vd.vdev_top = &vd;

		for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {
			uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);
			uberblock_t *ub = (void *)((char *)label + uoff);

			if (uberblock_verify(ub))
				continue;

			fletcher_4_native_varsize(ub, sizeof (*ub), &cksum);
			rec = cksum_record_insert(&uberblock_tree, &cksum, l);

			label->uberblocks[i] = rec;
		}
	}

	/*
	 * Dump the label and uberblocks.
	 */
	for (int l = 0; l < VDEV_LABELS; l++) {
		zdb_label_t *label = &labels[l];
		size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);

		if (label->read_failed == B_TRUE)
			continue;

		if (label->config_nv) {
			dump_config_from_label(label, buflen, l);
		} else {
			if (!dump_opt['q'])
				(void) printf("failed to unpack label %d\n", l);
		}

		if (dump_opt['u'])
			dump_label_uberblocks(label, ashift, l);

		nvlist_free(label->config_nv);
	}

	/*
	 * Dump the L2ARC header, if existent.
	 */
	if (read_l2arc_header)
		error |= dump_l2arc_header(fd);

	cookie = NULL;
	while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL)
		umem_free(node, sizeof (cksum_record_t));

	cookie = NULL;
	while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL)
		umem_free(node, sizeof (cksum_record_t));

	avl_destroy(&config_tree);
	avl_destroy(&uberblock_tree);

	(void) close(fd);

	return (config_found == B_FALSE ? 2 :
	    (error == B_TRUE ? 1 : 0));
}
5262
/* Per-feature refcounts accumulated across datasets by dump_one_objset(). */
static uint64_t dataset_feature_count[SPA_FEATURES];
/* Pool-wide feature refcounts (redaction/written bookmarks, livelists). */
static uint64_t global_feature_count[SPA_FEATURES];
/* Number of datasets that still have a remap deadlist. */
static uint64_t remap_deadlist_count = 0;
5266
/*
 * dmu_objset_find() callback: open the named dataset, tally the
 * per-dataset and pool-wide feature refcounts it contributes, dump its
 * contents, and release it.  Always returns 0 so iteration continues
 * even if the dataset cannot be opened.
 */
static int
dump_one_objset(const char *dsname, void *arg)
{
	(void) arg;
	int error;
	objset_t *os;
	spa_feature_t f;

	error = open_objset(dsname, FTAG, &os);
	if (error != 0)
		return (0);

	/* Count every per-dataset feature that is active on this dataset. */
	for (f = 0; f < SPA_FEATURES; f++) {
		if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f))
			continue;
		ASSERT(spa_feature_table[f].fi_flags &
		    ZFEATURE_FLAG_PER_DATASET);
		dataset_feature_count[f]++;
	}

	if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) {
		remap_deadlist_count++;
	}

	/*
	 * Walk the dataset's bookmarks, marking redaction objects as
	 * referenced in the MOS and counting bookmark-related features.
	 */
	for (dsl_bookmark_node_t *dbn =
	    avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL;
	    dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) {
		mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj);
		if (dbn->dbn_phys.zbm_redaction_obj != 0)
			global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS]++;
		if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)
			global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++;
	}

	/* Livelists exist only on non-snapshot (clone) datasets. */
	if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) &&
	    !dmu_objset_is_snapshot(os)) {
		global_feature_count[SPA_FEATURE_LIVELIST]++;
	}

	dump_objset(os);
	close_objset(os, FTAG);
	/* Drop any FUID table state built up while dumping this objset. */
	fuid_table_destroy();
	return (0);
}
5311
/*
 * Block statistics.
 */
/* One bucket per SPA_MINBLOCKSIZE multiple, plus a final "other" bucket. */
#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
typedef struct zdb_blkstats {
	uint64_t zb_asize;		/* total allocated bytes */
	uint64_t zb_lsize;		/* total logical bytes */
	uint64_t zb_psize;		/* total physical (on-disk) bytes */
	uint64_t zb_count;		/* number of block pointers */
	uint64_t zb_gangs;		/* number of gang headers seen */
	uint64_t zb_ditto_samevdev;	/* ditto copies sharing one vdev */
	uint64_t zb_ditto_same_ms;	/* ditto copies sharing a metaslab */
	uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
} zdb_blkstats_t;

/*
 * Extended object types to report deferred frees and dedup auto-ditto blocks.
 */
#define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0)
#define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1)
#define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2)
#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3)

/* Display names for the extended object types above, in the same order. */
static const char *zdb_ot_extname[] = {
	"deferred free",
	"dedup ditto",
	"other",
	"Total",
};
5341
/* Pseudo indirection level used to accumulate totals across all levels. */
#define ZB_TOTAL DN_MAX_LEVELS
/* Number of power-of-two size buckets up to SPA_MAXBLOCKSIZE (16M). */
#define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1)

/*
 * Accumulator passed through the block traversal (zdb -b / -c):
 * per-level and per-type stats plus pool-wide size histograms.
 */
typedef struct zdb_cb {
	zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
	uint64_t zcb_removing_size;	/* space on removing vdevs */
	uint64_t zcb_checkpoint_size;	/* space held by the checkpoint */
	uint64_t zcb_dedup_asize;	/* space saved by dedup refs */
	uint64_t zcb_dedup_blocks;	/* blocks with refcnt > 1 */
	uint64_t zcb_psize_count[SPA_MAX_FOR_16M];
	uint64_t zcb_lsize_count[SPA_MAX_FOR_16M];
	uint64_t zcb_asize_count[SPA_MAX_FOR_16M];
	uint64_t zcb_psize_len[SPA_MAX_FOR_16M];
	uint64_t zcb_lsize_len[SPA_MAX_FOR_16M];
	uint64_t zcb_asize_len[SPA_MAX_FOR_16M];
	uint64_t zcb_psize_total;
	uint64_t zcb_lsize_total;
	uint64_t zcb_asize_total;
	uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
	uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
	    [BPE_PAYLOAD_SIZE + 1];
	uint64_t zcb_start;		/* traversal start (gethrtime) */
	hrtime_t zcb_lastprint;		/* last progress-line timestamp */
	uint64_t zcb_totalasize;	/* pool alloc size, for ETA calc */
	uint64_t zcb_errors[256];	/* read-error counts indexed by errno */
	int zcb_readfails;
	int zcb_haderrors;
	spa_t *zcb_spa;
	uint32_t **zcb_vd_obsolete_counts;	/* per-vdev obsolete counts */
} zdb_cb_t;
5372
5373 /* test if two DVA offsets from same vdev are within the same metaslab */
5374 static boolean_t
5375 same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2)
5376 {
5377 vdev_t *vd = vdev_lookup_top(spa, vdev);
5378 uint64_t ms_shift = vd->vdev_ms_shift;
5379
5380 return ((off1 >> ms_shift) == (off2 >> ms_shift));
5381 }
5382
/*
 * Used to simplify reporting of the histogram data.
 */
typedef struct one_histo {
	const char *name;	/* column-group label: psize/lsize/asize */
	uint64_t *count;	/* per-bucket block counts */
	uint64_t *len;		/* per-bucket byte totals */
	uint64_t cumulative;	/* running byte total while printing */
} one_histo_t;

/*
 * The number of separate histograms processed for psize, lsize and asize.
 */
#define NUM_HISTO 3
5397
/*
 * This routine will create a fixed column size output of three different
 * histograms showing by blocksize of 512 - 2^ SPA_MAX_FOR_16M
 * the count, length and cumulative length of the psize, lsize and
 * asize blocks.
 *
 * All three types of blocks are listed on a single line
 *
 * By default the table is printed in nicenumber format (e.g. 123K) but
 * if the '-P' parameter is specified then the full raw number (parseable)
 * is printed out.
 */
static void
dump_size_histograms(zdb_cb_t *zcb)
{
	/*
	 * A temporary buffer that allows us to convert a number into
	 * a string using zdb_nicenumber to allow either raw or human
	 * readable numbers to be output.
	 */
	char numbuf[32];

	/*
	 * Define titles which are used in the headers of the tables
	 * printed by this routine.
	 */
	const char blocksize_title1[] = "block";
	const char blocksize_title2[] = "size";
	const char count_title[] = "Count";
	const char length_title[] = "Size";
	const char cumulative_title[] = "Cum.";

	/*
	 * Setup the histogram arrays (psize, lsize, and asize).
	 */
	one_histo_t parm_histo[NUM_HISTO];

	parm_histo[0].name = "psize";
	parm_histo[0].count = zcb->zcb_psize_count;
	parm_histo[0].len = zcb->zcb_psize_len;
	parm_histo[0].cumulative = 0;

	parm_histo[1].name = "lsize";
	parm_histo[1].count = zcb->zcb_lsize_count;
	parm_histo[1].len = zcb->zcb_lsize_len;
	parm_histo[1].cumulative = 0;

	parm_histo[2].name = "asize";
	parm_histo[2].count = zcb->zcb_asize_count;
	parm_histo[2].len = zcb->zcb_asize_len;
	parm_histo[2].cumulative = 0;


	(void) printf("\nBlock Size Histogram\n");
	/*
	 * Print the first line titles
	 */
	if (dump_opt['P'])
		(void) printf("\n%s\t", blocksize_title1);
	else
		(void) printf("\n%7s ", blocksize_title1);

	/* One group header (psize/lsize/asize) per histogram. */
	for (int j = 0; j < NUM_HISTO; j++) {
		if (dump_opt['P']) {
			if (j < NUM_HISTO - 1) {
				(void) printf("%s\t\t\t", parm_histo[j].name);
			} else {
				/* Don't print trailing spaces */
				(void) printf("  %s", parm_histo[j].name);
			}
		} else {
			if (j < NUM_HISTO - 1) {
				/* Left aligned strings in the output */
				(void) printf("%-7s              ",
				    parm_histo[j].name);
			} else {
				/* Don't print trailing spaces */
				(void) printf("%s", parm_histo[j].name);
			}
		}
	}
	(void) printf("\n");

	/*
	 * Print the second line titles
	 */
	if (dump_opt['P']) {
		(void) printf("%s\t", blocksize_title2);
	} else {
		(void) printf("%7s ", blocksize_title2);
	}

	/* Count/Size/Cum. sub-headers, repeated for each histogram. */
	for (int i = 0; i < NUM_HISTO; i++) {
		if (dump_opt['P']) {
			(void) printf("%s\t%s\t%s\t",
			    count_title, length_title, cumulative_title);
		} else {
			(void) printf("%7s%7s%7s",
			    count_title, length_title, cumulative_title);
		}
	}
	(void) printf("\n");

	/*
	 * Print the rows
	 */
	for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) {

		/*
		 * Print the first column showing the blocksize
		 */
		zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf));

		if (dump_opt['P']) {
			printf("%s", numbuf);
		} else {
			printf("%7s:", numbuf);
		}

		/*
		 * Print the remaining set of 3 columns per size:
		 * for psize, lsize and asize
		 */
		for (int j = 0; j < NUM_HISTO; j++) {
			parm_histo[j].cumulative += parm_histo[j].len[i];

			zdb_nicenum(parm_histo[j].count[i],
			    numbuf, sizeof (numbuf));
			if (dump_opt['P'])
				(void) printf("\t%s", numbuf);
			else
				(void) printf("%7s", numbuf);

			zdb_nicenum(parm_histo[j].len[i],
			    numbuf, sizeof (numbuf));
			if (dump_opt['P'])
				(void) printf("\t%s", numbuf);
			else
				(void) printf("%7s", numbuf);

			zdb_nicenum(parm_histo[j].cumulative,
			    numbuf, sizeof (numbuf));
			if (dump_opt['P'])
				(void) printf("\t%s", numbuf);
			else
				(void) printf("%7s", numbuf);
		}
		(void) printf("\n");
	}
}
5548
/*
 * Account one block pointer in the traversal statistics and, unless in
 * -L (no leak detection) mode, claim it so that leaked space can be
 * detected later.  'type' may be a real DMU object type or one of the
 * ZDB_OT_* pseudo-types.  ZIL blocks are deduplicated via the per-zilog
 * bp tree so replayed records are only counted once.
 */
static void
zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
    dmu_object_type_t type)
{
	uint64_t refcnt = 0;
	int i;

	ASSERT(type < ZDB_OT_TOTAL);

	/* Skip ZIL blocks we have already seen for this zilog. */
	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
		return;

	spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);

	/*
	 * Update four stat cells: (level, type), (level, TOTAL),
	 * (TOTAL, type) and (TOTAL, TOTAL), selected by the two low
	 * bits of 'i'.
	 */
	for (i = 0; i < 4; i++) {
		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
		int t = (i & 1) ? type : ZDB_OT_TOTAL;
		int equal;
		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];

		zb->zb_asize += BP_GET_ASIZE(bp);
		zb->zb_lsize += BP_GET_LSIZE(bp);
		zb->zb_psize += BP_GET_PSIZE(bp);
		zb->zb_count++;

		/*
		 * The histogram is only big enough to record blocks up to
		 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
		 * "other", bucket.
		 */
		unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
		idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
		zb->zb_psize_histogram[idx]++;

		zb->zb_gangs += BP_COUNT_GANG(bp);

		/* Track ditto copies that share a vdev or a metaslab. */
		switch (BP_GET_NDVAS(bp)) {
		case 2:
			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[1])) {
				zb->zb_ditto_samevdev++;

				if (same_metaslab(zcb->zcb_spa,
				    DVA_GET_VDEV(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[1])))
					zb->zb_ditto_same_ms++;
			}
			break;
		case 3:
			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[1])) +
			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[2])) +
			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
			    DVA_GET_VDEV(&bp->blk_dva[2]));
			if (equal != 0) {
				zb->zb_ditto_samevdev++;

				if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
				    DVA_GET_VDEV(&bp->blk_dva[1]) &&
				    same_metaslab(zcb->zcb_spa,
				    DVA_GET_VDEV(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[1])))
					zb->zb_ditto_same_ms++;
				else if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
				    DVA_GET_VDEV(&bp->blk_dva[2]) &&
				    same_metaslab(zcb->zcb_spa,
				    DVA_GET_VDEV(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[2])))
					zb->zb_ditto_same_ms++;
				else if (DVA_GET_VDEV(&bp->blk_dva[1]) ==
				    DVA_GET_VDEV(&bp->blk_dva[2]) &&
				    same_metaslab(zcb->zcb_spa,
				    DVA_GET_VDEV(&bp->blk_dva[1]),
				    DVA_GET_OFFSET(&bp->blk_dva[1]),
				    DVA_GET_OFFSET(&bp->blk_dva[2])))
					zb->zb_ditto_same_ms++;
			}
			break;
		}
	}

	spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG);

	/* Embedded BPs occupy no on-disk space and cannot be claimed. */
	if (BP_IS_EMBEDDED(bp)) {
		zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
		zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
		    [BPE_GET_PSIZE(bp)]++;
		return;
	}
	/*
	 * The binning histogram bins by powers of two up to
	 * SPA_MAXBLOCKSIZE rather than creating bins for
	 * every possible blocksize found in the pool.
	 */
	int bin = highbit64(BP_GET_PSIZE(bp)) - 1;

	zcb->zcb_psize_count[bin]++;
	zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp);
	zcb->zcb_psize_total += BP_GET_PSIZE(bp);

	bin = highbit64(BP_GET_LSIZE(bp)) - 1;

	zcb->zcb_lsize_count[bin]++;
	zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp);
	zcb->zcb_lsize_total += BP_GET_LSIZE(bp);

	bin = highbit64(BP_GET_ASIZE(bp)) - 1;

	zcb->zcb_asize_count[bin]++;
	zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
	zcb->zcb_asize_total += BP_GET_ASIZE(bp);

	/* In -L mode we only gather stats; no claiming or DDT checking. */
	if (dump_opt['L'])
		return;

	/*
	 * For dedup blocks, decrement the in-core DDT refcount so that
	 * only the final reference actually claims the block below.
	 */
	if (BP_GET_DEDUP(bp)) {
		ddt_t *ddt;
		ddt_entry_t *dde;

		ddt = ddt_select(zcb->zcb_spa, bp);
		ddt_enter(ddt);
		dde = ddt_lookup(ddt, bp, B_FALSE);

		if (dde == NULL) {
			refcnt = 0;
		} else {
			ddt_phys_t *ddp = ddt_phys_select(dde, bp);
			ddt_phys_decref(ddp);
			refcnt = ddp->ddp_refcnt;
			if (ddt_phys_total_refcnt(dde) == 0)
				ddt_remove(ddt, dde);
		}
		ddt_exit(ddt);
	}

	/* Claim the block; a claim txg of 0 merely verifies allocation. */
	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
	    refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
}
5692
/*
 * zio completion callback for the verification reads issued by
 * zdb_blkptr_cb(): release the inflight-bytes throttle, record any
 * read error (unless the read was speculative, e.g. a ZIL block),
 * and free the data buffer.
 */
static void
zdb_blkptr_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	int ioerr = zio->io_error;
	zdb_cb_t *zcb = zio->io_private;
	zbookmark_phys_t *zb = &zio->io_bookmark;

	mutex_enter(&spa->spa_scrub_lock);
	/* Wake any issuer waiting for the inflight byte count to drop. */
	spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
	cv_broadcast(&spa->spa_scrub_io_cv);

	if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		char blkbuf[BP_SPRINTF_LEN];

		zcb->zcb_haderrors = 1;
		zcb->zcb_errors[ioerr]++;

		if (dump_opt['b'] >= 2)
			snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
		else
			blkbuf[0] = '\0';

		(void) printf("zdb_blkptr_cb: "
		    "Got error %d reading "
		    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
		    ioerr,
		    (u_longlong_t)zb->zb_objset,
		    (u_longlong_t)zb->zb_object,
		    (u_longlong_t)zb->zb_level,
		    (u_longlong_t)zb->zb_blkid,
		    blkbuf);
	}
	mutex_exit(&spa->spa_scrub_lock);

	abd_free(zio->io_abd);
}
5731
/*
 * traverse_pool() callback: account each block pointer via
 * zdb_count_block() and, when -c is in effect, issue a throttled
 * checksum-verification read.  Also emits a periodic progress line
 * with an ETA.  Always returns 0 to continue traversal.
 */
static int
zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	zdb_cb_t *zcb = arg;
	dmu_object_type_t type;
	boolean_t is_metadata;

	/* Dnodes are visited separately; only count real block pointers. */
	if (zb->zb_level == ZB_DNODE_LEVEL)
		return (0);

	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
		char blkbuf[BP_SPRINTF_LEN];
		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
		(void) printf("objset %llu object %llu "
		    "level %lld offset 0x%llx %s\n",
		    (u_longlong_t)zb->zb_objset,
		    (u_longlong_t)zb->zb_object,
		    (longlong_t)zb->zb_level,
		    (u_longlong_t)blkid2offset(dnp, bp, zb),
		    blkbuf);
	}

	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
		return (0);

	type = BP_GET_TYPE(bp);

	/* Unknown/new object types are lumped into the "other" bucket. */
	zdb_count_block(zcb, zilog, bp,
	    (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);

	is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));

	/* -c checks metadata only; -cc checks data blocks as well. */
	if (!BP_IS_EMBEDDED(bp) &&
	    (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
		size_t size = BP_GET_PSIZE(bp);
		abd_t *abd = abd_alloc(size, B_FALSE);
		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;

		/* If it's an intent log block, failure is expected. */
		if (zb->zb_level == ZB_ZIL_LEVEL)
			flags |= ZIO_FLAG_SPECULATIVE;

		/* Throttle: bound the bytes of reads in flight. */
		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_load_verify_bytes > max_inflight_bytes)
			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
		spa->spa_load_verify_bytes += size;
		mutex_exit(&spa->spa_scrub_lock);

		zio_nowait(zio_read(NULL, spa, bp, abd, size,
		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
	}

	zcb->zcb_readfails = 0;

	/* only call gethrtime() every 100 blocks */
	static int iters;
	if (++iters > 100)
		iters = 0;
	else
		return (0);

	/* At most one progress line per second, suppressed at -bbbbb. */
	if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
		uint64_t now = gethrtime();
		char buf[10];
		uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
		uint64_t kb_per_sec =
		    1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
		uint64_t sec_remaining =
		    (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;

		/* make sure nicenum has enough space */
		_Static_assert(sizeof (buf) >= NN_NUMBUF_SZ, "buf truncated");

		zfs_nicebytes(bytes, buf, sizeof (buf));
		(void) fprintf(stderr,
		    "\r%5s completed (%4"PRIu64"MB/s) "
		    "estimated time remaining: "
		    "%"PRIu64"hr %02"PRIu64"min %02"PRIu64"sec        ",
		    buf, kb_per_sec / 1024,
		    sec_remaining / 60 / 60,
		    sec_remaining / 60 % 60,
		    sec_remaining % 60);

		zcb->zcb_lastprint = now;
	}

	return (0);
}
5821
5822 static void
5823 zdb_leak(void *arg, uint64_t start, uint64_t size)
5824 {
5825 vdev_t *vd = arg;
5826
5827 (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
5828 (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
5829 }
5830
/* Metaslab ops with no allocator: zdb only claims/frees, never allocates. */
static metaslab_ops_t zdb_metaslab_ops = {
	NULL /* alloc */
};
5834
5835 static int
5836 load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme,
5837 uint64_t txg, void *arg)
5838 {
5839 spa_vdev_removal_t *svr = arg;
5840
5841 uint64_t offset = sme->sme_offset;
5842 uint64_t size = sme->sme_run;
5843
5844 /* skip vdevs we don't care about */
5845 if (sme->sme_vdev != svr->svr_vdev_id)
5846 return (0);
5847
5848 vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev);
5849 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5850 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
5851
5852 if (txg < metaslab_unflushed_txg(ms))
5853 return (0);
5854
5855 if (sme->sme_type == SM_ALLOC)
5856 range_tree_add(svr->svr_allocd_segs, offset, size);
5857 else
5858 range_tree_remove(svr->svr_allocd_segs, offset, size);
5859
5860 return (0);
5861 }
5862
/*
 * Claim one concrete destination range produced by remapping a segment
 * of a removed vdev (invoked via vdev_op_remap from claim_segment_cb).
 */
static void
claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	(void) inner_offset, (void) arg;

	/*
	 * This callback was called through a remap from
	 * a device being removed. Therefore, the vdev that
	 * this callback is applied to is a concrete
	 * vdev.
	 */
	ASSERT(vdev_is_concrete(vd));

	VERIFY0(metaslab_claim_impl(vd, offset, size,
	    spa_min_claim_txg(vd->vdev_spa)));
}
5880
5881 static void
5882 claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
5883 {
5884 vdev_t *vd = arg;
5885
5886 vdev_indirect_ops.vdev_op_remap(vd, offset, size,
5887 claim_segment_impl_cb, NULL);
5888 }
5889
/*
 * After accounting for all allocated blocks that are directly referenced,
 * we might have missed a reference to a block from a partially complete
 * (and thus unused) indirect mapping object. We perform a secondary pass
 * through the metaslabs we have already mapped and claim the destination
 * blocks.
 */
static void
zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
{
	/* No claiming in -L mode; leak detection is disabled. */
	if (dump_opt['L'])
		return;

	/* Nothing to do unless a device removal is in progress. */
	if (spa->spa_vdev_removal == NULL)
		return;

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;

	ASSERT0(range_tree_space(svr->svr_allocd_segs));

	/*
	 * Rebuild the removing vdev's allocated-segment tree from its
	 * on-disk metaslab space maps, one metaslab at a time.
	 */
	range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
	for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
		metaslab_t *msp = vd->vdev_ms[msi];

		ASSERT0(range_tree_space(allocs));
		if (msp->ms_sm != NULL)
			VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC));
		range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs);
	}
	range_tree_destroy(allocs);

	/* Fold in entries that are still only in the spacemap logs. */
	iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr);

	/*
	 * Clear everything past what has been synced,
	 * because we have not allocated mappings for
	 * it yet.
	 */
	range_tree_clear(svr->svr_allocd_segs,
	    vdev_indirect_mapping_max_offset(vim),
	    vd->vdev_asize - vdev_indirect_mapping_max_offset(vim));

	/* Claim the mapped destinations of everything that remains. */
	zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs);
	range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);

	spa_config_exit(spa, SCL_CONFIG, FTAG);
}
5941
5942 static int
5943 increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
5944 dmu_tx_t *tx)
5945 {
5946 (void) tx;
5947 zdb_cb_t *zcb = arg;
5948 spa_t *spa = zcb->zcb_spa;
5949 vdev_t *vd;
5950 const dva_t *dva = &bp->blk_dva[0];
5951
5952 ASSERT(!bp_freed);
5953 ASSERT(!dump_opt['L']);
5954 ASSERT3U(BP_GET_NDVAS(bp), ==, 1);
5955
5956 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
5957 vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva));
5958 ASSERT3P(vd, !=, NULL);
5959 spa_config_exit(spa, SCL_VDEV, FTAG);
5960
5961 ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
5962 ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);
5963
5964 vdev_indirect_mapping_increment_obsolete_count(
5965 vd->vdev_indirect_mapping,
5966 DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),
5967 zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
5968
5969 return (0);
5970 }
5971
/*
 * Build the per-mapping-entry obsolete counts for an indirect vdev by
 * combining the precomputed counts, the vdev's obsolete space map, and
 * (if a condense was interrupted on this vdev) the previous obsolete
 * space map.  Caller is responsible for freeing the returned array.
 */
static uint32_t *
zdb_load_obsolete_counts(vdev_t *vd)
{
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	spa_t *spa = vd->vdev_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	uint64_t obsolete_sm_object;
	uint32_t *counts;

	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL);
	counts = vdev_indirect_mapping_load_obsolete_counts(vim);
	if (vd->vdev_obsolete_sm != NULL) {
		vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
		    vd->vdev_obsolete_sm);
	}
	/* Include the pre-condense space map if condensing was in flight. */
	if (scip->scip_vdev == vd->vdev_id &&
	    scip->scip_prev_obsolete_sm_object != 0) {
		space_map_t *prev_obsolete_sm = NULL;
		VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
		    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
		vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
		    prev_obsolete_sm);
		space_map_close(prev_obsolete_sm);
	}
	return (counts);
}
6000
/*
 * Walk the on-disk DDT and pre-load its entries into the in-core DDT so
 * that zdb_count_block() can decrement refcounts as the traversal finds
 * each reference.  Also accumulates dedup space-savings statistics.
 * Stops at the UNIQUE class, whose entries have only one reference.
 */
static void
zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
	ddt_bookmark_t ddb = {0};
	ddt_entry_t dde;
	int error;
	int p;

	ASSERT(!dump_opt['L']);

	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
		blkptr_t blk;
		ddt_phys_t *ddp = dde.dde_phys;

		/* Unique entries are not duplicated; nothing left to do. */
		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
			return;

		ASSERT(ddt_phys_total_refcnt(&dde) > 1);

		for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
			if (ddp->ddp_phys_birth == 0)
				continue;
			ddt_bp_create(ddb.ddb_checksum,
			    &dde.dde_key, ddp, &blk);
			if (p == DDT_PHYS_DITTO) {
				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
			} else {
				/* Space saved by the extra references. */
				zcb->zcb_dedup_asize +=
				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
				zcb->zcb_dedup_blocks++;
			}
		}
		/* Instantiate the entry in the in-core DDT for later decref. */
		ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
		ddt_enter(ddt);
		VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
		ddt_exit(ddt);
	}

	ASSERT(error == ENOENT);
}
6041
/* Context passed to checkpoint_sm_exclude_entry_cb(). */
typedef struct checkpoint_sm_exclude_entry_arg {
	vdev_t *cseea_vd;		/* top-level vdev being processed */
	uint64_t cseea_checkpoint_size;	/* bytes held by the checkpoint */
} checkpoint_sm_exclude_entry_arg_t;
6046
/*
 * Space-map iteration callback: remove one checkpoint space-map entry
 * (always a FREE) from the owning metaslab's allocatable tree, so that
 * checkpointed-but-freed space is not later reported as leaked, and
 * add its length to the running checkpoint size.
 */
static int
checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
{
	checkpoint_sm_exclude_entry_arg_t *cseea = arg;
	vdev_t *vd = cseea->cseea_vd;
	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
	uint64_t end = sme->sme_offset + sme->sme_run;

	ASSERT(sme->sme_type == SM_FREE);

	/*
	 * Since the vdev_checkpoint_sm exists in the vdev level
	 * and the ms_sm space maps exist in the metaslab level,
	 * an entry in the checkpoint space map could theoretically
	 * cross the boundaries of the metaslab that it belongs.
	 *
	 * In reality, because of the way that we populate and
	 * manipulate the checkpoint's space maps currently,
	 * there shouldn't be any entries that cross metaslabs.
	 * Hence the assertion below.
	 *
	 * That said, there is no fundamental requirement that
	 * the checkpoint's space map entries should not cross
	 * metaslab boundaries. So if needed we could add code
	 * that handles metaslab-crossing segments in the future.
	 */
	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);

	/*
	 * By removing the entry from the allocated segments we
	 * also verify that the entry is there to begin with.
	 */
	mutex_enter(&ms->ms_lock);
	range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
	mutex_exit(&ms->ms_lock);

	cseea->cseea_checkpoint_size += sme->sme_run;
	return (0);
}
6087
/*
 * If this top-level vdev has a checkpoint space map, iterate it and
 * exclude every checkpointed range from leak detection, accumulating
 * the total checkpointed size into zcb->zcb_checkpoint_size.
 */
static void
zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
{
	spa_t *spa = vd->vdev_spa;
	space_map_t *checkpoint_sm = NULL;
	uint64_t checkpoint_sm_obj;

	/*
	 * If there is no vdev_top_zap, we are in a pool whose
	 * version predates the pool checkpoint feature.
	 */
	if (vd->vdev_top_zap == 0)
		return;

	/*
	 * If there is no reference of the vdev_checkpoint_sm in
	 * the vdev_top_zap, then one of the following scenarios
	 * is true:
	 *
	 * 1] There is no checkpoint
	 * 2] There is a checkpoint, but no checkpointed blocks
	 *    have been freed yet
	 * 3] The current vdev is indirect
	 *
	 * In these cases we return immediately.
	 */
	if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
		return;

	VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
	    &checkpoint_sm_obj));

	checkpoint_sm_exclude_entry_arg_t cseea;
	cseea.cseea_vd = vd;
	cseea.cseea_checkpoint_size = 0;

	VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
	    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));

	VERIFY0(space_map_iterate(checkpoint_sm,
	    space_map_length(checkpoint_sm),
	    checkpoint_sm_exclude_entry_cb, &cseea));
	space_map_close(checkpoint_sm);

	zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
}
6136
6137 static void
6138 zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
6139 {
6140 ASSERT(!dump_opt['L']);
6141
6142 vdev_t *rvd = spa->spa_root_vdev;
6143 for (uint64_t c = 0; c < rvd->vdev_children; c++) {
6144 ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
6145 zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
6146 }
6147 }
6148
6149 static int
6150 count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme,
6151 uint64_t txg, void *arg)
6152 {
6153 int64_t *ualloc_space = arg;
6154
6155 uint64_t offset = sme->sme_offset;
6156 uint64_t vdev_id = sme->sme_vdev;
6157
6158 vdev_t *vd = vdev_lookup_top(spa, vdev_id);
6159 if (!vdev_is_concrete(vd))
6160 return (0);
6161
6162 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
6163 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
6164
6165 if (txg < metaslab_unflushed_txg(ms))
6166 return (0);
6167
6168 if (sme->sme_type == SM_ALLOC)
6169 *ualloc_space += sme->sme_run;
6170 else
6171 *ualloc_space -= sme->sme_run;
6172
6173 return (0);
6174 }
6175
6176 static int64_t
6177 get_unflushed_alloc_space(spa_t *spa)
6178 {
6179 if (dump_opt['L'])
6180 return (0);
6181
6182 int64_t ualloc_space = 0;
6183 iterate_through_spacemap_logs(spa, count_unflushed_space_cb,
6184 &ualloc_space);
6185 return (ualloc_space);
6186 }
6187
6188 static int
6189 load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg)
6190 {
6191 maptype_t *uic_maptype = arg;
6192
6193 uint64_t offset = sme->sme_offset;
6194 uint64_t size = sme->sme_run;
6195 uint64_t vdev_id = sme->sme_vdev;
6196
6197 vdev_t *vd = vdev_lookup_top(spa, vdev_id);
6198
6199 /* skip indirect vdevs */
6200 if (!vdev_is_concrete(vd))
6201 return (0);
6202
6203 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
6204
6205 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
6206 ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE);
6207
6208 if (txg < metaslab_unflushed_txg(ms))
6209 return (0);
6210
6211 if (*uic_maptype == sme->sme_type)
6212 range_tree_add(ms->ms_allocatable, offset, size);
6213 else
6214 range_tree_remove(ms->ms_allocatable, offset, size);
6215
6216 return (0);
6217 }
6218
6219 static void
6220 load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype)
6221 {
6222 iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype);
6223 }
6224
/*
 * Populate the ms_allocatable range tree of every metaslab on every
 * concrete top-level vdev from the on-disk space maps (interpreted per
 * 'maptype'), then overlay the unflushed spacemap-log entries.
 */
static void
load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
{
	vdev_t *rvd = spa->spa_root_vdev;
	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
		vdev_t *vd = rvd->vdev_child[i];

		ASSERT3U(i, ==, vd->vdev_id);

		/* Indirect vdevs are handled separately. */
		if (vd->vdev_ops == &vdev_indirect_ops)
			continue;

		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];

			(void) fprintf(stderr,
			    "\rloading concrete vdev %llu, "
			    "metaslab %llu of %llu ...",
			    (longlong_t)vd->vdev_id,
			    (longlong_t)msp->ms_id,
			    (longlong_t)vd->vdev_ms_count);

			mutex_enter(&msp->ms_lock);
			range_tree_vacate(msp->ms_allocatable, NULL, NULL);

			/*
			 * We don't want to spend the CPU manipulating the
			 * size-ordered tree, so clear the range_tree ops.
			 */
			msp->ms_allocatable->rt_ops = NULL;

			if (msp->ms_sm != NULL) {
				VERIFY0(space_map_load(msp->ms_sm,
				    msp->ms_allocatable, maptype));
			}
			if (!msp->ms_loaded)
				msp->ms_loaded = B_TRUE;
			mutex_exit(&msp->ms_lock);
		}
	}

	load_unflushed_to_ms_allocatables(spa, maptype);
}
6268
/*
 * vm_idxp is an in-out parameter which (for indirect vdevs) is the
 * index in vim_entries that has the first entry in this metaslab.
 * On return, it will be set to the first entry after this metaslab.
 */
static void
load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
    uint64_t *vim_idxp)
{
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;

	mutex_enter(&msp->ms_lock);
	range_tree_vacate(msp->ms_allocatable, NULL, NULL);

	/*
	 * We don't want to spend the CPU manipulating the
	 * size-ordered tree, so clear the range_tree ops.
	 */
	msp->ms_allocatable->rt_ops = NULL;

	/* Add each mapping entry that falls inside this metaslab. */
	for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
	    (*vim_idxp)++) {
		vdev_indirect_mapping_entry_phys_t *vimep =
		    &vim->vim_entries[*vim_idxp];
		uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
		uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
		ASSERT3U(ent_offset, >=, msp->ms_start);
		if (ent_offset >= msp->ms_start + msp->ms_size)
			break;

		/*
		 * Mappings do not cross metaslab boundaries,
		 * because we create them by walking the metaslabs.
		 */
		ASSERT3U(ent_offset + ent_len, <=,
		    msp->ms_start + msp->ms_size);
		range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
	}

	if (!msp->ms_loaded)
		msp->ms_loaded = B_TRUE;
	mutex_exit(&msp->ms_lock);
}
6312
/*
 * Prepare each indirect vdev for leak detection: load its obsolete
 * counts into zcb->zcb_vd_obsolete_counts, create metaslabs for it
 * (so zio_claim() can operate), and populate their ms_allocatable
 * trees from the indirect mapping.
 */
static void
zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
{
	ASSERT(!dump_opt['L']);

	vdev_t *rvd = spa->spa_root_vdev;
	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		ASSERT3U(c, ==, vd->vdev_id);

		if (vd->vdev_ops != &vdev_indirect_ops)
			continue;

		/*
		 * Note: we don't check for mapping leaks on
		 * removing vdevs because their ms_allocatable's
		 * are used to look for leaks in allocated space.
		 */
		zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);

		/*
		 * Normally, indirect vdevs don't have any
		 * metaslabs. We want to set them up for
		 * zio_claim().
		 */
		vdev_metaslab_group_create(vd);
		VERIFY0(vdev_metaslab_init(vd, 0));

		vdev_indirect_mapping_t *vim __maybe_unused =
		    vd->vdev_indirect_mapping;
		/* Running index into vim_entries, shared across metaslabs. */
		uint64_t vim_idx = 0;
		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {

			(void) fprintf(stderr,
			    "\rloading indirect vdev %llu, "
			    "metaslab %llu of %llu ...",
			    (longlong_t)vd->vdev_id,
			    (longlong_t)vd->vdev_ms[m]->ms_id,
			    (longlong_t)vd->vdev_ms_count);

			load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
			    &vim_idx);
		}
		/* Every mapping entry must have landed in some metaslab. */
		ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
	}
}
6360
/*
 * Prepare the pool for leak detection: reload every metaslab's
 * ms_allocatable tree with allocated (SM_ALLOC) segments so traversal
 * can claim blocks out of it, and exclude checkpointed/obsolete space
 * that would otherwise appear as false leaks.  No-op when leak
 * detection is disabled (-L).
 */
static void
zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
	zcb->zcb_spa = spa;

	if (dump_opt['L'])
		return;

	dsl_pool_t *dp = spa->spa_dsl_pool;
	vdev_t *rvd = spa->spa_root_vdev;

	/*
	 * We are going to be changing the meaning of the metaslab's
	 * ms_allocatable. Ensure that the allocator doesn't try to
	 * use the tree.
	 */
	spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
	spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
	spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops;

	/* One obsolete-counts array pointer per top-level vdev. */
	zcb->zcb_vd_obsolete_counts =
	    umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
	    UMEM_NOFAIL);

	/*
	 * For leak detection, we overload the ms_allocatable trees
	 * to contain allocated segments instead of free segments.
	 * As a result, we can't use the normal metaslab_load/unload
	 * interfaces.
	 */
	zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
	load_concrete_ms_allocatable_trees(spa, SM_ALLOC);

	/*
	 * On load_concrete_ms_allocatable_trees() we loaded all the
	 * allocated entries from the ms_sm to the ms_allocatable for
	 * each metaslab. If the pool has a checkpoint or is in the
	 * middle of discarding a checkpoint, some of these blocks
	 * may have been freed but their ms_sm may not have been
	 * updated because they are referenced by the checkpoint. In
	 * order to avoid false-positives during leak-detection, we
	 * go through the vdev's checkpoint space map and exclude all
	 * its entries from their relevant ms_allocatable.
	 *
	 * We also aggregate the space held by the checkpoint and add
	 * it to zcb_checkpoint_size.
	 *
	 * Note that at this point we are also verifying that all the
	 * entries on the checkpoint_sm are marked as allocated in
	 * the ms_sm of their relevant metaslab.
	 * [see comment in checkpoint_sm_exclude_entry_cb()]
	 */
	zdb_leak_init_exclude_checkpoint(spa, zcb);
	ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));

	/* for cleaner progress output */
	(void) fprintf(stderr, "\n");

	/* Count obsolete-bpobj blocks toward the per-vdev mapping counts. */
	if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
		ASSERT(spa_feature_is_enabled(spa,
		    SPA_FEATURE_DEVICE_REMOVAL));
		(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
		    increment_indirect_mapping_cb, zcb, NULL);
	}

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	zdb_ddt_leak_init(spa, zcb);
	spa_config_exit(spa, SCL_CONFIG, FTAG);
}
6430
6431 static boolean_t
6432 zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
6433 {
6434 boolean_t leaks = B_FALSE;
6435 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
6436 uint64_t total_leaked = 0;
6437 boolean_t are_precise = B_FALSE;
6438
6439 ASSERT(vim != NULL);
6440
6441 for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
6442 vdev_indirect_mapping_entry_phys_t *vimep =
6443 &vim->vim_entries[i];
6444 uint64_t obsolete_bytes = 0;
6445 uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
6446 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
6447
6448 /*
6449 * This is not very efficient but it's easy to
6450 * verify correctness.
6451 */
6452 for (uint64_t inner_offset = 0;
6453 inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
6454 inner_offset += 1ULL << vd->vdev_ashift) {
6455 if (range_tree_contains(msp->ms_allocatable,
6456 offset + inner_offset, 1ULL << vd->vdev_ashift)) {
6457 obsolete_bytes += 1ULL << vd->vdev_ashift;
6458 }
6459 }
6460
6461 int64_t bytes_leaked = obsolete_bytes -
6462 zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];
6463 ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,
6464 zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);
6465
6466 VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
6467 if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) {
6468 (void) printf("obsolete indirect mapping count "
6469 "mismatch on %llu:%llx:%llx : %llx bytes leaked\n",
6470 (u_longlong_t)vd->vdev_id,
6471 (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
6472 (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
6473 (u_longlong_t)bytes_leaked);
6474 }
6475 total_leaked += ABS(bytes_leaked);
6476 }
6477
6478 VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
6479 if (!are_precise && total_leaked > 0) {
6480 int pct_leaked = total_leaked * 100 /
6481 vdev_indirect_mapping_bytes_mapped(vim);
6482 (void) printf("cannot verify obsolete indirect mapping "
6483 "counts of vdev %llu because precise feature was not "
6484 "enabled when it was removed: %d%% (%llx bytes) of mapping"
6485 "unreferenced\n",
6486 (u_longlong_t)vd->vdev_id, pct_leaked,
6487 (u_longlong_t)total_leaked);
6488 } else if (total_leaked > 0) {
6489 (void) printf("obsolete indirect mapping count mismatch "
6490 "for vdev %llu -- %llx total bytes mismatched\n",
6491 (u_longlong_t)vd->vdev_id,
6492 (u_longlong_t)total_leaked);
6493 leaks |= B_TRUE;
6494 }
6495
6496 vdev_indirect_mapping_free_obsolete_counts(vim,
6497 zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
6498 zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
6499
6500 return (leaks);
6501 }
6502
/*
 * Finish leak detection: any segment still present in a concrete
 * vdev's (overloaded) ms_allocatable tree was never claimed during
 * traversal and is reported as a leak.  Also cross-checks obsolete
 * counts on indirect vdevs and releases per-vdev state.  Returns
 * B_TRUE if leaks were found; no-op (B_FALSE) under -L.
 */
static boolean_t
zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
{
	if (dump_opt['L'])
		return (B_FALSE);

	boolean_t leaks = B_FALSE;
	vdev_t *rvd = spa->spa_root_vdev;
	for (unsigned c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		/* Only indirect vdevs had obsolete counts loaded. */
		if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
			leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
		}

		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];
			ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class ==
			    spa_embedded_log_class(spa)) ?
			    vd->vdev_log_mg : vd->vdev_mg);

			/*
			 * ms_allocatable has been overloaded
			 * to contain allocated segments. Now that
			 * we finished traversing all blocks, any
			 * block that remains in the ms_allocatable
			 * represents an allocated block that we
			 * did not claim during the traversal.
			 * Claimed blocks would have been removed
			 * from the ms_allocatable. For indirect
			 * vdevs, space remaining in the tree
			 * represents parts of the mapping that are
			 * not referenced, which is not a bug.
			 */
			if (vd->vdev_ops == &vdev_indirect_ops) {
				range_tree_vacate(msp->ms_allocatable,
				    NULL, NULL);
			} else {
				/* zdb_leak() reports each leaked segment. */
				range_tree_vacate(msp->ms_allocatable,
				    zdb_leak, vd);
			}
			if (msp->ms_loaded) {
				msp->ms_loaded = B_FALSE;
			}
		}
	}

	umem_free(zcb->zcb_vd_obsolete_counts,
	    rvd->vdev_children * sizeof (uint32_t *));
	zcb->zcb_vd_obsolete_counts = NULL;

	return (leaks);
}
6556
6557 static int
6558 count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
6559 {
6560 (void) tx;
6561 zdb_cb_t *zcb = arg;
6562
6563 if (dump_opt['b'] >= 5) {
6564 char blkbuf[BP_SPRINTF_LEN];
6565 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
6566 (void) printf("[%s] %s\n",
6567 "deferred free", blkbuf);
6568 }
6569 zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
6570 return (0);
6571 }
6572
6573 /*
6574 * Iterate over livelists which have been destroyed by the user but
6575 * are still present in the MOS, waiting to be freed
6576 */
6577 static void
6578 iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg)
6579 {
6580 objset_t *mos = spa->spa_meta_objset;
6581 uint64_t zap_obj;
6582 int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
6583 DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
6584 if (err == ENOENT)
6585 return;
6586 ASSERT0(err);
6587
6588 zap_cursor_t zc;
6589 zap_attribute_t attr;
6590 dsl_deadlist_t ll;
6591 /* NULL out os prior to dsl_deadlist_open in case it's garbage */
6592 ll.dl_os = NULL;
6593 for (zap_cursor_init(&zc, mos, zap_obj);
6594 zap_cursor_retrieve(&zc, &attr) == 0;
6595 (void) zap_cursor_advance(&zc)) {
6596 dsl_deadlist_open(&ll, mos, attr.za_first_integer);
6597 func(&ll, arg);
6598 dsl_deadlist_close(&ll);
6599 }
6600 zap_cursor_fini(&zc);
6601 }
6602
6603 static int
6604 bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
6605 dmu_tx_t *tx)
6606 {
6607 ASSERT(!bp_freed);
6608 return (count_block_cb(arg, bp, tx));
6609 }
6610
6611 static int
6612 livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle)
6613 {
6614 zdb_cb_t *zbc = args;
6615 bplist_t blks;
6616 bplist_create(&blks);
6617 /* determine which blocks have been alloc'd but not freed */
6618 VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL));
6619 /* count those blocks */
6620 (void) bplist_iterate(&blks, count_block_cb, zbc, NULL);
6621 bplist_destroy(&blks);
6622 return (0);
6623 }
6624
/*
 * iterate_deleted_livelists() callback: tally the blocks of one
 * deleted livelist into the zdb_cb_t passed through arg.
 */
static void
livelist_count_blocks(dsl_deadlist_t *ll, void *arg)
{
	dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg);
}
6630
6631 /*
6632 * Count the blocks in the livelists that have been destroyed by the user
6633 * but haven't yet been freed.
6634 */
static void
deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc)
{
	/* zbc accumulates the per-type block statistics. */
	iterate_deleted_livelists(spa, livelist_count_blocks, zbc);
}
6640
/*
 * iterate_deleted_livelists() callback: dump one deleted livelist,
 * bump the livelist feature count, and lightly verify its entries.
 * arg is unused and asserted NULL.
 */
static void
dump_livelist_cb(dsl_deadlist_t *ll, void *arg)
{
	ASSERT3P(arg, ==, NULL);
	global_feature_count[SPA_FEATURE_LIVELIST]++;
	dump_blkptr_list(ll, "Deleted Livelist");
	dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL);
}
6649
6650 /*
6651 * Print out, register object references to, and increment feature counts for
6652 * livelists that have been destroyed by the user but haven't yet been freed.
6653 */
6654 static void
6655 deleted_livelists_dump_mos(spa_t *spa)
6656 {
6657 uint64_t zap_obj;
6658 objset_t *mos = spa->spa_meta_objset;
6659 int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
6660 DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
6661 if (err == ENOENT)
6662 return;
6663 mos_obj_refd(zap_obj);
6664 iterate_deleted_livelists(spa, dump_livelist_cb, NULL);
6665 }
6666
/*
 * Traverse every block in the pool, accumulating per-type block
 * statistics and, unless -L was given, cross-checking the traversal
 * totals against the space maps to detect leaked blocks.
 *
 * Returns 0 on success, 2 if leaks were found (or the pool contained
 * no blocks), and 3 if traversal reported errors.
 */
static int
dump_block_stats(spa_t *spa)
{
	zdb_cb_t *zcb;
	zdb_blkstats_t *zb, *tzb;
	uint64_t norm_alloc, norm_space, total_alloc, total_found;
	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
	    TRAVERSE_NO_DECRYPT | TRAVERSE_HARD;
	boolean_t leaks = B_FALSE;
	int e, c, err;
	bp_embedded_type_t i;

	zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);

	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
	    (dump_opt['c'] == 1) ? "metadata " : "",
	    dump_opt['c'] ? "checksums " : "",
	    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
	    !dump_opt['L'] ? "nothing leaked " : "");

	/*
	 * When leak detection is enabled we load all space maps as SM_ALLOC
	 * maps, then traverse the pool claiming each block we discover. If
	 * the pool is perfectly consistent, the segment trees will be empty
	 * when we're done. Anything left over is a leak; any block we can't
	 * claim (because it's not part of any space map) is a double
	 * allocation, reference to a freed block, or an unclaimed log block.
	 *
	 * When leak detection is disabled (-L option) we still traverse the
	 * pool claiming each block we discover, but we skip opening any space
	 * maps.
	 */
	zdb_leak_init(spa, zcb);

	/*
	 * If there's a deferred-free bplist, process that first.
	 */
	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
	    bpobj_count_block_cb, zcb, NULL);

	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
		    bpobj_count_block_cb, zcb, NULL);
	}

	/* Account for blocks on a vdev that is in mid-removal. */
	zdb_claim_removing(spa, zcb);

	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
		VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
		    spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
		    zcb, NULL));
	}

	/* Blocks in destroyed-but-not-yet-freed livelists. */
	deleted_livelists_count_blocks(spa, zcb);

	if (dump_opt['c'] > 1)
		flags |= TRAVERSE_PREFETCH_DATA;

	/* Total allocated space across classes, for progress reporting. */
	zcb->zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
	zcb->zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
	zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
	zcb->zcb_totalasize +=
	    metaslab_class_get_alloc(spa_embedded_log_class(spa));
	zcb->zcb_start = zcb->zcb_lastprint = gethrtime();
	err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb);

	/*
	 * If we've traversed the data blocks then we need to wait for those
	 * I/Os to complete. We leverage "The Godfather" zio to wait on
	 * all async I/Os to complete.
	 */
	if (dump_opt['c']) {
		for (c = 0; c < max_ncpus; c++) {
			(void) zio_wait(spa->spa_async_zio_root[c]);
			spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
			    ZIO_FLAG_GODFATHER);
		}
	}
	ASSERT0(spa->spa_load_verify_bytes);

	/*
	 * Done after zio_wait() since zcb_haderrors is modified in
	 * zdb_blkptr_done()
	 */
	zcb->zcb_haderrors |= err;

	if (zcb->zcb_haderrors) {
		(void) printf("\nError counts:\n\n");
		(void) printf("\t%5s %s\n", "errno", "count");
		for (e = 0; e < 256; e++) {
			if (zcb->zcb_errors[e] != 0) {
				(void) printf("\t%5d %llu\n",
				    e, (u_longlong_t)zcb->zcb_errors[e]);
			}
		}
	}

	/*
	 * Report any leaked segments.
	 */
	leaks |= zdb_leak_fini(spa, zcb);

	tzb = &zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];

	norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
	norm_space = metaslab_class_get_space(spa_normal_class(spa));

	/* What the space maps say should be allocated... */
	total_alloc = norm_alloc +
	    metaslab_class_get_alloc(spa_log_class(spa)) +
	    metaslab_class_get_alloc(spa_embedded_log_class(spa)) +
	    metaslab_class_get_alloc(spa_special_class(spa)) +
	    metaslab_class_get_alloc(spa_dedup_class(spa)) +
	    get_unflushed_alloc_space(spa);
	/* ...versus what traversal actually found. */
	total_found = tzb->zb_asize - zcb->zcb_dedup_asize +
	    zcb->zcb_removing_size + zcb->zcb_checkpoint_size;

	if (total_found == total_alloc && !dump_opt['L']) {
		(void) printf("\n\tNo leaks (block sum matches space"
		    " maps exactly)\n");
	} else if (!dump_opt['L']) {
		(void) printf("block traversal size %llu != alloc %llu "
		    "(%s %lld)\n",
		    (u_longlong_t)total_found,
		    (u_longlong_t)total_alloc,
		    (dump_opt['L']) ? "unreachable" : "leaked",
		    (longlong_t)(total_alloc - total_found));
		leaks = B_TRUE;
	}

	/* An empty pool is reported like a leak (exit status 2). */
	if (tzb->zb_count == 0) {
		umem_free(zcb, sizeof (zdb_cb_t));
		return (2);
	}

	(void) printf("\n");
	(void) printf("\t%-16s %14llu\n", "bp count:",
	    (u_longlong_t)tzb->zb_count);
	(void) printf("\t%-16s %14llu\n", "ganged count:",
	    (longlong_t)tzb->zb_gangs);
	(void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:",
	    (u_longlong_t)tzb->zb_lsize,
	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
	(void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n",
	    "bp physical:", (u_longlong_t)tzb->zb_psize,
	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
	    (double)tzb->zb_lsize / tzb->zb_psize);
	(void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n",
	    "bp allocated:", (u_longlong_t)tzb->zb_asize,
	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
	    (double)tzb->zb_lsize / tzb->zb_asize);
	(void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n",
	    "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize,
	    (u_longlong_t)zcb->zcb_dedup_blocks,
	    (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0);
	(void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:",
	    (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);

	/* Per-allocation-class usage, only for classes that exist. */
	if (spa_special_class(spa)->mc_allocator[0].mca_rotor != NULL) {
		uint64_t alloc = metaslab_class_get_alloc(
		    spa_special_class(spa));
		uint64_t space = metaslab_class_get_space(
		    spa_special_class(spa));

		(void) printf("\t%-16s %14llu used: %5.2f%%\n",
		    "Special class", (u_longlong_t)alloc,
		    100.0 * alloc / space);
	}

	if (spa_dedup_class(spa)->mc_allocator[0].mca_rotor != NULL) {
		uint64_t alloc = metaslab_class_get_alloc(
		    spa_dedup_class(spa));
		uint64_t space = metaslab_class_get_space(
		    spa_dedup_class(spa));

		(void) printf("\t%-16s %14llu used: %5.2f%%\n",
		    "Dedup class", (u_longlong_t)alloc,
		    100.0 * alloc / space);
	}

	if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) {
		uint64_t alloc = metaslab_class_get_alloc(
		    spa_embedded_log_class(spa));
		uint64_t space = metaslab_class_get_space(
		    spa_embedded_log_class(spa));

		(void) printf("\t%-16s %14llu used: %5.2f%%\n",
		    "Embedded log class", (u_longlong_t)alloc,
		    100.0 * alloc / space);
	}

	/* Embedded (BP_EMBEDDED) blocks are not counted in the totals. */
	for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
		if (zcb->zcb_embedded_blocks[i] == 0)
			continue;
		(void) printf("\n");
		(void) printf("\tadditional, non-pointer bps of type %u: "
		    "%10llu\n",
		    i, (u_longlong_t)zcb->zcb_embedded_blocks[i]);

		if (dump_opt['b'] >= 3) {
			(void) printf("\t number of (compressed) bytes: "
			    "number of bps\n");
			dump_histogram(zcb->zcb_embedded_histogram[i],
			    sizeof (zcb->zcb_embedded_histogram[i]) /
			    sizeof (zcb->zcb_embedded_histogram[i][0]), 0);
		}
	}

	if (tzb->zb_ditto_samevdev != 0) {
		(void) printf("\tDittoed blocks on same vdev: %llu\n",
		    (longlong_t)tzb->zb_ditto_samevdev);
	}
	if (tzb->zb_ditto_same_ms != 0) {
		(void) printf("\tDittoed blocks in same metaslab: %llu\n",
		    (longlong_t)tzb->zb_ditto_same_ms);
	}

	/* Summarize each indirect (removed) vdev's mapping size. */
	for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) {
		vdev_t *vd = spa->spa_root_vdev->vdev_child[v];
		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;

		if (vim == NULL) {
			continue;
		}

		char mem[32];
		zdb_nicenum(vdev_indirect_mapping_num_entries(vim),
		    mem, vdev_indirect_mapping_size(vim));

		(void) printf("\tindirect vdev id %llu has %llu segments "
		    "(%s in memory)\n",
		    (longlong_t)vd->vdev_id,
		    (longlong_t)vdev_indirect_mapping_num_entries(vim), mem);
	}

	/* With -bb or more, print the per-type / per-level table. */
	if (dump_opt['b'] >= 2) {
		int l, t, level;
		char csize[32], lsize[32], psize[32], asize[32];
		char avg[32], gang[32];
		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
		    "\t avg\t comp\t%%Total\tType\n");

		zfs_blkstat_t *mdstats = umem_zalloc(sizeof (zfs_blkstat_t),
		    UMEM_NOFAIL);

		for (t = 0; t <= ZDB_OT_TOTAL; t++) {
			const char *typename;

			/* make sure nicenum has enough space */
			_Static_assert(sizeof (csize) >= NN_NUMBUF_SZ,
			    "csize truncated");
			_Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ,
			    "lsize truncated");
			_Static_assert(sizeof (psize) >= NN_NUMBUF_SZ,
			    "psize truncated");
			_Static_assert(sizeof (asize) >= NN_NUMBUF_SZ,
			    "asize truncated");
			_Static_assert(sizeof (avg) >= NN_NUMBUF_SZ,
			    "avg truncated");
			_Static_assert(sizeof (gang) >= NN_NUMBUF_SZ,
			    "gang truncated");

			if (t < DMU_OT_NUMTYPES)
				typename = dmu_ot[t].ot_name;
			else
				typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];

			if (zcb->zcb_type[ZB_TOTAL][t].zb_asize == 0) {
				(void) printf("%6s\t%5s\t%5s\t%5s"
				    "\t%5s\t%5s\t%6s\t%s\n",
				    "-",
				    "-",
				    "-",
				    "-",
				    "-",
				    "-",
				    "-",
				    typename);
				continue;
			}

			/* l == -1 prints the ZB_TOTAL (all-level) row last. */
			for (l = ZB_TOTAL - 1; l >= -1; l--) {
				level = (l == -1 ? ZB_TOTAL : l);
				zb = &zcb->zcb_type[level][t];

				if (zb->zb_asize == 0)
					continue;

				/* Aggregate metadata types for the */
				/* "Metadata Total" row printed below. */
				if (level != ZB_TOTAL && t < DMU_OT_NUMTYPES &&
				    (level > 0 || DMU_OT_IS_METADATA(t))) {
					mdstats->zb_count += zb->zb_count;
					mdstats->zb_lsize += zb->zb_lsize;
					mdstats->zb_psize += zb->zb_psize;
					mdstats->zb_asize += zb->zb_asize;
					mdstats->zb_gangs += zb->zb_gangs;
				}

				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
					continue;

				if (level == 0 && zb->zb_asize ==
				    zcb->zcb_type[ZB_TOTAL][t].zb_asize)
					continue;

				zdb_nicenum(zb->zb_count, csize,
				    sizeof (csize));
				zdb_nicenum(zb->zb_lsize, lsize,
				    sizeof (lsize));
				zdb_nicenum(zb->zb_psize, psize,
				    sizeof (psize));
				zdb_nicenum(zb->zb_asize, asize,
				    sizeof (asize));
				zdb_nicenum(zb->zb_asize / zb->zb_count, avg,
				    sizeof (avg));
				zdb_nicenum(zb->zb_gangs, gang, sizeof (gang));

				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
				    "\t%5.2f\t%6.2f\t",
				    csize, lsize, psize, asize, avg,
				    (double)zb->zb_lsize / zb->zb_psize,
				    100.0 * zb->zb_asize / tzb->zb_asize);

				if (level == ZB_TOTAL)
					(void) printf("%s\n", typename);
				else
					(void) printf("    L%d %s\n",
					    level, typename);

				if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
					(void) printf("\t number of ganged "
					    "blocks: %s\n", gang);
				}

				if (dump_opt['b'] >= 4) {
					(void) printf("psize "
					    "(in 512-byte sectors): "
					    "number of blocks\n");
					dump_histogram(zb->zb_psize_histogram,
					    PSIZE_HISTO_SIZE, 0);
				}
			}
		}
		zdb_nicenum(mdstats->zb_count, csize,
		    sizeof (csize));
		zdb_nicenum(mdstats->zb_lsize, lsize,
		    sizeof (lsize));
		zdb_nicenum(mdstats->zb_psize, psize,
		    sizeof (psize));
		zdb_nicenum(mdstats->zb_asize, asize,
		    sizeof (asize));
		zdb_nicenum(mdstats->zb_asize / mdstats->zb_count, avg,
		    sizeof (avg));
		zdb_nicenum(mdstats->zb_gangs, gang, sizeof (gang));

		(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
		    "\t%5.2f\t%6.2f\t",
		    csize, lsize, psize, asize, avg,
		    (double)mdstats->zb_lsize / mdstats->zb_psize,
		    100.0 * mdstats->zb_asize / tzb->zb_asize);
		(void) printf("%s\n", "Metadata Total");

		/* Output a table summarizing block sizes in the pool */
		if (dump_opt['b'] >= 2) {
			dump_size_histograms(zcb);
		}

		umem_free(mdstats, sizeof (zfs_blkstat_t));
	}

	(void) printf("\n");

	if (leaks) {
		umem_free(zcb, sizeof (zdb_cb_t));
		return (2);
	}

	if (zcb->zcb_haderrors) {
		umem_free(zcb, sizeof (zdb_cb_t));
		return (3);
	}

	umem_free(zcb, sizeof (zdb_cb_t));
	return (0);
}
7052
/*
 * In-core record used by dump_simulated_ddt() (-S): one AVL node per
 * unique block key, accumulating reference counts and sizes.
 */
typedef struct zdb_ddt_entry {
	ddt_key_t zdde_key;		/* checksum-based dedup key */
	uint64_t zdde_ref_blocks;	/* number of references seen */
	uint64_t zdde_ref_lsize;	/* total referenced logical size */
	uint64_t zdde_ref_psize;	/* total referenced physical size */
	uint64_t zdde_ref_dsize;	/* total referenced deflated size */
	avl_node_t zdde_node;		/* linkage in the simulated DDT */
} zdb_ddt_entry_t;
7061
/*
 * traverse_pool() callback for -S: record each dedup-eligible data
 * block in the AVL tree (arg), keyed by its DDT key, accumulating
 * reference counts and sizes to simulate a dedup table.
 */
static int
zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	(void) zilog, (void) dnp;
	avl_tree_t *t = arg;
	avl_index_t where;
	zdb_ddt_entry_t *zdde, zdde_search;

	if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
	    BP_IS_EMBEDDED(bp))
		return (0);

	/* With -SS, report progress at each objset root. */
	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
		(void) printf("traversing objset %llu, %llu objects, "
		    "%lu blocks so far\n",
		    (u_longlong_t)zb->zb_objset,
		    (u_longlong_t)BP_GET_FILL(bp),
		    avl_numnodes(t));
	}

	/*
	 * Only checksummed level-0 non-metadata blocks are dedup
	 * candidates.  (BP_IS_HOLE was already excluded above.)
	 */
	if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
	    BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
		return (0);

	ddt_key_fill(&zdde_search.zdde_key, bp);

	zdde = avl_find(t, &zdde_search, &where);

	if (zdde == NULL) {
		/* First reference to this key: insert a zeroed entry. */
		zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
		zdde->zdde_key = zdde_search.zdde_key;
		avl_insert(t, zdde, where);
	}

	zdde->zdde_ref_blocks += 1;
	zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
	zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
	zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);

	return (0);
}
7104
/*
 * Implements -S: traverse the pool collecting per-key reference
 * statistics via zdb_ddt_add_cb(), then print the dedup-table
 * histogram and ratios the pool would have if dedup were enabled.
 */
static void
dump_simulated_ddt(spa_t *spa)
{
	avl_tree_t t;
	void *cookie = NULL;
	zdb_ddt_entry_t *zdde;
	ddt_histogram_t ddh_total = {{{0}}};
	ddt_stat_t dds_total = {0};

	avl_create(&t, ddt_entry_compare,
	    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
	    TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t);

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	/*
	 * Fold each unique block's stats into the histogram bucket
	 * indexed by log2 of its reference count, freeing nodes as
	 * they are consumed.
	 */
	while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
		ddt_stat_t dds;
		uint64_t refcnt = zdde->zdde_ref_blocks;
		ASSERT(refcnt != 0);

		/* Per-unique-block values (totals divided by refcount). */
		dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
		dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
		dds.dds_psize = zdde->zdde_ref_psize / refcnt;
		dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;

		/* Totals across all references. */
		dds.dds_ref_blocks = zdde->zdde_ref_blocks;
		dds.dds_ref_lsize = zdde->zdde_ref_lsize;
		dds.dds_ref_psize = zdde->zdde_ref_psize;
		dds.dds_ref_dsize = zdde->zdde_ref_dsize;

		ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
		    &dds, 0);

		umem_free(zdde, sizeof (*zdde));
	}

	avl_destroy(&t);

	ddt_histogram_stat(&dds_total, &ddh_total);

	(void) printf("Simulated DDT histogram:\n");

	zpool_dump_ddt(&dds_total, &ddh_total);

	dump_dedup_ratio(&dds_total);
}
7155
/*
 * Cross-check the device_removal and obsolete_counts feature
 * refcounts against the number of MOS objects that should be
 * contributing to each.  Returns 0 if both match, 1 otherwise.
 */
static int
verify_device_removal_feature_counts(spa_t *spa)
{
	uint64_t dr_feature_refcount = 0;
	uint64_t oc_feature_refcount = 0;
	uint64_t indirect_vdev_count = 0;
	uint64_t precise_vdev_count = 0;
	uint64_t obsolete_counts_object_count = 0;
	uint64_t obsolete_sm_count = 0;
	uint64_t obsolete_counts_count = 0;
	uint64_t scip_count = 0;
	uint64_t obsolete_bpobj_count = 0;
	int ret = 0;

	/* An in-progress condense of an indirect mapping, if any. */
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	if (scip->scip_next_mapping_object != 0) {
		vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev];
		ASSERT(scip->scip_prev_obsolete_sm_object != 0);
		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

		(void) printf("Condensing indirect vdev %llu: new mapping "
		    "object %llu, prev obsolete sm %llu\n",
		    (u_longlong_t)scip->scip_vdev,
		    (u_longlong_t)scip->scip_next_mapping_object,
		    (u_longlong_t)scip->scip_prev_obsolete_sm_object);
		if (scip->scip_prev_obsolete_sm_object != 0) {
			space_map_t *prev_obsolete_sm = NULL;
			VERIFY0(space_map_open(&prev_obsolete_sm,
			    spa->spa_meta_objset,
			    scip->scip_prev_obsolete_sm_object,
			    0, vd->vdev_asize, 0));
			dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
			(void) printf("\n");
			space_map_close(prev_obsolete_sm);
		}

		/* A condense in progress holds two refcounted objects. */
		scip_count += 2;
	}

	for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
		vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;

		if (vic->vic_mapping_object != 0) {
			ASSERT(vd->vdev_ops == &vdev_indirect_ops ||
			    vd->vdev_removing);
			indirect_vdev_count++;

			if (vd->vdev_indirect_mapping->vim_havecounts) {
				obsolete_counts_count++;
			}
		}

		boolean_t are_precise;
		VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
		if (are_precise) {
			ASSERT(vic->vic_mapping_object != 0);
			precise_vdev_count++;
		}

		uint64_t obsolete_sm_object;
		VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
		if (obsolete_sm_object != 0) {
			ASSERT(vic->vic_mapping_object != 0);
			obsolete_sm_count++;
		}
	}

	(void) feature_get_refcount(spa,
	    &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL],
	    &dr_feature_refcount);
	(void) feature_get_refcount(spa,
	    &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS],
	    &oc_feature_refcount);

	if (dr_feature_refcount != indirect_vdev_count) {
		ret = 1;
		(void) printf("Number of indirect vdevs (%llu) " \
		    "does not match feature count (%llu)\n",
		    (u_longlong_t)indirect_vdev_count,
		    (u_longlong_t)dr_feature_refcount);
	} else {
		(void) printf("Verified device_removal feature refcount " \
		    "of %llu is correct\n",
		    (u_longlong_t)dr_feature_refcount);
	}

	if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_OBSOLETE_BPOBJ) == 0) {
		obsolete_bpobj_count++;
	}


	/* Sum every object kind that holds an obsolete_counts refcount. */
	obsolete_counts_object_count = precise_vdev_count;
	obsolete_counts_object_count += obsolete_sm_count;
	obsolete_counts_object_count += obsolete_counts_count;
	obsolete_counts_object_count += scip_count;
	obsolete_counts_object_count += obsolete_bpobj_count;
	obsolete_counts_object_count += remap_deadlist_count;

	if (oc_feature_refcount != obsolete_counts_object_count) {
		ret = 1;
		(void) printf("Number of obsolete counts objects (%llu) " \
		    "does not match feature count (%llu)\n",
		    (u_longlong_t)obsolete_counts_object_count,
		    (u_longlong_t)oc_feature_refcount);
		(void) printf("pv:%llu os:%llu oc:%llu sc:%llu "
		    "ob:%llu rd:%llu\n",
		    (u_longlong_t)precise_vdev_count,
		    (u_longlong_t)obsolete_sm_count,
		    (u_longlong_t)obsolete_counts_count,
		    (u_longlong_t)scip_count,
		    (u_longlong_t)obsolete_bpobj_count,
		    (u_longlong_t)remap_deadlist_count);
	} else {
		(void) printf("Verified indirect_refcount feature refcount " \
		    "of %llu is correct\n",
		    (u_longlong_t)oc_feature_refcount);
	}
	return (ret);
}
7278
7279 static void
7280 zdb_set_skip_mmp(char *target)
7281 {
7282 spa_t *spa;
7283
7284 /*
7285 * Disable the activity check to allow examination of
7286 * active pools.
7287 */
7288 mutex_enter(&spa_namespace_lock);
7289 if ((spa = spa_lookup(target)) != NULL) {
7290 spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP;
7291 }
7292 mutex_exit(&spa_namespace_lock);
7293 }
7294
7295 #define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
7296 /*
7297 * Import the checkpointed state of the pool specified by the target
7298 * parameter as readonly. The function also accepts a pool config
7299 * as an optional parameter, else it attempts to infer the config by
7300 * the name of the target pool.
7301 *
7302 * Note that the checkpointed state's pool name will be the name of
7303 * the original pool with the above suffix appended to it. In addition,
7304 * if the target is not a pool name (e.g. a path to a dataset) then
7305 * the new_path parameter is populated with the updated path to
7306 * reflect the fact that we are looking into the checkpointed state.
7307 *
7308 * The function returns a newly-allocated copy of the name of the
7309 * pool containing the checkpointed state. When this copy is no
7310 * longer needed it should be freed with free(3C). Same thing
7311 * applies to the new_path parameter if allocated.
7312 */
7313 static char *
7314 import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
7315 {
7316 int error = 0;
7317 char *poolname, *bogus_name = NULL;
7318 boolean_t freecfg = B_FALSE;
7319
7320 /* If the target is not a pool, the extract the pool name */
7321 char *path_start = strchr(target, '/');
7322 if (path_start != NULL) {
7323 size_t poolname_len = path_start - target;
7324 poolname = strndup(target, poolname_len);
7325 } else {
7326 poolname = target;
7327 }
7328
7329 if (cfg == NULL) {
7330 zdb_set_skip_mmp(poolname);
7331 error = spa_get_stats(poolname, &cfg, NULL, 0);
7332 if (error != 0) {
7333 fatal("Tried to read config of pool \"%s\" but "
7334 "spa_get_stats() failed with error %d\n",
7335 poolname, error);
7336 }
7337 freecfg = B_TRUE;
7338 }
7339
7340 if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) {
7341 if (target != poolname)
7342 free(poolname);
7343 return (NULL);
7344 }
7345 fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
7346
7347 error = spa_import(bogus_name, cfg, NULL,
7348 ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT |
7349 ZFS_IMPORT_SKIP_MMP);
7350 if (freecfg)
7351 nvlist_free(cfg);
7352 if (error != 0) {
7353 fatal("Tried to import pool \"%s\" but spa_import() failed "
7354 "with error %d\n", bogus_name, error);
7355 }
7356
7357 if (new_path != NULL && path_start != NULL) {
7358 if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
7359 free(bogus_name);
7360 if (path_start != NULL)
7361 free(poolname);
7362 return (NULL);
7363 }
7364 }
7365
7366 if (target != poolname)
7367 free(poolname);
7368
7369 return (bogus_name);
7370 }
7371
/*
 * Callback state for verify_checkpoint_sm_entry_cb(): the vdev whose
 * checkpoint space map is being walked, plus counters used only for
 * periodic progress output on stderr.
 */
typedef struct verify_checkpoint_sm_entry_cb_arg {
	vdev_t *vcsec_vd;

	/* the following fields are only used for printing progress */
	uint64_t vcsec_entryid;		/* entries processed so far */
	uint64_t vcsec_num_entries;	/* total entries in the space map */
} verify_checkpoint_sm_entry_cb_arg_t;

/* Print a progress update every this many space map entries. */
#define ENTRIES_PER_PROGRESS_UPDATE 10000
7381
7382 static int
7383 verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
7384 {
7385 verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
7386 vdev_t *vd = vcsec->vcsec_vd;
7387 metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
7388 uint64_t end = sme->sme_offset + sme->sme_run;
7389
7390 ASSERT(sme->sme_type == SM_FREE);
7391
7392 if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
7393 (void) fprintf(stderr,
7394 "\rverifying vdev %llu, space map entry %llu of %llu ...",
7395 (longlong_t)vd->vdev_id,
7396 (longlong_t)vcsec->vcsec_entryid,
7397 (longlong_t)vcsec->vcsec_num_entries);
7398 }
7399 vcsec->vcsec_entryid++;
7400
7401 /*
7402 * See comment in checkpoint_sm_exclude_entry_cb()
7403 */
7404 VERIFY3U(sme->sme_offset, >=, ms->ms_start);
7405 VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
7406
7407 /*
7408 * The entries in the vdev_checkpoint_sm should be marked as
7409 * allocated in the checkpointed state of the pool, therefore
7410 * their respective ms_allocateable trees should not contain them.
7411 */
7412 mutex_enter(&ms->ms_lock);
7413 range_tree_verify_not_present(ms->ms_allocatable,
7414 sme->sme_offset, sme->sme_run);
7415 mutex_exit(&ms->ms_lock);
7416
7417 return (0);
7418 }
7419
/*
 * Verify that all segments in the vdev_checkpoint_sm are allocated
 * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
 * ms_allocatable).
 *
 * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
 * each vdev in the current state of the pool to the metaslab space maps
 * (ms_sm) of the checkpointed state of the pool.
 *
 * Note that the function changes the state of the ms_allocatable
 * trees of the current spa_t. The entries of these ms_allocatable
 * trees are cleared out and then repopulated with the free
 * entries of their respective ms_sm space maps.
 */
static void
verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
{
	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
	vdev_t *current_rvd = current->spa_root_vdev;

	/* Populate the checkpointed state's ms_allocatable trees. */
	load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);

	for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {
		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];
		vdev_t *current_vd = current_rvd->vdev_child[c];

		space_map_t *checkpoint_sm = NULL;
		uint64_t checkpoint_sm_obj;

		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
			/*
			 * Since we don't allow device removal in a pool
			 * that has a checkpoint, we expect that all removed
			 * vdevs were removed from the pool before the
			 * checkpoint.
			 */
			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
			continue;
		}

		/*
		 * If the checkpoint space map doesn't exist, then nothing
		 * here is checkpointed so there's nothing to verify.
		 */
		if (current_vd->vdev_top_zap == 0 ||
		    zap_contains(spa_meta_objset(current),
		    current_vd->vdev_top_zap,
		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
			continue;

		VERIFY0(zap_lookup(spa_meta_objset(current),
		    current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
		    sizeof (uint64_t), 1, &checkpoint_sm_obj));

		VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
		    checkpoint_sm_obj, 0, current_vd->vdev_asize,
		    current_vd->vdev_ashift));

		/* Walk every entry of this vdev's checkpoint space map. */
		verify_checkpoint_sm_entry_cb_arg_t vcsec;
		vcsec.vcsec_vd = ckpoint_vd;
		vcsec.vcsec_entryid = 0;
		vcsec.vcsec_num_entries =
		    space_map_length(checkpoint_sm) / sizeof (uint64_t);
		VERIFY0(space_map_iterate(checkpoint_sm,
		    space_map_length(checkpoint_sm),
		    verify_checkpoint_sm_entry_cb, &vcsec));
		if (dump_opt['m'] > 3)
			dump_spacemap(current->spa_meta_objset, checkpoint_sm);
		space_map_close(checkpoint_sm);
	}

	/*
	 * If we've added vdevs since we took the checkpoint, ensure
	 * that their checkpoint space maps are empty.
	 */
	if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
		for (uint64_t c = ckpoint_rvd->vdev_children;
		    c < current_rvd->vdev_children; c++) {
			vdev_t *current_vd = current_rvd->vdev_child[c];
			VERIFY3P(current_vd->vdev_checkpoint_sm, ==, NULL);
		}
	}

	/* for cleaner progress output */
	(void) fprintf(stderr, "\n");
}
7506
/*
 * Verifies that all space that's allocated in the checkpoint is
 * still allocated in the current version, by checking that everything
 * in checkpoint's ms_allocatable (which is actually allocated, not
 * allocatable/free) is not present in current's ms_allocatable.
 *
 * Note that the function changes the state of the ms_allocatable
 * trees of both spas when called. The entries of all ms_allocatable
 * trees are cleared out and then repopulated from their respective
 * ms_sm space maps. In the checkpointed state we load the allocated
 * entries, and in the current state we load the free entries.
 */
static void
verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
{
	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
	vdev_t *current_rvd = current->spa_root_vdev;

	/* Checkpoint trees hold allocated space; current trees hold free. */
	load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
	load_concrete_ms_allocatable_trees(current, SM_FREE);

	for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
		vdev_t *current_vd = current_rvd->vdev_child[i];

		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
			/*
			 * See comment in verify_checkpoint_vdev_spacemaps()
			 */
			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
			continue;
		}

		for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
			metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
			metaslab_t *current_msp = current_vd->vdev_ms[m];

			(void) fprintf(stderr,
			    "\rverifying vdev %llu of %llu, "
			    "metaslab %llu of %llu ...",
			    (longlong_t)current_vd->vdev_id,
			    (longlong_t)current_rvd->vdev_children,
			    (longlong_t)current_vd->vdev_ms[m]->ms_id,
			    (longlong_t)current_vd->vdev_ms_count);

			/*
			 * We walk through the ms_allocatable trees that
			 * are loaded with the allocated blocks from the
			 * ms_sm spacemaps of the checkpoint. For each
			 * one of these ranges we ensure that none of them
			 * exists in the ms_allocatable trees of the
			 * current state which are loaded with the ranges
			 * that are currently free.
			 *
			 * This way we ensure that none of the blocks that
			 * are part of the checkpoint were freed by mistake.
			 */
			range_tree_walk(ckpoint_msp->ms_allocatable,
			    (range_tree_func_t *)range_tree_verify_not_present,
			    current_msp->ms_allocatable);
		}
	}

	/* for cleaner progress output */
	(void) fprintf(stderr, "\n");
}
7573
7574 static void
7575 verify_checkpoint_blocks(spa_t *spa)
7576 {
7577 ASSERT(!dump_opt['L']);
7578
7579 spa_t *checkpoint_spa;
7580 char *checkpoint_pool;
7581 int error = 0;
7582
7583 /*
7584 * We import the checkpointed state of the pool (under a different
7585 * name) so we can do verification on it against the current state
7586 * of the pool.
7587 */
7588 checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL,
7589 NULL);
7590 ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
7591
7592 error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
7593 if (error != 0) {
7594 fatal("Tried to open pool \"%s\" but spa_open() failed with "
7595 "error %d\n", checkpoint_pool, error);
7596 }
7597
7598 /*
7599 * Ensure that ranges in the checkpoint space maps of each vdev
7600 * are allocated according to the checkpointed state's metaslab
7601 * space maps.
7602 */
7603 verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
7604
7605 /*
7606 * Ensure that allocated ranges in the checkpoint's metaslab
7607 * space maps remain allocated in the metaslab space maps of
7608 * the current state.
7609 */
7610 verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);
7611
7612 /*
7613 * Once we are done, we get rid of the checkpointed state.
7614 */
7615 spa_close(checkpoint_spa, FTAG);
7616 free(checkpoint_pool);
7617 }
7618
7619 static void
7620 dump_leftover_checkpoint_blocks(spa_t *spa)
7621 {
7622 vdev_t *rvd = spa->spa_root_vdev;
7623
7624 for (uint64_t i = 0; i < rvd->vdev_children; i++) {
7625 vdev_t *vd = rvd->vdev_child[i];
7626
7627 space_map_t *checkpoint_sm = NULL;
7628 uint64_t checkpoint_sm_obj;
7629
7630 if (vd->vdev_top_zap == 0)
7631 continue;
7632
7633 if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
7634 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
7635 continue;
7636
7637 VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
7638 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
7639 sizeof (uint64_t), 1, &checkpoint_sm_obj));
7640
7641 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
7642 checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
7643 dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
7644 space_map_close(checkpoint_sm);
7645 }
7646 }
7647
7648 static int
7649 verify_checkpoint(spa_t *spa)
7650 {
7651 uberblock_t checkpoint;
7652 int error;
7653
7654 if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
7655 return (0);
7656
7657 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
7658 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
7659 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
7660
7661 if (error == ENOENT && !dump_opt['L']) {
7662 /*
7663 * If the feature is active but the uberblock is missing
7664 * then we must be in the middle of discarding the
7665 * checkpoint.
7666 */
7667 (void) printf("\nPartially discarded checkpoint "
7668 "state found:\n");
7669 if (dump_opt['m'] > 3)
7670 dump_leftover_checkpoint_blocks(spa);
7671 return (0);
7672 } else if (error != 0) {
7673 (void) printf("lookup error %d when looking for "
7674 "checkpointed uberblock in MOS\n", error);
7675 return (error);
7676 }
7677 dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");
7678
7679 if (checkpoint.ub_checkpoint_txg == 0) {
7680 (void) printf("\nub_checkpoint_txg not set in checkpointed "
7681 "uberblock\n");
7682 error = 3;
7683 }
7684
7685 if (error == 0 && !dump_opt['L'])
7686 verify_checkpoint_blocks(spa);
7687
7688 return (error);
7689 }
7690
7691 static void
7692 mos_leaks_cb(void *arg, uint64_t start, uint64_t size)
7693 {
7694 (void) arg;
7695 for (uint64_t i = start; i < size; i++) {
7696 (void) printf("MOS object %llu referenced but not allocated\n",
7697 (u_longlong_t)i);
7698 }
7699 }
7700
7701 static void
7702 mos_obj_refd(uint64_t obj)
7703 {
7704 if (obj != 0 && mos_refd_objs != NULL)
7705 range_tree_add(mos_refd_objs, obj, 1);
7706 }
7707
7708 /*
7709 * Call on a MOS object that may already have been referenced.
7710 */
7711 static void
7712 mos_obj_refd_multiple(uint64_t obj)
7713 {
7714 if (obj != 0 && mos_refd_objs != NULL &&
7715 !range_tree_contains(mos_refd_objs, obj, 1))
7716 range_tree_add(mos_refd_objs, obj, 1);
7717 }
7718
7719 static void
7720 mos_leak_vdev_top_zap(vdev_t *vd)
7721 {
7722 uint64_t ms_flush_data_obj;
7723 int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
7724 vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
7725 sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj);
7726 if (error == ENOENT)
7727 return;
7728 ASSERT0(error);
7729
7730 mos_obj_refd(ms_flush_data_obj);
7731 }
7732
/*
 * Recursively mark every MOS object referenced from this vdev and its
 * children: DTL object, metaslab array, indirect births/mapping
 * objects, leaf/top/root ZAPs, checkpoint and obsolete space maps, and
 * each metaslab's space map object.
 */
static void
mos_leak_vdev(vdev_t *vd)
{
	mos_obj_refd(vd->vdev_dtl_object);
	mos_obj_refd(vd->vdev_ms_array);
	mos_obj_refd(vd->vdev_indirect_config.vic_births_object);
	mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object);
	mos_obj_refd(vd->vdev_leaf_zap);
	if (vd->vdev_checkpoint_sm != NULL)
		mos_obj_refd(vd->vdev_checkpoint_sm->sm_object);
	if (vd->vdev_indirect_mapping != NULL) {
		mos_obj_refd(vd->vdev_indirect_mapping->
		    vim_phys->vimp_counts_object);
	}
	if (vd->vdev_obsolete_sm != NULL)
		mos_obj_refd(vd->vdev_obsolete_sm->sm_object);

	/* Per-metaslab space map objects (mos_obj_refd() ignores 0). */
	for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *ms = vd->vdev_ms[m];
		mos_obj_refd(space_map_object(ms->ms_sm));
	}

	if (vd->vdev_root_zap != 0)
		mos_obj_refd(vd->vdev_root_zap);

	if (vd->vdev_top_zap != 0) {
		mos_obj_refd(vd->vdev_top_zap);
		mos_leak_vdev_top_zap(vd);
	}

	/* Recurse into child vdevs. */
	for (uint64_t c = 0; c < vd->vdev_children; c++) {
		mos_leak_vdev(vd->vdev_child[c]);
	}
}
7767
7768 static void
7769 mos_leak_log_spacemaps(spa_t *spa)
7770 {
7771 uint64_t spacemap_zap;
7772 int error = zap_lookup(spa_meta_objset(spa),
7773 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP,
7774 sizeof (spacemap_zap), 1, &spacemap_zap);
7775 if (error == ENOENT)
7776 return;
7777 ASSERT0(error);
7778
7779 mos_obj_refd(spacemap_zap);
7780 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
7781 sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls))
7782 mos_obj_refd(sls->sls_sm_obj);
7783 }
7784
7785 static void
7786 errorlog_count_refd(objset_t *mos, uint64_t errlog)
7787 {
7788 zap_cursor_t zc;
7789 zap_attribute_t za;
7790 for (zap_cursor_init(&zc, mos, errlog);
7791 zap_cursor_retrieve(&zc, &za) == 0;
7792 zap_cursor_advance(&zc)) {
7793 mos_obj_refd(za.za_first_integer);
7794 }
7795 zap_cursor_fini(&zc);
7796 }
7797
/*
 * MOS leak detection: mark every MOS object that is referenced from
 * pool metadata, then walk all allocated MOS objects and report any
 * that were never marked (leaked), as well as any marked objects that
 * were never allocated.  Returns 0 if clean, 2 if a discrepancy was
 * found.  Consumes (vacates and destroys) mos_refd_objs.
 */
static int
dump_mos_leaks(spa_t *spa)
{
	int rv = 0;
	objset_t *mos = spa->spa_meta_objset;
	dsl_pool_t *dp = spa->spa_dsl_pool;

	/* Visit and mark all referenced objects in the MOS */

	mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT);
	mos_obj_refd(spa->spa_pool_props_object);
	mos_obj_refd(spa->spa_config_object);
	mos_obj_refd(spa->spa_ddt_stat_object);
	mos_obj_refd(spa->spa_feat_desc_obj);
	mos_obj_refd(spa->spa_feat_enabled_txg_obj);
	mos_obj_refd(spa->spa_feat_for_read_obj);
	mos_obj_refd(spa->spa_feat_for_write_obj);
	mos_obj_refd(spa->spa_history);
	mos_obj_refd(spa->spa_errlog_last);
	mos_obj_refd(spa->spa_errlog_scrub);

	/* With head_errlog, the error logs reference further objects. */
	if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
		errorlog_count_refd(mos, spa->spa_errlog_last);
		errorlog_count_refd(mos, spa->spa_errlog_scrub);
	}

	mos_obj_refd(spa->spa_all_vdev_zaps);
	mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj);
	mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj);
	mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj);
	bpobj_count_refd(&spa->spa_deferred_bpobj);
	mos_obj_refd(dp->dp_empty_bpobj);
	bpobj_count_refd(&dp->dp_obsolete_bpobj);
	bpobj_count_refd(&dp->dp_free_bpobj);
	mos_obj_refd(spa->spa_l2cache.sav_object);
	mos_obj_refd(spa->spa_spares.sav_object);

	/* Log space map objects. */
	if (spa->spa_syncing_log_sm != NULL)
		mos_obj_refd(spa->spa_syncing_log_sm->sm_object);
	mos_leak_log_spacemaps(spa);

	/* Indirect-vdev condensing state. */
	mos_obj_refd(spa->spa_condensing_indirect_phys.
	    scip_next_mapping_object);
	mos_obj_refd(spa->spa_condensing_indirect_phys.
	    scip_prev_obsolete_sm_object);
	if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) {
		vdev_indirect_mapping_t *vim =
		    vdev_indirect_mapping_open(mos,
		    spa->spa_condensing_indirect_phys.scip_next_mapping_object);
		mos_obj_refd(vim->vim_phys->vimp_counts_object);
		vdev_indirect_mapping_close(vim);
	}
	deleted_livelists_dump_mos(spa);

	/* The origin snapshot and its next snapshot, if present. */
	if (dp->dp_origin_snap != NULL) {
		dsl_dataset_t *ds;

		dsl_pool_config_enter(dp, FTAG);
		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj,
		    FTAG, &ds));
		count_ds_mos_objects(ds);
		dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
		dsl_dataset_rele(ds, FTAG);
		dsl_pool_config_exit(dp, FTAG);

		count_ds_mos_objects(dp->dp_origin_snap);
		dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist");
	}
	count_dir_mos_objects(dp->dp_mos_dir);
	if (dp->dp_free_dir != NULL)
		count_dir_mos_objects(dp->dp_free_dir);
	if (dp->dp_leak_dir != NULL)
		count_dir_mos_objects(dp->dp_leak_dir);

	/* Everything reachable from the vdev tree. */
	mos_leak_vdev(spa->spa_root_vdev);

	/* Dedup table objects for every (checksum, type, class) combo. */
	for (uint64_t class = 0; class < DDT_CLASSES; class++) {
		for (uint64_t type = 0; type < DDT_TYPES; type++) {
			for (uint64_t cksum = 0;
			    cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
				ddt_t *ddt = spa->spa_ddt[cksum];
				mos_obj_refd(ddt->ddt_object[type][class]);
			}
		}
	}

	/*
	 * Visit all allocated objects and make sure they are referenced.
	 */
	uint64_t object = 0;
	while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) {
		if (range_tree_contains(mos_refd_objs, object, 1)) {
			/* Referenced and allocated: remove from the tree. */
			range_tree_remove(mos_refd_objs, object, 1);
		} else {
			/* Allocated but never referenced: a leak. */
			dmu_object_info_t doi;
			const char *name;
			VERIFY0(dmu_object_info(mos, object, &doi));
			if (doi.doi_type & DMU_OT_NEWTYPE) {
				dmu_object_byteswap_t bswap =
				    DMU_OT_BYTESWAP(doi.doi_type);
				name = dmu_ot_byteswap[bswap].ob_name;
			} else {
				name = dmu_ot[doi.doi_type].ot_name;
			}

			(void) printf("MOS object %llu (%s) leaked\n",
			    (u_longlong_t)object, name);
			rv = 2;
		}
	}
	/* Anything left in the tree was referenced but never allocated. */
	(void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL);
	if (!range_tree_is_empty(mos_refd_objs))
		rv = 2;
	range_tree_vacate(mos_refd_objs, NULL, NULL);
	range_tree_destroy(mos_refd_objs);
	return (rv);
}
7916
/*
 * Accumulator for dump_log_spacemap_obsolete_stats(): per-log (per-txg)
 * counters plus running totals across all log space maps.
 */
typedef struct log_sm_obsolete_stats_arg {
	uint64_t lsos_current_txg;	/* txg of the log currently walked */

	uint64_t lsos_total_entries;	/* entries seen across all logs */
	uint64_t lsos_valid_entries;	/* of those, still-relevant entries */

	uint64_t lsos_sm_entries;	/* entries seen in the current log */
	uint64_t lsos_valid_sm_entries;	/* of those, still-relevant entries */
} log_sm_obsolete_stats_arg_t;
7926
/*
 * iterate_through_spacemap_logs() callback: count, per log (txg) and in
 * total, how many log space map entries are still relevant.  An entry
 * is counted as valid when its vdev is concrete and its txg is at least
 * the metaslab's unflushed txg; otherwise it is obsolete.  Prints the
 * per-log line whenever the walk advances to a new txg (the caller
 * prints the final log's line itself).
 */
static int
log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme,
    uint64_t txg, void *arg)
{
	log_sm_obsolete_stats_arg_t *lsos = arg;

	uint64_t offset = sme->sme_offset;
	uint64_t vdev_id = sme->sme_vdev;

	if (lsos->lsos_current_txg == 0) {
		/* this is the first log */
		lsos->lsos_current_txg = txg;
	} else if (lsos->lsos_current_txg < txg) {
		/* we just changed log - print stats and reset */
		(void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
		    (u_longlong_t)lsos->lsos_valid_sm_entries,
		    (u_longlong_t)lsos->lsos_sm_entries,
		    (u_longlong_t)lsos->lsos_current_txg);
		lsos->lsos_valid_sm_entries = 0;
		lsos->lsos_sm_entries = 0;
		lsos->lsos_current_txg = txg;
	}
	ASSERT3U(lsos->lsos_current_txg, ==, txg);

	lsos->lsos_sm_entries++;
	lsos->lsos_total_entries++;

	/* Entries for non-concrete (removed) vdevs are obsolete. */
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	if (!vdev_is_concrete(vd))
		return (0);

	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);

	/* Entries older than the metaslab's unflushed txg are obsolete. */
	if (txg < metaslab_unflushed_txg(ms))
		return (0);
	lsos->lsos_valid_sm_entries++;
	lsos->lsos_valid_entries++;
	return (0);
}
7967
7968 static void
7969 dump_log_spacemap_obsolete_stats(spa_t *spa)
7970 {
7971 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
7972 return;
7973
7974 log_sm_obsolete_stats_arg_t lsos = {0};
7975
7976 (void) printf("Log Space Map Obsolete Entry Statistics:\n");
7977
7978 iterate_through_spacemap_logs(spa,
7979 log_spacemap_obsolete_stats_cb, &lsos);
7980
7981 /* print stats for latest log */
7982 (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
7983 (u_longlong_t)lsos.lsos_valid_sm_entries,
7984 (u_longlong_t)lsos.lsos_sm_entries,
7985 (u_longlong_t)lsos.lsos_current_txg);
7986
7987 (void) printf("%-8llu valid entries out of %-8llu - total\n\n",
7988 (u_longlong_t)lsos.lsos_valid_entries,
7989 (u_longlong_t)lsos.lsos_total_entries);
7990 }
7991
/*
 * Main per-pool dump dispatcher: runs each dump/verification pass that
 * the user selected via dump_opt[] option counters.  On any verification
 * failure it dumps the debug buffer and exits with the failing status.
 */
static void
dump_zpool(spa_t *spa)
{
	dsl_pool_t *dp = spa_get_dsl(spa);
	int rc = 0;

	if (dump_opt['y']) {
		livelist_metaslab_validate(spa);
	}

	/* -S: simulated dedup statistics only; nothing else runs. */
	if (dump_opt['S']) {
		dump_simulated_ddt(spa);
		return;
	}

	if (!dump_opt['e'] && dump_opt['C'] > 1) {
		(void) printf("\nCached configuration:\n");
		dump_nvlist(spa->spa_config, 8);
	}

	if (dump_opt['C'])
		dump_config(spa);

	if (dump_opt['u'])
		dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");

	if (dump_opt['D'])
		dump_all_ddts(spa);

	if (dump_opt['d'] > 2 || dump_opt['m'])
		dump_metaslabs(spa);
	if (dump_opt['M'])
		dump_metaslab_groups(spa, dump_opt['M'] > 1);
	if (dump_opt['d'] > 2 || dump_opt['m']) {
		dump_log_spacemaps(spa);
		dump_log_spacemap_obsolete_stats(spa);
	}

	if (dump_opt['d'] || dump_opt['i']) {
		spa_feature_t f;
		/* Tracks MOS object references for leak detection below. */
		mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
		    0);
		dump_objset(dp->dp_meta_objset);

		if (dump_opt['d'] >= 3) {
			/* NOTE(review): this dp shadows the outer dp. */
			dsl_pool_t *dp = spa->spa_dsl_pool;
			dump_full_bpobj(&spa->spa_deferred_bpobj,
			    "Deferred frees", 0);
			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
				dump_full_bpobj(&dp->dp_free_bpobj,
				    "Pool snapshot frees", 0);
			}
			if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
				ASSERT(spa_feature_is_enabled(spa,
				    SPA_FEATURE_DEVICE_REMOVAL));
				dump_full_bpobj(&dp->dp_obsolete_bpobj,
				    "Pool obsolete blocks", 0);
			}

			if (spa_feature_is_active(spa,
			    SPA_FEATURE_ASYNC_DESTROY)) {
				dump_bptree(spa->spa_meta_objset,
				    dp->dp_bptree_obj,
				    "Pool dataset frees");
			}
			dump_dtl(spa->spa_root_vdev, 0);
		}

		/*
		 * UINT64_MAX marks features with no global count; a few
		 * globally-counted features start from zero.  (The loop
		 * variable here shadows the outer f.)
		 */
		for (spa_feature_t f = 0; f < SPA_FEATURES; f++)
			global_feature_count[f] = UINT64_MAX;
		global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0;
		global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0;
		global_feature_count[SPA_FEATURE_LIVELIST] = 0;

		(void) dmu_objset_find(spa_name(spa), dump_one_objset,
		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);

		if (rc == 0 && !dump_opt['L'])
			rc = dump_mos_leaks(spa);

		/* Compare counted feature consumers against refcounts. */
		for (f = 0; f < SPA_FEATURES; f++) {
			uint64_t refcount;

			uint64_t *arr;
			if (!(spa_feature_table[f].fi_flags &
			    ZFEATURE_FLAG_PER_DATASET)) {
				if (global_feature_count[f] == UINT64_MAX)
					continue;
				if (!spa_feature_is_enabled(spa, f)) {
					ASSERT0(global_feature_count[f]);
					continue;
				}
				arr = global_feature_count;
			} else {
				if (!spa_feature_is_enabled(spa, f)) {
					ASSERT0(dataset_feature_count[f]);
					continue;
				}
				arr = dataset_feature_count;
			}
			if (feature_get_refcount(spa, &spa_feature_table[f],
			    &refcount) == ENOTSUP)
				continue;
			if (arr[f] != refcount) {
				(void) printf("%s feature refcount mismatch: "
				    "%lld consumers != %lld refcount\n",
				    spa_feature_table[f].fi_uname,
				    (longlong_t)arr[f], (longlong_t)refcount);
				rc = 2;
			} else {
				(void) printf("Verified %s feature refcount "
				    "of %llu is correct\n",
				    spa_feature_table[f].fi_uname,
				    (longlong_t)refcount);
			}
		}

		if (rc == 0)
			rc = verify_device_removal_feature_counts(spa);
	}

	if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
		rc = dump_block_stats(spa);

	if (rc == 0)
		rc = verify_spacemap_refcounts(spa);

	if (dump_opt['s'])
		show_pool_stats(spa);

	if (dump_opt['h'])
		dump_history(spa);

	if (rc == 0)
		rc = verify_checkpoint(spa);

	/* Any verification failure terminates zdb with that status. */
	if (rc != 0) {
		dump_debug_buffer();
		exit(rc);
	}
}
8133
/*
 * Modifier flags for raw block reads/dumps.  NOTE(review): these appear
 * to be set from single-character modifiers via flagbits[] (presumably
 * the ':flags' suffix of the -R argument) — confirm against the option
 * parser, which is outside this chunk.
 */
#define ZDB_FLAG_CHECKSUM 0x0001
#define ZDB_FLAG_DECOMPRESS 0x0002
#define ZDB_FLAG_BSWAP 0x0004
#define ZDB_FLAG_GBH 0x0008
#define ZDB_FLAG_INDIRECT 0x0010
#define ZDB_FLAG_RAW 0x0020
#define ZDB_FLAG_PRINT_BLKPTR 0x0040
#define ZDB_FLAG_VERBOSE 0x0080

/* Maps a modifier character to its ZDB_FLAG_* bit. */
static int flagbits[256];
static char flagbitstr[16];
8145
/*
 * Print a block pointer in its canonical string form, optionally
 * byteswapping it first when ZDB_FLAG_BSWAP is set.
 */
static void
zdb_print_blkptr(const blkptr_t *bp, int flags)
{
	char blkbuf[BP_SPRINTF_LEN];

	/* The swap is done in place: const is deliberately cast away. */
	if (flags & ZDB_FLAG_BSWAP)
		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));

	snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
	(void) printf("%s\n", blkbuf);
}
8157
8158 static void
8159 zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
8160 {
8161 int i;
8162
8163 for (i = 0; i < nbps; i++)
8164 zdb_print_blkptr(&bp[i], flags);
8165 }
8166
/*
 * Print the block pointers embedded in a gang block header.
 */
static void
zdb_dump_gbh(void *buf, int flags)
{
	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
}
8172
/*
 * Write the raw block contents to stdout, byteswapped if requested.
 * NOTE(review): assumes write(2) transfers the whole buffer in one
 * call; a short write would trip the VERIFY rather than be retried.
 */
static void
zdb_dump_block_raw(void *buf, uint64_t size, int flags)
{
	if (flags & ZDB_FLAG_BSWAP)
		byteswap_uint64_array(buf, size);
	VERIFY(write(fileno(stdout), buf, size) == size);
}
8180
8181 static void
8182 zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
8183 {
8184 uint64_t *d = (uint64_t *)buf;
8185 unsigned nwords = size / sizeof (uint64_t);
8186 int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
8187 unsigned i, j;
8188 const char *hdr;
8189 char *c;
8190
8191
8192 if (do_bswap)
8193 hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8";
8194 else
8195 hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f";
8196
8197 (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr);
8198
8199 #ifdef _LITTLE_ENDIAN
8200 /* correct the endianness */
8201 do_bswap = !do_bswap;
8202 #endif
8203 for (i = 0; i < nwords; i += 2) {
8204 (void) printf("%06llx: %016llx %016llx ",
8205 (u_longlong_t)(i * sizeof (uint64_t)),
8206 (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
8207 (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
8208
8209 c = (char *)&d[i];
8210 for (j = 0; j < 2 * sizeof (uint64_t); j++)
8211 (void) printf("%c", isprint(c[j]) ? c[j] : '.');
8212 (void) printf("\n");
8213 }
8214 }
8215
/*
 * There are two acceptable formats:
 *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
 *	child[.child]*    - For example: 0.1.1
 *
 * The second form can be used to specify arbitrary vdevs anywhere
 * in the hierarchy.  For example, in a pool with a mirror of
 * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
 */
static vdev_t *
zdb_vdev_lookup(vdev_t *vdev, const char *path)
{
	char *s, *p, *q;
	unsigned i;

	if (vdev == NULL)
		return (NULL);

	/* First, assume the x.x.x.x format */
	i = strtoul(path, &s, 10);
	if (s == path || (s && *s != '.' && *s != '\0'))
		goto name;
	if (i >= vdev->vdev_children)
		return (NULL);

	/* Descend one level and recurse on the remainder of the path. */
	vdev = vdev->vdev_child[i];
	if (s && *s == '\0')
		return (vdev);
	return (zdb_vdev_lookup(vdev, s+1));

name:
	/* Fall back to matching by device path or its basename. */
	for (i = 0; i < vdev->vdev_children; i++) {
		vdev_t *vc = vdev->vdev_child[i];

		/* Interior vdevs have no path; search their children. */
		if (vc->vdev_path == NULL) {
			vc = zdb_vdev_lookup(vc, path);
			if (vc == NULL)
				continue;
			else
				return (vc);
		}

		/*
		 * q points at the last two characters (the "s0" slice
		 * suffix check below).  NOTE(review): for a vdev_path
		 * shorter than 2 characters this indexes before the
		 * start of the string — confirm paths are always longer.
		 */
		p = strrchr(vc->vdev_path, '/');
		p = p ? p + 1 : vc->vdev_path;
		q = &vc->vdev_path[strlen(vc->vdev_path) - 2];

		if (strcmp(vc->vdev_path, path) == 0)
			return (vc);
		if (strcmp(p, path) == 0)
			return (vc);
		if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
			return (vc);
	}

	return (NULL);
}
8272
8273 static int
8274 name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr)
8275 {
8276 dsl_dataset_t *ds;
8277
8278 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
8279 int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id,
8280 NULL, &ds);
8281 if (error != 0) {
8282 (void) fprintf(stderr, "failed to hold objset %llu: %s\n",
8283 (u_longlong_t)objset_id, strerror(error));
8284 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
8285 return (error);
8286 }
8287 dsl_dataset_name(ds, outstr);
8288 dsl_dataset_rele(ds, NULL);
8289 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
8290 return (0);
8291 }
8292
8293 static boolean_t
8294 zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize)
8295 {
8296 char *s0, *s1, *tmp = NULL;
8297
8298 if (sizes == NULL)
8299 return (B_FALSE);
8300
8301 s0 = strtok_r(sizes, "/", &tmp);
8302 if (s0 == NULL)
8303 return (B_FALSE);
8304 s1 = strtok_r(NULL, "/", &tmp);
8305 *lsize = strtoull(s0, NULL, 16);
8306 *psize = s1 ? strtoull(s1, NULL, 16) : *lsize;
8307 return (*lsize >= *psize && *psize > 0);
8308 }
8309
/* Bitmask with the bit for compression algorithm `alg` set. */
#define ZIO_COMPRESS_MASK(alg) (1ULL << (ZIO_COMPRESS_##alg))
8311
/*
 * Brute-force decompression of a physical block read from disk.  The
 * on-disk compression algorithm is unknown, so every candidate
 * decompressor is tried at every candidate logical size (in
 * SPA_MINBLOCKSIZE steps up to maxlsize) until one inflates cleanly.
 * Returns B_TRUE when all candidate sizes were exhausted without a
 * successful decompression ("exceeded"), B_FALSE on success.
 */
static boolean_t
zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize,
    uint64_t psize, int flags)
{
	(void) buf;
	boolean_t exceeded = B_FALSE;
	/*
	 * We don't know how the data was compressed, so just try
	 * every decompress function at every inflated blocksize.
	 */
	void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
	int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 };
	int *cfuncp = cfuncs;
	uint64_t maxlsize = SPA_MAXBLOCKSIZE;
	/*
	 * mask collects the algorithm IDs to exclude from the candidate
	 * list: pseudo-values (ON/OFF/INHERIT/EMPTY) and, if the user
	 * exported ZDB_NO_ZLE, the ZLE algorithm (whose trivial format
	 * can produce false-positive matches).
	 */
	uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) |
	    ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) |
	    (getenv("ZDB_NO_ZLE") ? ZIO_COMPRESS_MASK(ZLE) : 0);
	/* Try the common algorithms first, then everything not masked. */
	*cfuncp++ = ZIO_COMPRESS_LZ4;
	*cfuncp++ = ZIO_COMPRESS_LZJB;
	mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB);
	for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++)
		if (((1ULL << c) & mask) == 0)
			*cfuncp++ = c;

	/*
	 * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this
	 * could take a while and we should let the user know
	 * we are not stuck. On the other hand, printing progress
	 * info gets old after a while. User can specify 'v' flag
	 * to see the progression.
	 */
	if (lsize == psize)
		lsize += SPA_MINBLOCKSIZE;
	else
		maxlsize = lsize;
	for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) {
		for (cfuncp = cfuncs; *cfuncp; cfuncp++) {
			if (flags & ZDB_FLAG_VERBOSE) {
				(void) fprintf(stderr,
				    "Trying %05llx -> %05llx (%s)\n",
				    (u_longlong_t)psize,
				    (u_longlong_t)lsize,
				    zio_compress_table[*cfuncp].ci_name);
			}

			/*
			 * We randomize lbuf2, and decompress to both
			 * lbuf and lbuf2. This way, we will know if
			 * decompression fill exactly to lsize.
			 */
			VERIFY0(random_get_pseudo_bytes(lbuf2, lsize));

			if (zio_decompress_data(*cfuncp, pabd,
			    lbuf, psize, lsize, NULL) == 0 &&
			    zio_decompress_data(*cfuncp, pabd,
			    lbuf2, psize, lsize, NULL) == 0 &&
			    memcmp(lbuf, lbuf2, lsize) == 0)
				break;
		}
		/*
		 * cfuncs is zero-terminated, so a nonzero *cfuncp here
		 * means the inner loop broke out on a successful match.
		 */
		if (*cfuncp != 0)
			break;
	}
	umem_free(lbuf2, SPA_MAXBLOCKSIZE);

	/* lsize walked past maxlsize: no algorithm/size combination worked */
	if (lsize > maxlsize) {
		exceeded = B_TRUE;
	}
	if (*cfuncp == ZIO_COMPRESS_ZLE) {
		printf("\nZLE decompression was selected. If you "
		    "suspect the results are wrong,\ntry avoiding ZLE "
		    "by setting and exporting ZDB_NO_ZLE=\"true\"\n");
	}

	return (exceeded);
}
8388
8389 /*
8390 * Read a block from a pool and print it out. The syntax of the
8391 * block descriptor is:
8392 *
8393 * pool:vdev_specifier:offset:[lsize/]psize[:flags]
8394 *
8395 * pool - The name of the pool you wish to read from
8396 * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
8397 * offset - offset, in hex, in bytes
8398 * size - Amount of data to read, in hex, in bytes
8399 * flags - A string of characters specifying options
8400 * b: Decode a blkptr at given offset within block
8401 * c: Calculate and display checksums
8402 * d: Decompress data before dumping
8403 * e: Byteswap data before dumping
8404 * g: Display data as a gang block header
8405 * i: Display as an indirect block
8406 * r: Dump raw data to stdout
8407 * v: Verbose
8408 *
8409 */
static void
zdb_read_block(char *thing, spa_t *spa)
{
	blkptr_t blk, *bp = &blk;
	dva_t *dva = bp->blk_dva;
	int flags = 0;
	uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0;
	zio_t *zio;
	vdev_t *vd;
	abd_t *pabd;
	void *lbuf, *buf;
	char *s, *p, *dup, *flagstr, *sizes, *tmp = NULL;
	const char *vdev, *errmsg = NULL;
	int i, error;
	boolean_t borrowed = B_FALSE, found = B_FALSE;

	/*
	 * Split "pool:vdev:offset:[lsize/]psize[:flags]" on ':'.  The
	 * pool component was consumed by the caller, so the first token
	 * here is the vdev specifier.  Missing fields fall back to ""
	 * / 0, which fail validation below.
	 */
	dup = strdup(thing);
	s = strtok_r(dup, ":", &tmp);
	vdev = s ?: "";
	s = strtok_r(NULL, ":", &tmp);
	offset = strtoull(s ? s : "", NULL, 16);
	sizes = strtok_r(NULL, ":", &tmp);
	s = strtok_r(NULL, ":", &tmp);
	flagstr = strdup(s ?: "");

	/* Sizes and offset must be sector-aligned hex values. */
	if (!zdb_parse_block_sizes(sizes, &lsize, &psize))
		errmsg = "invalid size(s)";
	if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE))
		errmsg = "size must be a multiple of sector size";
	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
		errmsg = "offset must be a multiple of sector size";
	if (errmsg) {
		(void) printf("Invalid block specifier: %s - %s\n",
		    thing, errmsg);
		goto done;
	}

	/*
	 * Decode the single-character flags.  A 'b' flag may carry a hex
	 * offset argument (a blkptr offset within the block) directly
	 * after it.
	 *
	 * NOTE(review): the outer loop tokenizes flagstr on ':' into s,
	 * but the inner loop indexes flagstr (the first token after
	 * strtok_r NUL-terminates it) rather than s — so only the first
	 * ':'-separated flag group appears to be scanned per iteration.
	 * Confirm intent against upstream before relying on multi-group
	 * flag strings.
	 */
	tmp = NULL;
	for (s = strtok_r(flagstr, ":", &tmp);
	    s != NULL;
	    s = strtok_r(NULL, ":", &tmp)) {
		for (i = 0; i < strlen(flagstr); i++) {
			int bit = flagbits[(uchar_t)flagstr[i]];

			if (bit == 0) {
				(void) printf("***Ignoring flag: %c\n",
				    (uchar_t)flagstr[i]);
				continue;
			}
			found = B_TRUE;
			flags |= bit;

			p = &flagstr[i + 1];
			if (*p != ':' && *p != '\0') {
				int j = 0, nextbit = flagbits[(uchar_t)*p];
				char *end, offstr[8] = { 0 };
				if ((bit == ZDB_FLAG_PRINT_BLKPTR) &&
				    (nextbit == 0)) {
					/* look ahead to isolate the offset */
					while (nextbit == 0 &&
					    strchr(flagbitstr, *p) == NULL) {
						offstr[j] = *p;
						j++;
						if (i + j > strlen(flagstr))
							break;
						p++;
						nextbit = flagbits[(uchar_t)*p];
					}
					blkptr_offset = strtoull(offstr, &end,
					    16);
					i += j;
				} else if (nextbit == 0) {
					(void) printf("***Ignoring flag arg:"
					    " '%c'\n", (uchar_t)*p);
				}
			}
		}
	}
	/* The 'b' argument must point at a blkptr_t boundary. */
	if (blkptr_offset % sizeof (blkptr_t)) {
		printf("Block pointer offset 0x%llx "
		    "must be divisible by 0x%x\n",
		    (longlong_t)blkptr_offset, (int)sizeof (blkptr_t));
		goto done;
	}
	if (found == B_FALSE && strlen(flagstr) > 0) {
		printf("Invalid flag arg: '%s'\n", flagstr);
		goto done;
	}

	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
	if (vd == NULL) {
		(void) printf("***Invalid vdev: %s\n", vdev);
		goto done;
	} else {
		if (vd->vdev_path)
			(void) fprintf(stderr, "Found vdev: %s\n",
			    vd->vdev_path);
		else
			(void) fprintf(stderr, "Found vdev type: %s\n",
			    vd->vdev_ops->vdev_op_type);
	}

	pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);

	/*
	 * Synthesize a blkptr describing the requested DVA so the normal
	 * zio read path can be used.  Checksum and compression are OFF:
	 * we read the raw physical bytes and post-process ourselves.
	 */
	BP_ZERO(bp);

	DVA_SET_VDEV(&dva[0], vd->vdev_id);
	DVA_SET_OFFSET(&dva[0], offset);
	DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));

	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);

	BP_SET_LSIZE(bp, lsize);
	BP_SET_PSIZE(bp, psize);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
	BP_SET_TYPE(bp, DMU_OT_NONE);
	BP_SET_LEVEL(bp, 0);
	BP_SET_DEDUP(bp, 0);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	zio = zio_root(spa, NULL, NULL, 0);

	if (vd == vd->vdev_top) {
		/*
		 * Treat this as a normal block read.
		 */
		zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
		    ZIO_PRIORITY_SYNC_READ,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
	} else {
		/*
		 * Treat this as a vdev child I/O.
		 */
		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
		    psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
		    ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
		    ZIO_FLAG_OPTIONAL, NULL, NULL));
	}

	error = zio_wait(zio);
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (error) {
		(void) printf("Read of %s failed, error: %d\n", thing, error);
		goto out;
	}

	uint64_t orig_lsize = lsize;
	buf = lbuf;
	if (flags & ZDB_FLAG_DECOMPRESS) {
		/* 'd' flag: brute-force decompress into lbuf */
		boolean_t failed = zdb_decompress_block(pabd, buf, lbuf,
		    lsize, psize, flags);
		if (failed) {
			(void) printf("Decompress of %s failed\n", thing);
			goto out;
		}
	} else {
		/* borrow a linear view of the ABD; returned before exit */
		buf = abd_borrow_buf_copy(pabd, lsize);
		borrowed = B_TRUE;
	}
	/*
	 * Try to detect invalid block pointer. If invalid, try
	 * decompressing.
	 */
	if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) &&
	    !(flags & ZDB_FLAG_DECOMPRESS)) {
		const blkptr_t *b = (const blkptr_t *)(void *)
		    ((uintptr_t)buf + (uintptr_t)blkptr_offset);
		if (zfs_blkptr_verify(spa, b,
		    BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) {
			abd_return_buf_copy(pabd, buf, lsize);
			borrowed = B_FALSE;
			buf = lbuf;
			boolean_t failed = zdb_decompress_block(pabd, buf,
			    lbuf, lsize, psize, flags);
			b = (const blkptr_t *)(void *)
			    ((uintptr_t)buf + (uintptr_t)blkptr_offset);
			if (failed || zfs_blkptr_verify(spa, b,
			    BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) {
				printf("invalid block pointer at this DVA\n");
				goto out;
			}
		}
	}

	/* Dispatch on the display flags (first match wins). */
	if (flags & ZDB_FLAG_PRINT_BLKPTR)
		zdb_print_blkptr((blkptr_t *)(void *)
		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
	else if (flags & ZDB_FLAG_RAW)
		zdb_dump_block_raw(buf, lsize, flags);
	else if (flags & ZDB_FLAG_INDIRECT)
		zdb_dump_indirect((blkptr_t *)buf,
		    orig_lsize / sizeof (blkptr_t), flags);
	else if (flags & ZDB_FLAG_GBH)
		zdb_dump_gbh(buf, flags);
	else
		zdb_dump_block(thing, buf, lsize, flags);

	/*
	 * If :c was specified, iterate through the checksum table to
	 * calculate and display each checksum for our specified
	 * DVA and length.
	 */
	if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) &&
	    !(flags & ZDB_FLAG_GBH)) {
		zio_t *czio;
		(void) printf("\n");
		for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL;
		    ck < ZIO_CHECKSUM_FUNCTIONS; ck++) {

			/* skip checksums that need bp context */
			if ((zio_checksum_table[ck].ci_flags &
			    ZCHECKSUM_FLAG_EMBEDDED) ||
			    ck == ZIO_CHECKSUM_NOPARITY) {
				continue;
			}
			BP_SET_CHECKSUM(bp, ck);
			spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
			czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
			czio->io_bp = bp;

			/* re-read the block under the current checksum */
			if (vd == vd->vdev_top) {
				zio_nowait(zio_read(czio, spa, bp, pabd, psize,
				    NULL, NULL,
				    ZIO_PRIORITY_SYNC_READ,
				    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
				    ZIO_FLAG_DONT_RETRY, NULL));
			} else {
				zio_nowait(zio_vdev_child_io(czio, bp, vd,
				    offset, pabd, psize, ZIO_TYPE_READ,
				    ZIO_PRIORITY_SYNC_READ,
				    ZIO_FLAG_DONT_CACHE |
				    ZIO_FLAG_DONT_PROPAGATE |
				    ZIO_FLAG_DONT_RETRY |
				    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
				    ZIO_FLAG_SPECULATIVE |
				    ZIO_FLAG_OPTIONAL, NULL, NULL));
			}
			/* ECKSUM is expected: checksum mismatch still reads */
			error = zio_wait(czio);
			if (error == 0 || error == ECKSUM) {
				zio_t *ck_zio = zio_root(spa, NULL, NULL, 0);
				ck_zio->io_offset =
				    DVA_GET_OFFSET(&bp->blk_dva[0]);
				ck_zio->io_bp = bp;
				zio_checksum_compute(ck_zio, ck, pabd, lsize);
				printf(
				    "%12s\t"
				    "cksum=%016llx:%016llx:%016llx:%016llx\n",
				    zio_checksum_table[ck].ci_name,
				    (u_longlong_t)bp->blk_cksum.zc_word[0],
				    (u_longlong_t)bp->blk_cksum.zc_word[1],
				    (u_longlong_t)bp->blk_cksum.zc_word[2],
				    (u_longlong_t)bp->blk_cksum.zc_word[3]);
				zio_wait(ck_zio);
			} else {
				printf("error %d reading block\n", error);
			}
			spa_config_exit(spa, SCL_STATE, FTAG);
		}
	}

	if (borrowed)
		abd_return_buf_copy(pabd, buf, lsize);

out:
	abd_free(pabd);
	umem_free(lbuf, SPA_MAXBLOCKSIZE);
done:
	free(flagstr);
	free(dup);
}
8685
8686 static void
8687 zdb_embedded_block(char *thing)
8688 {
8689 blkptr_t bp = {{{{0}}}};
8690 unsigned long long *words = (void *)&bp;
8691 char *buf;
8692 int err;
8693
8694 err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
8695 "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
8696 words + 0, words + 1, words + 2, words + 3,
8697 words + 4, words + 5, words + 6, words + 7,
8698 words + 8, words + 9, words + 10, words + 11,
8699 words + 12, words + 13, words + 14, words + 15);
8700 if (err != 16) {
8701 (void) fprintf(stderr, "invalid input format\n");
8702 exit(1);
8703 }
8704 ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
8705 buf = malloc(SPA_MAXBLOCKSIZE);
8706 if (buf == NULL) {
8707 (void) fprintf(stderr, "out of memory\n");
8708 exit(1);
8709 }
8710 err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
8711 if (err != 0) {
8712 (void) fprintf(stderr, "decode failed: %u\n", err);
8713 exit(1);
8714 }
8715 zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
8716 free(buf);
8717 }
8718
8719 /* check for valid hex or decimal numeric string */
8720 static boolean_t
8721 zdb_numeric(char *str)
8722 {
8723 int i = 0;
8724
8725 if (strlen(str) == 0)
8726 return (B_FALSE);
8727 if (strncmp(str, "0x", 2) == 0 || strncmp(str, "0X", 2) == 0)
8728 i = 2;
8729 for (; i < strlen(str); i++) {
8730 if (!isxdigit(str[i]))
8731 return (B_FALSE);
8732 }
8733 return (B_TRUE);
8734 }
8735
/*
 * zdb entry point: parse options, tune libzpool for offline inspection,
 * open the target (pool, dataset, objset ID, bookmark, or block
 * specifier), then dispatch to the requested dump routine(s).
 */
int
main(int argc, char **argv)
{
	int c;
	spa_t *spa = NULL;
	objset_t *os = NULL;
	int dump_all = 1;
	int verbose = 0;
	int error = 0;
	char **searchdirs = NULL;
	int nsearch = 0;
	char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN];
	nvlist_t *policy = NULL;
	uint64_t max_txg = UINT64_MAX;
	int64_t objset_id = -1;
	uint64_t object;
	int flags = ZFS_IMPORT_MISSING_LOG;
	int rewind = ZPOOL_NEVER_REWIND;
	char *spa_config_path_env, *objset_str;
	boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE;
	nvlist_t *cfg = NULL;

	dprintf_setup(&argc, argv);

	/*
	 * If there is an environment variable SPA_CONFIG_PATH it overrides
	 * default spa_config_path setting. If -U flag is specified it will
	 * override this environment variable settings once again.
	 */
	spa_config_path_env = getenv("SPA_CONFIG_PATH");
	if (spa_config_path_env != NULL)
		spa_config_path = spa_config_path_env;

	/*
	 * For performance reasons, we set this tunable down. We do so before
	 * the arg parsing section so that the user can override this value if
	 * they choose.
	 */
	zfs_btree_verify_intensity = 3;

	/* Long-option aliases; each maps to the short option in column 4. */
	struct option long_options[] = {
		{"ignore-assertions",	no_argument,		NULL, 'A'},
		{"block-stats",		no_argument,		NULL, 'b'},
		{"backup",		no_argument,		NULL, 'B'},
		{"checksum",		no_argument,		NULL, 'c'},
		{"config",		no_argument,		NULL, 'C'},
		{"datasets",		no_argument,		NULL, 'd'},
		{"dedup-stats",		no_argument,		NULL, 'D'},
		{"exported",		no_argument,		NULL, 'e'},
		{"embedded-block-pointer",	no_argument,	NULL, 'E'},
		{"automatic-rewind",	no_argument,		NULL, 'F'},
		{"dump-debug-msg",	no_argument,		NULL, 'G'},
		{"history",		no_argument,		NULL, 'h'},
		{"intent-logs",		no_argument,		NULL, 'i'},
		{"inflight",		required_argument,	NULL, 'I'},
		{"checkpointed-state",	no_argument,		NULL, 'k'},
		{"key",			required_argument,	NULL, 'K'},
		{"label",		no_argument,		NULL, 'l'},
		{"disable-leak-tracking",	no_argument,	NULL, 'L'},
		{"metaslabs",		no_argument,		NULL, 'm'},
		{"metaslab-groups",	no_argument,		NULL, 'M'},
		{"numeric",		no_argument,		NULL, 'N'},
		{"option",		required_argument,	NULL, 'o'},
		{"object-lookups",	no_argument,		NULL, 'O'},
		{"path",		required_argument,	NULL, 'p'},
		{"parseable",		no_argument,		NULL, 'P'},
		{"skip-label",		no_argument,		NULL, 'q'},
		{"copy-object",		no_argument,		NULL, 'r'},
		{"read-block",		no_argument,		NULL, 'R'},
		{"io-stats",		no_argument,		NULL, 's'},
		{"simulate-dedup",	no_argument,		NULL, 'S'},
		{"txg",			required_argument,	NULL, 't'},
		{"uberblock",		no_argument,		NULL, 'u'},
		{"cachefile",		required_argument,	NULL, 'U'},
		{"verbose",		no_argument,		NULL, 'v'},
		{"verbatim",		no_argument,		NULL, 'V'},
		{"dump-blocks",		required_argument,	NULL, 'x'},
		{"extreme-rewind",	no_argument,		NULL, 'X'},
		{"all-reconstruction",	no_argument,		NULL, 'Y'},
		{"livelist",		no_argument,		NULL, 'y'},
		{"zstd-headers",	no_argument,		NULL, 'Z'},
		{0, 0, 0, 0}
	};

	while ((c = getopt_long(argc, argv,
	    "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ",
	    long_options, NULL)) != -1) {
		switch (c) {
		/* These options select a specific dump, disabling dump_all. */
		case 'b':
		case 'B':
		case 'c':
		case 'C':
		case 'd':
		case 'D':
		case 'E':
		case 'G':
		case 'h':
		case 'i':
		case 'l':
		case 'm':
		case 'M':
		case 'N':
		case 'O':
		case 'r':
		case 'R':
		case 's':
		case 'S':
		case 'u':
		case 'y':
		case 'Z':
			dump_opt[c]++;
			dump_all = 0;
			break;
		/* Modifier options: counted but do not affect dump_all. */
		case 'A':
		case 'e':
		case 'F':
		case 'k':
		case 'L':
		case 'P':
		case 'q':
		case 'X':
			dump_opt[c]++;
			break;
		case 'Y':
			zfs_reconstruct_indirect_combinations_max = INT_MAX;
			zfs_deadman_enabled = 0;
			break;
		/* NB: Sort single match options below. */
		case 'I':
			max_inflight_bytes = strtoull(optarg, NULL, 0);
			if (max_inflight_bytes == 0) {
				(void) fprintf(stderr, "maximum number "
				    "of inflight bytes must be greater "
				    "than 0\n");
				usage();
			}
			break;
		case 'K':
			dump_opt[c]++;
			key_material = strdup(optarg);
			/* redact key material in process table */
			while (*optarg != '\0') { *optarg++ = '*'; }
			break;
		case 'o':
			error = set_global_var(optarg);
			if (error != 0)
				usage();
			break;
		case 'p':
			/* grow the searchdirs array by one slot per -p */
			if (searchdirs == NULL) {
				searchdirs = umem_alloc(sizeof (char *),
				    UMEM_NOFAIL);
			} else {
				char **tmp = umem_alloc((nsearch + 1) *
				    sizeof (char *), UMEM_NOFAIL);
				memcpy(tmp, searchdirs, nsearch *
				    sizeof (char *));
				umem_free(searchdirs,
				    nsearch * sizeof (char *));
				searchdirs = tmp;
			}
			searchdirs[nsearch++] = optarg;
			break;
		case 't':
			max_txg = strtoull(optarg, NULL, 0);
			if (max_txg < TXG_INITIAL) {
				(void) fprintf(stderr, "incorrect txg "
				    "specified: %s\n", optarg);
				usage();
			}
			break;
		case 'U':
			spa_config_path = optarg;
			if (spa_config_path[0] != '/') {
				(void) fprintf(stderr,
				    "cachefile must be an absolute path "
				    "(i.e. start with a slash)\n");
				usage();
			}
			break;
		case 'v':
			verbose++;
			break;
		case 'V':
			flags = ZFS_IMPORT_VERBATIM;
			break;
		case 'x':
			vn_dumpdir = optarg;
			break;
		default:
			usage();
			break;
		}
	}

	if (!dump_opt['e'] && searchdirs != NULL) {
		(void) fprintf(stderr, "-p option requires use of -e\n");
		usage();
	}
#if defined(_LP64)
	/*
	 * ZDB does not typically re-read blocks; therefore limit the ARC
	 * to 256 MB, which can be used entirely for metadata.
	 */
	zfs_arc_min = 2ULL << SPA_MAXBLOCKSHIFT;
	zfs_arc_max = 256 * 1024 * 1024;
#endif

	/*
	 * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
	 * "zdb -b" uses traversal prefetch which uses async reads.
	 * For good performance, let several of them be active at once.
	 */
	zfs_vdev_async_read_max_active = 10;

	/*
	 * Disable reference tracking for better performance.
	 */
	reference_tracking_enable = B_FALSE;

	/*
	 * Do not fail spa_load when spa_load_verify fails. This is needed
	 * to load non-idle pools.
	 */
	spa_load_verify_dryrun = B_TRUE;

	/*
	 * ZDB should have ability to read spacemaps.
	 */
	spa_mode_readable_spacemaps = B_TRUE;

	kernel_init(SPA_MODE_READ);

	if (dump_all)
		verbose = MAX(verbose, 1);

	/*
	 * With dump_all, enable every dump option except the ones in the
	 * exclusion string (modifiers and single-purpose modes), then add
	 * the verbosity level to each enabled option.
	 */
	for (c = 0; c < 256; c++) {
		if (dump_all && strchr("ABeEFkKlLNOPrRSXy", c) == NULL)
			dump_opt[c] = 1;
		if (dump_opt[c])
			dump_opt[c] += verbose;
	}

	/* -A (once) tolerates assertion failures; -AA keeps them fatal */
	libspl_set_assert_ok((dump_opt['A'] == 1) || (dump_opt['A'] > 2));
	zfs_recover = (dump_opt['A'] > 1);

	argc -= optind;
	argv += optind;
	if (argc < 2 && dump_opt['R'])
		usage();

	/* -E takes a raw blkptr string and needs no pool at all */
	if (dump_opt['E']) {
		if (argc != 1)
			usage();
		zdb_embedded_block(argv[0]);
		return (0);
	}

	if (argc < 1) {
		if (!dump_opt['e'] && dump_opt['C']) {
			dump_cachefile(spa_config_path);
			return (0);
		}
		usage();
	}

	if (dump_opt['l'])
		return (dump_label(argv[0]));

	if (dump_opt['O']) {
		if (argc != 2)
			usage();
		dump_opt['v'] = verbose + 3;
		return (dump_path(argv[0], argv[1], NULL));
	}
	if (dump_opt['r']) {
		target_is_spa = B_FALSE;
		if (argc != 3)
			usage();
		dump_opt['v'] = verbose;
		error = dump_path(argv[0], argv[1], &object);
		if (error != 0)
			fatal("internal error: %s", strerror(error));
	}

	if (dump_opt['X'] || dump_opt['F'])
		rewind = ZPOOL_DO_REWIND |
		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);

	/* -N implies -d */
	if (dump_opt['N'] && dump_opt['d'] == 0)
		dump_opt['d'] = dump_opt['N'];

	/* build the load policy (txg cap + rewind mode) passed to spa_open */
	if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
	    nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 ||
	    nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0)
		fatal("internal error: %s", strerror(ENOMEM));

	error = 0;
	target = argv[0];

	if (strpbrk(target, "/@") != NULL) {
		size_t targetlen;

		/* target names a dataset; isolate the pool component */
		target_pool = strdup(target);
		*strpbrk(target_pool, "/@") = '\0';

		target_is_spa = B_FALSE;
		targetlen = strlen(target);
		if (targetlen && target[targetlen - 1] == '/')
			target[targetlen - 1] = '\0';

		/*
		 * See if an objset ID was supplied (-d <pool>/<objset ID>).
		 * To disambiguate tank/100, consider the 100 as objsetID
		 * if -N was given, otherwise 100 is an objsetID iff
		 * tank/100 as a named dataset fails on lookup.
		 */
		objset_str = strchr(target, '/');
		if (objset_str && strlen(objset_str) > 1 &&
		    zdb_numeric(objset_str + 1)) {
			char *endptr;
			errno = 0;
			objset_str++;
			objset_id = strtoull(objset_str, &endptr, 0);
			/* dataset 0 is the same as opening the pool */
			if (errno == 0 && endptr != objset_str &&
			    objset_id != 0) {
				if (dump_opt['N'])
					dataset_lookup = B_TRUE;
			}
			/* normal dataset name not an objset ID */
			if (endptr == objset_str) {
				objset_id = -1;
			}
		} else if (objset_str && !zdb_numeric(objset_str + 1) &&
		    dump_opt['N']) {
			printf("Supply a numeric objset ID with -N\n");
			exit(1);
		}
	} else {
		target_pool = target;
	}

	if (dump_opt['e']) {
		/* -e: locate and import an exported pool from device paths */
		importargs_t args = { 0 };

		args.paths = nsearch;
		args.path = searchdirs;
		args.can_be_active = B_TRUE;

		libpc_handle_t lpch = {
			.lpc_lib_handle = NULL,
			.lpc_ops = &libzpool_config_ops,
			.lpc_printerr = B_TRUE
		};
		error = zpool_find_config(&lpch, target_pool, &cfg, &args);

		if (error == 0) {

			if (nvlist_add_nvlist(cfg,
			    ZPOOL_LOAD_POLICY, policy) != 0) {
				fatal("can't open '%s': %s",
				    target, strerror(ENOMEM));
			}

			if (dump_opt['C'] > 1) {
				(void) printf("\nConfiguration for import:\n");
				dump_nvlist(cfg, 8);
			}

			/*
			 * Disable the activity check to allow examination of
			 * active pools.
			 */
			error = spa_import(target_pool, cfg, NULL,
			    flags | ZFS_IMPORT_SKIP_MMP);
		}
	}

	if (searchdirs != NULL) {
		umem_free(searchdirs, nsearch * sizeof (char *));
		searchdirs = NULL;
	}

	/*
	 * import_checkpointed_state makes the assumption that the
	 * target pool that we pass it is already part of the spa
	 * namespace. Because of that we need to make sure to call
	 * it always after the -e option has been processed, which
	 * imports the pool to the namespace if it's not in the
	 * cachefile.
	 */
	char *checkpoint_pool = NULL;
	char *checkpoint_target = NULL;
	if (dump_opt['k']) {
		checkpoint_pool = import_checkpointed_state(target, cfg,
		    &checkpoint_target);

		if (checkpoint_target != NULL)
			target = checkpoint_target;
	}

	if (cfg != NULL) {
		nvlist_free(cfg);
		cfg = NULL;
	}

	if (target_pool != target)
		free(target_pool);

	if (error == 0) {
		if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
			/* open the checkpointed state imported above */
			ASSERT(checkpoint_pool != NULL);
			ASSERT(checkpoint_target == NULL);

			error = spa_open(checkpoint_pool, &spa, FTAG);
			if (error != 0) {
				fatal("Tried to open pool \"%s\" but "
				    "spa_open() failed with error %d\n",
				    checkpoint_pool, error);
			}

		} else if (target_is_spa || dump_opt['R'] || dump_opt['B'] ||
		    objset_id == 0) {
			zdb_set_skip_mmp(target);
			error = spa_open_rewind(target, &spa, FTAG, policy,
			    NULL);
			if (error) {
				/*
				 * If we're missing the log device then
				 * try opening the pool after clearing the
				 * log state.
				 */
				mutex_enter(&spa_namespace_lock);
				if ((spa = spa_lookup(target)) != NULL &&
				    spa->spa_log_state == SPA_LOG_MISSING) {
					spa->spa_log_state = SPA_LOG_CLEAR;
					error = 0;
				}
				mutex_exit(&spa_namespace_lock);

				if (!error) {
					error = spa_open_rewind(target, &spa,
					    FTAG, policy, NULL);
				}
			}
		} else if (strpbrk(target, "#") != NULL) {
			/* '#' in the target means it names a bookmark */
			dsl_pool_t *dp;
			error = dsl_pool_hold(target, FTAG, &dp);
			if (error != 0) {
				fatal("can't dump '%s': %s", target,
				    strerror(error));
			}
			error = dump_bookmark(dp, target, B_TRUE, verbose > 1);
			dsl_pool_rele(dp, FTAG);
			if (error != 0) {
				fatal("can't dump '%s': %s", target,
				    strerror(error));
			}
			return (error);
		} else {
			target_pool = strdup(target);
			if (strpbrk(target, "/@") != NULL)
				*strpbrk(target_pool, "/@") = '\0';

			zdb_set_skip_mmp(target);
			/*
			 * If -N was supplied, the user has indicated that
			 * zdb -d <pool>/<objsetID> is in effect. Otherwise
			 * we first assume that the dataset string is the
			 * dataset name. If dmu_objset_hold fails with the
			 * dataset string, and we have an objset_id, retry the
			 * lookup with the objsetID.
			 */
			boolean_t retry = B_TRUE;
retry_lookup:
			if (dataset_lookup == B_TRUE) {
				/*
				 * Use the supplied id to get the name
				 * for open_objset.
				 */
				error = spa_open(target_pool, &spa, FTAG);
				if (error == 0) {
					error = name_from_objset_id(spa,
					    objset_id, dsname);
					spa_close(spa, FTAG);
					if (error == 0)
						target = dsname;
				}
			}
			if (error == 0) {
				if (objset_id > 0 && retry) {
					int err = dmu_objset_hold(target, FTAG,
					    &os);
					if (err) {
						dataset_lookup = B_TRUE;
						retry = B_FALSE;
						goto retry_lookup;
					} else {
						dmu_objset_rele(os, FTAG);
					}
				}
				error = open_objset(target, FTAG, &os);
			}
			if (error == 0)
				spa = dmu_objset_spa(os);
			free(target_pool);
		}
	}
	nvlist_free(policy);

	if (error)
		fatal("can't open '%s': %s", target, strerror(error));

	/*
	 * Set the pool failure mode to panic in order to prevent the pool
	 * from suspending. A suspended I/O will have no way to resume and
	 * can prevent the zdb(8) command from terminating as expected.
	 */
	if (spa != NULL)
		spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;

	argv++;
	argc--;
	if (dump_opt['r']) {
		error = zdb_copy_object(os, object, argv[1]);
	} else if (!dump_opt['R']) {
		/* object-range type flags consumed by parse_object_range() */
		flagbits['d'] = ZOR_FLAG_DIRECTORY;
		flagbits['f'] = ZOR_FLAG_PLAIN_FILE;
		flagbits['m'] = ZOR_FLAG_SPACE_MAP;
		flagbits['z'] = ZOR_FLAG_ZAP;
		flagbits['A'] = ZOR_FLAG_ALL_TYPES;

		if (argc > 0 && dump_opt['d']) {
			zopt_object_args = argc;
			zopt_object_ranges = calloc(zopt_object_args,
			    sizeof (zopt_object_range_t));
			for (unsigned i = 0; i < zopt_object_args; i++) {
				int err;
				const char *msg = NULL;

				err = parse_object_range(argv[i],
				    &zopt_object_ranges[i], &msg);
				if (err != 0)
					fatal("Bad object or range: '%s': %s\n",
					    argv[i], msg ?: "");
			}
		} else if (argc > 0 && dump_opt['m']) {
			zopt_metaslab_args = argc;
			zopt_metaslab = calloc(zopt_metaslab_args,
			    sizeof (uint64_t));
			for (unsigned i = 0; i < zopt_metaslab_args; i++) {
				errno = 0;
				zopt_metaslab[i] = strtoull(argv[i], NULL, 0);
				if (zopt_metaslab[i] == 0 && errno != 0)
					fatal("bad number %s: %s", argv[i],
					    strerror(errno));
			}
		}
		if (dump_opt['B']) {
			dump_backup(target, objset_id,
			    argc > 0 ? argv[0] : NULL);
		} else if (os != NULL) {
			dump_objset(os);
		} else if (zopt_object_args > 0 && !dump_opt['m']) {
			dump_objset(spa->spa_meta_objset);
		} else {
			dump_zpool(spa);
		}
	} else {
		/* -R: per-block display flags for zdb_read_block() */
		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
		flagbits['c'] = ZDB_FLAG_CHECKSUM;
		flagbits['d'] = ZDB_FLAG_DECOMPRESS;
		flagbits['e'] = ZDB_FLAG_BSWAP;
		flagbits['g'] = ZDB_FLAG_GBH;
		flagbits['i'] = ZDB_FLAG_INDIRECT;
		flagbits['r'] = ZDB_FLAG_RAW;
		flagbits['v'] = ZDB_FLAG_VERBOSE;

		for (int i = 0; i < argc; i++)
			zdb_read_block(argv[i], spa);
	}

	if (dump_opt['k']) {
		free(checkpoint_pool);
		if (!target_is_spa)
			free(checkpoint_target);
	}

	if (os != NULL) {
		close_objset(os, FTAG);
	} else {
		spa_close(spa, FTAG);
	}

	fuid_table_destroy();

	dump_debug_buffer();

	kernel_fini();

	return (error);
}