/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Nexenta Systems, Inc.
 * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
 * Copyright (c) 2015, 2017, Intel Corporation.
 */

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <ctype.h>
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_sa.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_bookmark.h>
#include <sys/dbuf.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/dmu_send.h>
#include <sys/dmu_traverse.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/zfs_fuid.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
#include <sys/blkptr.h>
#include <sys/dsl_crypt.h>
#include <sys/dsl_scan.h>
#include <zfs_comutil.h>

#include <libnvpair.h>
#include <libzutil.h>

#include "zdb.h"

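/*
 * Display-name helpers: map a compression, checksum, or object-type index
 * to its table name, falling back to "UNKNOWN" when the index is out of
 * range.  ZDB_OT_TYPE also folds the new-style DMU_OTN_* ZAP and uint64
 * types onto their legacy display equivalents.
 */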
#define	ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ?	\
	zio_compress_table[(idx)].ci_name : "UNKNOWN")
#define	ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ?	\
	zio_checksum_table[(idx)].ci_name : "UNKNOWN")
#define	ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) :		\
	(idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ?	\
	DMU_OT_ZAP_OTHER :						\
	(idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \
	DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES)

static char *
zdb_ot_name(dmu_object_type_t type)
{
	if (type < DMU_OT_NUMTYPES)
		return (dmu_ot[type].ot_name);
	else if ((type & DMU_OT_NEWTYPE) &&
	    ((type & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS))
		return (dmu_ot_byteswap[type & DMU_OT_BYTESWAP_MASK].ob_name);
	else
		return ("UNKNOWN");
}

extern int reference_tracking_enable;
extern int zfs_recover;
extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
extern int zfs_vdev_async_read_max_active;
extern boolean_t spa_load_verify_dryrun;
extern int zfs_reconstruct_indirect_combinations_max;
extern int zfs_btree_verify_intensity;

static const char cmdname[] = "zdb";
uint8_t dump_opt[256];

typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);

uint64_t *zopt_object = NULL;
static unsigned zopt_objects = 0;
uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */
static int leaked_objects = 0;
static range_tree_t *mos_refd_objs;

static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
    boolean_t);
static void mos_obj_refd(uint64_t);
static void mos_obj_refd_multiple(uint64_t);

/*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}

const char *
_umem_logging_init(void)
{
	return ("fail,contents"); /* $UMEM_LOGGING setting */
}

static void
usage(void)
{
	(void) fprintf(stderr,
	    "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p <path> ...]] "
	    "[-I <inflight I/Os>]\n"
	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
	    "\t\t[<poolname> [<object> ...]]\n"
	    "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] <dataset>\n"
	    "\t\t[<object> ...]\n"
	    "\t%s [-v] <bookmark>\n"
	    "\t%s -C [-A] [-U <cache>]\n"
	    "\t%s -l [-Aqu] <device>\n"
	    "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
	    "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
	    "\t%s -O <dataset> <path>\n"
	    "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
	    "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
	    "\t%s -E [-A] word0:word1:...:word15\n"
	    "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
	    "<poolname>\n\n",
	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
	    cmdname, cmdname, cmdname);

	(void) fprintf(stderr, "    Dataset name must include at least one "
	    "separator character '/' or '@'\n");
	(void) fprintf(stderr, "    If dataset name is specified, only that "
	    "dataset is dumped\n");
	(void) fprintf(stderr, "    If object numbers are specified, only "
	    "those objects are dumped\n\n");
	(void) fprintf(stderr, "    Options to control amount of output:\n");
	(void) fprintf(stderr, "        -b block statistics\n");
	(void) fprintf(stderr, "        -c checksum all metadata (twice for "
	    "all data) blocks\n");
	(void) fprintf(stderr, "        -C config (or cachefile if alone)\n");
	(void) fprintf(stderr, "        -d dataset(s)\n");
	(void) fprintf(stderr, "        -D dedup statistics\n");
	(void) fprintf(stderr, "        -E decode and display block from an "
	    "embedded block pointer\n");
	(void) fprintf(stderr, "        -h pool history\n");
	(void) fprintf(stderr, "        -i intent logs\n");
	(void) fprintf(stderr, "        -l read label contents\n");
	(void) fprintf(stderr, "        -k examine the checkpointed state "
	    "of the pool\n");
	(void) fprintf(stderr, "        -L disable leak tracking (do not "
	    "load spacemaps)\n");
	(void) fprintf(stderr, "        -m metaslabs\n");
	(void) fprintf(stderr, "        -M metaslab groups\n");
	(void) fprintf(stderr, "        -O perform object lookups by path\n");
	(void) fprintf(stderr, "        -R read and display block from a "
	    "device\n");
	(void) fprintf(stderr, "        -s report stats on zdb's I/O\n");
	(void) fprintf(stderr, "        -S simulate dedup to measure effect\n");
	(void) fprintf(stderr, "        -v verbose (applies to all "
	    "others)\n\n");
	(void) fprintf(stderr, "    Below options are intended for use "
	    "with other options:\n");
	(void) fprintf(stderr, "        -A ignore assertions (-A), enable "
	    "panic recovery (-AA) or both (-AAA)\n");
	(void) fprintf(stderr, "        -e pool is exported/destroyed/"
	    "has altroot/not in a cachefile\n");
	(void) fprintf(stderr, "        -F attempt automatic rewind within "
	    "safe range of transaction groups\n");
	(void) fprintf(stderr, "        -G dump zfs_dbgmsg buffer before "
	    "exiting\n");
	(void) fprintf(stderr, "        -I <number of inflight I/Os> -- "
	    "specify the maximum number of\n           "
	    "checksumming I/Os [default is 200]\n");
	(void) fprintf(stderr, "        -o <variable>=<value> set global "
	    "variable to an unsigned 32-bit integer\n");
	(void) fprintf(stderr, "        -p <path> -- use one or more with "
	    "-e to specify path to vdev dir\n");
	(void) fprintf(stderr, "        -P print numbers in parseable form\n");
	(void) fprintf(stderr, "        -q don't print label contents\n");
	(void) fprintf(stderr, "        -t <txg> -- highest txg to use when "
	    "searching for uberblocks\n");
	(void) fprintf(stderr, "        -u uberblock\n");
	(void) fprintf(stderr, "        -U <cachefile_path> -- use alternate "
	    "cachefile\n");
	(void) fprintf(stderr, "        -V do verbatim import\n");
	(void) fprintf(stderr, "        -x <dumpdir> -- "
	    "dump all read blocks into specified directory\n");
	(void) fprintf(stderr, "        -X attempt extreme rewind (does not "
	    "work with dataset)\n");
	(void) fprintf(stderr, "        -Y attempt all reconstruction "
	    "combinations for split blocks\n");
	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
	    "to make only that option verbose\n");
	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
	exit(1);
}

static void
dump_debug_buffer(void)
{
	if (dump_opt['G']) {
		(void) printf("\n");
		(void) fflush(stdout);
		zfs_dbgmsg_print("zdb");
	}
}

/*
 * Called for usage errors that are discovered after a call to spa_open(),
 * dmu_bonus_hold(), or pool_match(). abort() is called for other errors.
 */

static void
fatal(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	(void) fprintf(stderr, "%s: ", cmdname);
	(void) vfprintf(stderr, fmt, ap);
	va_end(ap);
	(void) fprintf(stderr, "\n");

	dump_debug_buffer();

	exit(1);
}

/* ARGSUSED */
static void
dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
{
	nvlist_t *nv;
	size_t nvsize = *(uint64_t *)data;
	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);

	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));

	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);

	umem_free(packed, nvsize);

	dump_nvlist(nv, 8);

	nvlist_free(nv);
}

/* ARGSUSED */
static void
dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
{
	spa_history_phys_t *shp = data;

	if (shp == NULL)
		return;

	(void) printf("\t\tpool_create_len = %llu\n",
	    (u_longlong_t)shp->sh_pool_create_len);
	(void) printf("\t\tphys_max_off = %llu\n",
	    (u_longlong_t)shp->sh_phys_max_off);
	(void) printf("\t\tbof = %llu\n",
	    (u_longlong_t)shp->sh_bof);
	(void) printf("\t\teof = %llu\n",
	    (u_longlong_t)shp->sh_eof);
	(void) printf("\t\trecords_lost = %llu\n",
	    (u_longlong_t)shp->sh_records_lost);
}

static void
zdb_nicenum(uint64_t num, char *buf, size_t buflen)
{
	if (dump_opt['P'])
		(void) snprintf(buf, buflen, "%llu", (longlong_t)num);
	else
		nicenum(num, buf, buflen);
}

static const char histo_stars[] = "****************************************";
static const uint64_t histo_width = sizeof (histo_stars) - 1;

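/*
 * Print one row per bucket between the first and last non-empty buckets.
 * The bar of stars is scaled so that the largest bucket spans the full
 * histo_width columns; indexing histo_stars by the complement of the
 * scaled count below yields a bar proportional to histo[i] / max.
 */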
static void
dump_histogram(const uint64_t *histo, int size, int offset)
{
	int i;
	int minidx = size - 1;
	int maxidx = 0;
	uint64_t max = 0;

	for (i = 0; i < size; i++) {
		if (histo[i] > max)
			max = histo[i];
		if (histo[i] > 0 && i > maxidx)
			maxidx = i;
		if (histo[i] > 0 && i < minidx)
			minidx = i;
	}

	if (max < histo_width)
		max = histo_width;

	for (i = minidx; i <= maxidx; i++) {
		(void) printf("\t\t\t%3u: %6llu %s\n",
		    i + offset, (u_longlong_t)histo[i],
		    &histo_stars[(max - histo[i]) * histo_width / max]);
	}
}

static void
dump_zap_stats(objset_t *os, uint64_t object)
{
	int error;
	zap_stats_t zs;

	error = zap_get_stats(os, object, &zs);
	if (error)
		return;

	if (zs.zs_ptrtbl_len == 0) {
		ASSERT(zs.zs_num_blocks == 1);
		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
		    (u_longlong_t)zs.zs_blocksize,
		    (u_longlong_t)zs.zs_num_entries);
		return;
	}

	(void) printf("\tFat ZAP stats:\n");

	(void) printf("\t\tPointer table:\n");
	(void) printf("\t\t\t%llu elements\n",
	    (u_longlong_t)zs.zs_ptrtbl_len);
	(void) printf("\t\t\tzt_blk: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_blk);
	(void) printf("\t\t\tzt_numblks: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
	(void) printf("\t\t\tzt_shift: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_shift);
	(void) printf("\t\t\tzt_blks_copied: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_blks_copied);
	(void) printf("\t\t\tzt_nextblk: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_nextblk);

	(void) printf("\t\tZAP entries: %llu\n",
	    (u_longlong_t)zs.zs_num_entries);
	(void) printf("\t\tLeaf blocks: %llu\n",
	    (u_longlong_t)zs.zs_num_leafs);
	(void) printf("\t\tTotal blocks: %llu\n",
	    (u_longlong_t)zs.zs_num_blocks);
	(void) printf("\t\tzap_block_type: 0x%llx\n",
	    (u_longlong_t)zs.zs_block_type);
	(void) printf("\t\tzap_magic: 0x%llx\n",
	    (u_longlong_t)zs.zs_magic);
	(void) printf("\t\tzap_salt: 0x%llx\n",
	    (u_longlong_t)zs.zs_salt);

	(void) printf("\t\tLeafs with 2^n pointers:\n");
	dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBlocks with n*5 entries:\n");
	dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBlocks n/10 full:\n");
	dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tEntries with n chunks:\n");
	dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBuckets with n entries:\n");
	dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
}

/*ARGSUSED*/
static void
dump_none(objset_t *os, uint64_t object, void *data, size_t size)
{
}

/*ARGSUSED*/
static void
dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) printf("\tUNKNOWN OBJECT TYPE\n");
}

/*ARGSUSED*/
static void
dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
{
}

/*ARGSUSED*/
static void
dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
{
	uint64_t *arr;
	uint64_t oursize;
	if (dump_opt['d'] < 6)
		return;

	if (data == NULL) {
		dmu_object_info_t doi;

		VERIFY0(dmu_object_info(os, object, &doi));
		size = doi.doi_max_offset;
		/*
		 * We cap the size at 1 mebibyte here to prevent
		 * allocation failures and nigh-infinite printing if the
		 * object is extremely large.
		 */
		oursize = MIN(size, 1 << 20);
		arr = kmem_alloc(oursize, KM_SLEEP);

		int err = dmu_read(os, object, 0, oursize, arr, 0);
		if (err != 0) {
			(void) printf("got error %u from dmu_read\n", err);
			kmem_free(arr, oursize);
			return;
		}
	} else {
		/*
		 * Even though the allocation is already done in this code path,
		 * we still cap the size to prevent excessive printing.
		 */
		oursize = MIN(size, 1 << 20);
		arr = data;
	}

	if (size == 0) {
		(void) printf("\t\t[]\n");
		return;
	}

	(void) printf("\t\t[%0llx", (u_longlong_t)arr[0]);
	for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) {
		if (i % 4 != 0)
			(void) printf(", %0llx", (u_longlong_t)arr[i]);
		else
			(void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]);
	}
	if (oursize != size)
		(void) printf(", ... ");
	(void) printf("]\n");

	if (data == NULL)
		kmem_free(arr, oursize);
}

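/*
 * Print the ZAP's stats followed by each attribute.  Arrays of
 * single-byte integers are rendered as strings; wider integers are
 * printed element by element.
 */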
/*ARGSUSED*/
static void
dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
{
	zap_cursor_t zc;
	zap_attribute_t attr;
	void *prop;
	unsigned i;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = ", attr.za_name);
		if (attr.za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}
		prop = umem_zalloc(attr.za_num_integers *
		    attr.za_integer_length, UMEM_NOFAIL);
		(void) zap_lookup(os, object, attr.za_name,
		    attr.za_integer_length, attr.za_num_integers, prop);
		if (attr.za_integer_length == 1) {
			(void) printf("%s", (char *)prop);
		} else {
			for (i = 0; i < attr.za_num_integers; i++) {
				switch (attr.za_integer_length) {
				case 2:
					(void) printf("%u ",
					    ((uint16_t *)prop)[i]);
					break;
				case 4:
					(void) printf("%u ",
					    ((uint32_t *)prop)[i]);
					break;
				case 8:
					(void) printf("%lld ",
					    (u_longlong_t)((int64_t *)prop)[i]);
					break;
				}
			}
		}
		(void) printf("\n");
		umem_free(prop, attr.za_num_integers * attr.za_integer_length);
	}
	zap_cursor_fini(&zc);
}

static void
dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
{
	bpobj_phys_t *bpop = data;
	uint64_t i;
	char bytes[32], comp[32], uncomp[32];

	/* make sure the output won't get truncated */
	CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);

	if (bpop == NULL)
		return;

	zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
	zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
	zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));

	(void) printf("\t\tnum_blkptrs = %llu\n",
	    (u_longlong_t)bpop->bpo_num_blkptrs);
	(void) printf("\t\tbytes = %s\n", bytes);
	if (size >= BPOBJ_SIZE_V1) {
		(void) printf("\t\tcomp = %s\n", comp);
		(void) printf("\t\tuncomp = %s\n", uncomp);
	}
	if (size >= BPOBJ_SIZE_V2) {
		(void) printf("\t\tsubobjs = %llu\n",
		    (u_longlong_t)bpop->bpo_subobjs);
		(void) printf("\t\tnum_subobjs = %llu\n",
		    (u_longlong_t)bpop->bpo_num_subobjs);
	}
	if (size >= sizeof (*bpop)) {
		(void) printf("\t\tnum_freed = %llu\n",
		    (u_longlong_t)bpop->bpo_num_freed);
	}

	if (dump_opt['d'] < 5)
		return;

	for (i = 0; i < bpop->bpo_num_blkptrs; i++) {
		char blkbuf[BP_SPRINTF_LEN];
		blkptr_t bp;

		int err = dmu_read(os, object,
		    i * sizeof (bp), sizeof (bp), &bp, 0);
		if (err != 0) {
			(void) printf("got error %u from dmu_read\n", err);
			break;
		}
		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp,
		    BP_GET_FREE(&bp));
		(void) printf("\t%s\n", blkbuf);
	}
}

/* ARGSUSED */
static void
dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
{
	dmu_object_info_t doi;
	int64_t i;

	VERIFY0(dmu_object_info(os, object, &doi));
	uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);

	int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
	if (err != 0) {
		(void) printf("got error %u from dmu_read\n", err);
		kmem_free(subobjs, doi.doi_max_offset);
		return;
	}

	int64_t last_nonzero = -1;
	for (i = 0; i < doi.doi_max_offset / 8; i++) {
		if (subobjs[i] != 0)
			last_nonzero = i;
	}

	for (i = 0; i <= last_nonzero; i++) {
		(void) printf("\t%llu\n", (u_longlong_t)subobjs[i]);
	}
	kmem_free(subobjs, doi.doi_max_offset);
}

/*ARGSUSED*/
static void
dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
{
	dump_zap_stats(os, object);
	/* contents are printed elsewhere, properly decoded */
}

/*ARGSUSED*/
static void
dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
{
	zap_cursor_t zc;
	zap_attribute_t attr;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = ", attr.za_name);
		if (attr.za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}
		(void) printf(" %llx : [%d:%d:%d]\n",
		    (u_longlong_t)attr.za_first_integer,
		    (int)ATTR_LENGTH(attr.za_first_integer),
		    (int)ATTR_BSWAP(attr.za_first_integer),
		    (int)ATTR_NUM(attr.za_first_integer));
	}
	zap_cursor_fini(&zc);
}

/*ARGSUSED*/
static void
dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
{
	zap_cursor_t zc;
	zap_attribute_t attr;
	uint16_t *layout_attrs;
	unsigned i;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = [", attr.za_name);
		if (attr.za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}

		VERIFY(attr.za_integer_length == 2);
		layout_attrs = umem_zalloc(attr.za_num_integers *
		    attr.za_integer_length, UMEM_NOFAIL);

		VERIFY(zap_lookup(os, object, attr.za_name,
		    attr.za_integer_length,
		    attr.za_num_integers, layout_attrs) == 0);

		for (i = 0; i != attr.za_num_integers; i++)
			(void) printf(" %d ", (int)layout_attrs[i]);
		(void) printf("]\n");
		umem_free(layout_attrs,
		    attr.za_num_integers * attr.za_integer_length);
	}
	zap_cursor_fini(&zc);
}

/*ARGSUSED*/
static void
dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
{
	zap_cursor_t zc;
	zap_attribute_t attr;
	const char *typenames[] = {
		/* 0 */ "not specified",
		/* 1 */ "FIFO",
		/* 2 */ "Character Device",
		/* 3 */ "3 (invalid)",
		/* 4 */ "Directory",
		/* 5 */ "5 (invalid)",
		/* 6 */ "Block Device",
		/* 7 */ "7 (invalid)",
		/* 8 */ "Regular File",
		/* 9 */ "9 (invalid)",
		/* 10 */ "Symbolic Link",
		/* 11 */ "11 (invalid)",
		/* 12 */ "Socket",
		/* 13 */ "Door",
		/* 14 */ "Event Port",
		/* 15 */ "15 (invalid)",
	};

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = %lld (type: %s)\n",
		    attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
		    typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
	}
	zap_cursor_fini(&zc);
}

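/*
 * The get_*_refcount() helpers below count on-disk space maps whose phys
 * size says they carry histograms, i.e. the structures holding a
 * reference on SPA_FEATURE_SPACEMAP_HISTOGRAM; the sum is checked against
 * the feature's refcount in verify_spacemap_refcounts().
 */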
static int
get_dtl_refcount(vdev_t *vd)
{
	int refcount = 0;

	if (vd->vdev_ops->vdev_op_leaf) {
		space_map_t *sm = vd->vdev_dtl_sm;

		if (sm != NULL &&
		    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
			return (1);
		return (0);
	}

	for (unsigned c = 0; c < vd->vdev_children; c++)
		refcount += get_dtl_refcount(vd->vdev_child[c]);
	return (refcount);
}

static int
get_metaslab_refcount(vdev_t *vd)
{
	int refcount = 0;

	if (vd->vdev_top == vd) {
		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
			space_map_t *sm = vd->vdev_ms[m]->ms_sm;

			if (sm != NULL &&
			    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
				refcount++;
		}
	}
	for (unsigned c = 0; c < vd->vdev_children; c++)
		refcount += get_metaslab_refcount(vd->vdev_child[c]);

	return (refcount);
}

static int
get_obsolete_refcount(vdev_t *vd)
{
	uint64_t obsolete_sm_object;
	int refcount = 0;

	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	if (vd->vdev_top == vd && obsolete_sm_object != 0) {
		dmu_object_info_t doi;
		VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
		    obsolete_sm_object, &doi));
		if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
			refcount++;
		}
	} else {
		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
		ASSERT3U(obsolete_sm_object, ==, 0);
	}
	for (unsigned c = 0; c < vd->vdev_children; c++) {
		refcount += get_obsolete_refcount(vd->vdev_child[c]);
	}

	return (refcount);
}

static int
get_prev_obsolete_spacemap_refcount(spa_t *spa)
{
	uint64_t prev_obj =
	    spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;
	if (prev_obj != 0) {
		dmu_object_info_t doi;
		VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi));
		if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
			return (1);
		}
	}
	return (0);
}

static int
get_checkpoint_refcount(vdev_t *vd)
{
	int refcount = 0;

	if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&
	    zap_contains(spa_meta_objset(vd->vdev_spa),
	    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
		refcount++;

	for (uint64_t c = 0; c < vd->vdev_children; c++)
		refcount += get_checkpoint_refcount(vd->vdev_child[c]);

	return (refcount);
}

static int
get_log_spacemap_refcount(spa_t *spa)
{
	return (avl_numnodes(&spa->spa_sm_logs_by_txg));
}

static int
verify_spacemap_refcounts(spa_t *spa)
{
	uint64_t expected_refcount = 0;
	uint64_t actual_refcount;

	(void) feature_get_refcount(spa,
	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
	    &expected_refcount);
	actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
	actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
	actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
	actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
	actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);
	actual_refcount += get_log_spacemap_refcount(spa);

	if (expected_refcount != actual_refcount) {
		(void) printf("space map refcount mismatch: expected %lld != "
		    "actual %lld\n",
		    (longlong_t)expected_refcount,
		    (longlong_t)actual_refcount);
		return (2);
	}
	return (0);
}

static void
dump_spacemap(objset_t *os, space_map_t *sm)
{
	const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
	    "INVALID", "INVALID", "INVALID", "INVALID" };

	if (sm == NULL)
		return;

	(void) printf("space map object %llu:\n",
	    (longlong_t)sm->sm_object);
	(void) printf("  smp_length = 0x%llx\n",
	    (longlong_t)sm->sm_phys->smp_length);
	(void) printf("  smp_alloc = 0x%llx\n",
	    (longlong_t)sm->sm_phys->smp_alloc);

	if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
		return;

	/*
	 * Print out the freelist entries in both encoded and decoded form.
	 */
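	/*
	 * A single-word entry packs the type (alloc/free), offset, and
	 * run length into one 64-bit word.  A two-word entry spreads a
	 * larger run and an explicit vdev id across this word and the
	 * next (the SM2_* decoders below); debug entries instead record
	 * a txg/sync-pass marker.
	 */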
	uint8_t mapshift = sm->sm_shift;
	int64_t alloc = 0;
	uint64_t word, entry_id = 0;
	for (uint64_t offset = 0; offset < space_map_length(sm);
	    offset += sizeof (word)) {

		VERIFY0(dmu_read(os, space_map_object(sm), offset,
		    sizeof (word), &word, DMU_READ_PREFETCH));

		if (sm_entry_is_debug(word)) {
			(void) printf("\t [%6llu] %s: txg %llu pass %llu\n",
			    (u_longlong_t)entry_id,
			    ddata[SM_DEBUG_ACTION_DECODE(word)],
			    (u_longlong_t)SM_DEBUG_TXG_DECODE(word),
			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
			entry_id++;
			continue;
		}

		uint8_t words;
		char entry_type;
		uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;

		if (sm_entry_is_single_word(word)) {
			entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
			    'A' : 'F';
			entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
			    sm->sm_start;
			entry_run = SM_RUN_DECODE(word) << mapshift;
			words = 1;
		} else {
			/* it is a two-word entry so we read another word */
			ASSERT(sm_entry_is_double_word(word));

			uint64_t extra_word;
			offset += sizeof (extra_word);
			VERIFY0(dmu_read(os, space_map_object(sm), offset,
			    sizeof (extra_word), &extra_word,
			    DMU_READ_PREFETCH));

			ASSERT3U(offset, <=, space_map_length(sm));

			entry_run = SM2_RUN_DECODE(word) << mapshift;
			entry_vdev = SM2_VDEV_DECODE(word);
			entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
			    'A' : 'F';
			entry_off = (SM2_OFFSET_DECODE(extra_word) <<
			    mapshift) + sm->sm_start;
			words = 2;
		}

		(void) printf("\t [%6llu] %c range:"
		    " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n",
		    (u_longlong_t)entry_id,
		    entry_type, (u_longlong_t)entry_off,
		    (u_longlong_t)(entry_off + entry_run),
		    (u_longlong_t)entry_run,
		    (u_longlong_t)entry_vdev, words);

		if (entry_type == 'A')
			alloc += entry_run;
		else
			alloc -= entry_run;
		entry_id++;
	}
	if (alloc != space_map_allocated(sm)) {
		(void) printf("space_map_object alloc (%lld) INCONSISTENT "
		    "with space map summary (%lld)\n",
		    (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
	}
}

static void
dump_metaslab_stats(metaslab_t *msp)
{
	char maxbuf[32];
	range_tree_t *rt = msp->ms_allocatable;
	zfs_btree_t *t = &msp->ms_allocatable_by_size;
	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

	/* make sure nicenum has enough space */
	CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ);

	zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf,
	    sizeof (maxbuf));

	(void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n",
	    "segments", zfs_btree_numnodes(t), "maxsize", maxbuf,
	    "freepct", free_pct);
	(void) printf("\tIn-memory histogram:\n");
	dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}

static void
dump_metaslab(metaslab_t *msp)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	space_map_t *sm = msp->ms_sm;
	char freebuf[32];

	zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf,
	    sizeof (freebuf));

	(void) printf(
	    "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n",
	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
	    (u_longlong_t)space_map_object(sm), freebuf);

	if (dump_opt['m'] > 2 && !dump_opt['L']) {
		mutex_enter(&msp->ms_lock);
		VERIFY0(metaslab_load(msp));
		range_tree_stat_verify(msp->ms_allocatable);
		dump_metaslab_stats(msp);
		metaslab_unload(msp);
		mutex_exit(&msp->ms_lock);
	}

	if (dump_opt['m'] > 1 && sm != NULL &&
	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
		/*
		 * The space map histogram represents free space in chunks
		 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
		 */
		(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
		    (u_longlong_t)msp->ms_fragmentation);
		dump_histogram(sm->sm_phys->smp_histogram,
		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
	}

	ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
	dump_spacemap(spa->spa_meta_objset, msp->ms_sm);

	if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
		(void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n",
		    (u_longlong_t)metaslab_unflushed_txg(msp));
	}
}

static void
print_vdev_metaslab_header(vdev_t *vd)
{
	vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
	const char *bias_str = "";
	if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) {
		bias_str = VDEV_ALLOC_BIAS_LOG;
	} else if (alloc_bias == VDEV_BIAS_SPECIAL) {
		bias_str = VDEV_ALLOC_BIAS_SPECIAL;
	} else if (alloc_bias == VDEV_BIAS_DEDUP) {
		bias_str = VDEV_ALLOC_BIAS_DEDUP;
	}

	uint64_t ms_flush_data_obj = 0;
	if (vd->vdev_top_zap != 0) {
		int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
		    vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
		    sizeof (uint64_t), 1, &ms_flush_data_obj);
		if (error != ENOENT) {
			ASSERT0(error);
		}
	}

	(void) printf("\tvdev %10llu %s",
	    (u_longlong_t)vd->vdev_id, bias_str);

	if (ms_flush_data_obj != 0) {
		(void) printf(" ms_unflushed_phys object %llu",
		    (u_longlong_t)ms_flush_data_obj);
	}

	(void) printf("\n\t%-10s%5llu %-19s %-15s %-12s\n",
	    "metaslabs", (u_longlong_t)vd->vdev_ms_count,
	    "offset", "spacemap", "free");
	(void) printf("\t%15s %19s %15s %12s\n",
	    "---------------", "-------------------",
	    "---------------", "------------");
}

static void
dump_metaslab_groups(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	metaslab_class_t *mc = spa_normal_class(spa);
	uint64_t fragmentation;

	metaslab_class_histogram_verify(mc);

	for (unsigned c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (mg == NULL || mg->mg_class != mc)
			continue;

		metaslab_group_histogram_verify(mg);
		mg->mg_fragmentation = metaslab_group_fragmentation(mg);

		(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
		    "fragmentation",
		    (u_longlong_t)tvd->vdev_id,
		    (u_longlong_t)tvd->vdev_ms_count);
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			(void) printf("%3s\n", "-");
		} else {
			(void) printf("%3llu%%\n",
			    (u_longlong_t)mg->mg_fragmentation);
		}
		dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
	}

	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
	fragmentation = metaslab_class_fragmentation(mc);
	if (fragmentation == ZFS_FRAG_INVALID)
		(void) printf("\t%3s\n", "-");
	else
		(void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
	dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}

static void
print_vdev_indirect(vdev_t *vd)
{
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	vdev_indirect_births_t *vib = vd->vdev_indirect_births;

	if (vim == NULL) {
		ASSERT3P(vib, ==, NULL);
		return;
	}

	ASSERT3U(vdev_indirect_mapping_object(vim), ==,
	    vic->vic_mapping_object);
	ASSERT3U(vdev_indirect_births_object(vib), ==,
	    vic->vic_births_object);

	(void) printf("indirect births obj %llu:\n",
	    (longlong_t)vic->vic_births_object);
	(void) printf("    vib_count = %llu\n",
	    (longlong_t)vdev_indirect_births_count(vib));
	for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) {
		vdev_indirect_birth_entry_phys_t *cur_vibe =
		    &vib->vib_entries[i];
		(void) printf("\toffset %llx -> txg %llu\n",
		    (longlong_t)cur_vibe->vibe_offset,
		    (longlong_t)cur_vibe->vibe_phys_birth_txg);
	}
	(void) printf("\n");

	(void) printf("indirect mapping obj %llu:\n",
	    (longlong_t)vic->vic_mapping_object);
	(void) printf("    vim_max_offset = 0x%llx\n",
	    (longlong_t)vdev_indirect_mapping_max_offset(vim));
	(void) printf("    vim_bytes_mapped = 0x%llx\n",
	    (longlong_t)vdev_indirect_mapping_bytes_mapped(vim));
	(void) printf("    vim_count = %llu\n",
	    (longlong_t)vdev_indirect_mapping_num_entries(vim));

	if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3)
		return;

	uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);

	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
		vdev_indirect_mapping_entry_phys_t *vimep =
		    &vim->vim_entries[i];
		(void) printf("\t<%llx:%llx:%llx> -> "
		    "<%llx:%llx:%llx> (%x obsolete)\n",
		    (longlong_t)vd->vdev_id,
		    (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
		    (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst),
		    (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst),
		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
		    counts[i]);
	}
	(void) printf("\n");

	uint64_t obsolete_sm_object;
	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	if (obsolete_sm_object != 0) {
		objset_t *mos = vd->vdev_spa->spa_meta_objset;
		(void) printf("obsolete space map object %llu:\n",
		    (u_longlong_t)obsolete_sm_object);
		ASSERT(vd->vdev_obsolete_sm != NULL);
		ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==,
		    obsolete_sm_object);
		dump_spacemap(mos, vd->vdev_obsolete_sm);
		(void) printf("\n");
	}
}

static void
dump_metaslabs(spa_t *spa)
{
	vdev_t *vd, *rvd = spa->spa_root_vdev;
	uint64_t m, c = 0, children = rvd->vdev_children;

	(void) printf("\nMetaslabs:\n");

	if (!dump_opt['d'] && zopt_objects > 0) {
		c = zopt_object[0];

		if (c >= children)
			(void) fatal("bad vdev id: %llu", (u_longlong_t)c);

		if (zopt_objects > 1) {
			vd = rvd->vdev_child[c];
			print_vdev_metaslab_header(vd);

			for (m = 1; m < zopt_objects; m++) {
				if (zopt_object[m] < vd->vdev_ms_count)
					dump_metaslab(
					    vd->vdev_ms[zopt_object[m]]);
				else
					(void) fprintf(stderr, "bad metaslab "
					    "number %llu\n",
					    (u_longlong_t)zopt_object[m]);
			}
			(void) printf("\n");
			return;
		}
		children = c + 1;
	}
	for (; c < children; c++) {
		vd = rvd->vdev_child[c];
		print_vdev_metaslab_header(vd);

		print_vdev_indirect(vd);

		for (m = 0; m < vd->vdev_ms_count; m++)
			dump_metaslab(vd->vdev_ms[m]);
		(void) printf("\n");
	}
}

static void
dump_log_spacemaps(spa_t *spa)
{
	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
		return;

	(void) printf("\nLog Space Maps in Pool:\n");
	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
		space_map_t *sm = NULL;
		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));

		(void) printf("Log Spacemap object %llu txg %llu\n",
		    (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg);
		dump_spacemap(spa->spa_meta_objset, sm);
		space_map_close(sm);
	}
	(void) printf("\n");
}

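/*
 * A DDT entry carries one phys variant per copies class (ditto, single,
 * double, triple); reconstruct a block pointer from each variant that
 * has a birth txg and print it along with its reference count.
 */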
static void
dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
{
	const ddt_phys_t *ddp = dde->dde_phys;
	const ddt_key_t *ddk = &dde->dde_key;
	const char *types[4] = { "ditto", "single", "double", "triple" };
	char blkbuf[BP_SPRINTF_LEN];
	blkptr_t blk;
	int p;

	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		if (ddp->ddp_phys_birth == 0)
			continue;
		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
		(void) printf("index %llx refcnt %llu %s %s\n",
		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
		    types[p], blkbuf);
	}
}

static void
dump_dedup_ratio(const ddt_stat_t *dds)
{
	double rL, rP, rD, D, dedup, compress, copies;

	if (dds->dds_blocks == 0)
		return;

	rL = (double)dds->dds_ref_lsize;
	rP = (double)dds->dds_ref_psize;
	rD = (double)dds->dds_ref_dsize;
	D = (double)dds->dds_dsize;

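	/*
	 * dedup is the referenced-to-allocated ratio of on-disk space;
	 * compress is logical vs. physical referenced size; copies is,
	 * roughly, the expansion from ditto blocks and RAID-Z/mirror
	 * replication (allocated vs. physical referenced size).
	 */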
	dedup = rD / D;
	compress = rL / rP;
	copies = rD / rP;

	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
	    "dedup * compress / copies = %.2f\n\n",
	    dedup, compress, copies, dedup * compress / copies);
}

static void
dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
{
	char name[DDT_NAMELEN];
	ddt_entry_t dde;
	uint64_t walk = 0;
	dmu_object_info_t doi;
	uint64_t count, dspace, mspace;
	int error;

	error = ddt_object_info(ddt, type, class, &doi);

	if (error == ENOENT)
		return;
	ASSERT(error == 0);

	error = ddt_object_count(ddt, type, class, &count);
	ASSERT(error == 0);
	if (count == 0)
		return;

	dspace = doi.doi_physical_blocks_512 << 9;
	mspace = doi.doi_fill_count * doi.doi_data_block_size;

	ddt_object_name(ddt, type, class, name);

	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
	    name,
	    (u_longlong_t)count,
	    (u_longlong_t)(dspace / count),
	    (u_longlong_t)(mspace / count));

	if (dump_opt['D'] < 3)
		return;

	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);

	if (dump_opt['D'] < 4)
		return;

	if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
		return;

	(void) printf("%s contents:\n\n", name);

	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
		dump_dde(ddt, &dde, walk);

	ASSERT3U(error, ==, ENOENT);

	(void) printf("\n");
}

static void
dump_all_ddts(spa_t *spa)
{
	ddt_histogram_t ddh_total;
	ddt_stat_t dds_total;

	bzero(&ddh_total, sizeof (ddh_total));
	bzero(&dds_total, sizeof (dds_total));

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
			for (enum ddt_class class = 0; class < DDT_CLASSES;
			    class++) {
				dump_ddt(ddt, type, class);
			}
		}
	}

	ddt_get_dedup_stats(spa, &dds_total);

	if (dds_total.dds_blocks == 0) {
		(void) printf("All DDTs are empty\n");
		return;
	}

	(void) printf("\n");

	if (dump_opt['D'] > 1) {
		(void) printf("DDT histogram (aggregated over all DDTs):\n");
		ddt_get_dedup_histogram(spa, &ddh_total);
		zpool_dump_ddt(&dds_total, &ddh_total);
	}

	dump_dedup_ratio(&dds_total);
}

static void
dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
{
	char *prefix = arg;

	(void) printf("%s [%llu,%llu) length %llu\n",
	    prefix,
	    (u_longlong_t)start,
	    (u_longlong_t)(start + size),
	    (u_longlong_t)(size));
}

static void
dump_dtl(vdev_t *vd, int indent)
{
	spa_t *spa = vd->vdev_spa;
	boolean_t required;
	const char *name[DTL_TYPES] = { "missing", "partial", "scrub",
	    "outage" };
	char prefix[256];

	spa_vdev_state_enter(spa, SCL_NONE);
	required = vdev_dtl_required(vd);
	(void) spa_vdev_state_exit(spa, NULL, 0);

	if (indent == 0)
		(void) printf("\nDirty time logs:\n\n");

	(void) printf("\t%*s%s [%s]\n", indent, "",
	    vd->vdev_path ? vd->vdev_path :
	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
	    required ? "DTL-required" : "DTL-expendable");

	for (int t = 0; t < DTL_TYPES; t++) {
		range_tree_t *rt = vd->vdev_dtl[t];
		if (range_tree_space(rt) == 0)
			continue;
		(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
		    indent + 2, "", name[t]);
		range_tree_walk(rt, dump_dtl_seg, prefix);
		if (dump_opt['d'] > 5 && vd->vdev_children == 0)
			dump_spacemap(spa->spa_meta_objset,
			    vd->vdev_dtl_sm);
	}

	for (unsigned c = 0; c < vd->vdev_children; c++)
		dump_dtl(vd->vdev_child[c], indent + 4);
}

static void
dump_history(spa_t *spa)
{
	nvlist_t **events = NULL;
	char *buf;
	uint64_t resid, len, off = 0;
	uint_t num = 0;
	int error;
	time_t tsec;
	struct tm t;
	char tbuf[30];
	char internalstr[MAXPATHLEN];

	if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) {
		(void) fprintf(stderr, "%s: unable to allocate I/O buffer\n",
		    __func__);
		return;
	}

	do {
		len = SPA_OLD_MAXBLOCKSIZE;

		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
			(void) fprintf(stderr, "Unable to read history: "
			    "error %d\n", error);
			free(buf);
			return;
		}

		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
			break;

		off -= resid;
	} while (len != 0);

	(void) printf("\nHistory:\n");
	for (unsigned i = 0; i < num; i++) {
		uint64_t time, txg, ievent;
		char *cmd, *intstr;
		boolean_t printed = B_FALSE;

		if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
		    &time) != 0)
			goto next;
		if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
		    &cmd) != 0) {
			if (nvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_INT_EVENT, &ievent) != 0)
				goto next;
			verify(nvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_TXG, &txg) == 0);
			verify(nvlist_lookup_string(events[i],
			    ZPOOL_HIST_INT_STR, &intstr) == 0);
			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
				goto next;

			(void) snprintf(internalstr,
			    sizeof (internalstr),
			    "[internal %s txg:%lld] %s",
			    zfs_history_event_names[ievent],
			    (longlong_t)txg, intstr);
			cmd = internalstr;
		}
		tsec = time;
		(void) localtime_r(&tsec, &t);
		(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
		(void) printf("%s %s\n", tbuf, cmd);
		printed = B_TRUE;

next:
		if (dump_opt['h'] > 1) {
			if (!printed)
				(void) printf("unrecognized record:\n");
			dump_nvlist(events[i], 2);
		}
	}
	free(buf);
}

/*ARGSUSED*/
static void
dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
{
}

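/*
 * Translate a block id at a given indirection level into a byte offset
 * within the object: blkid is shifted left by (level * epbs), where epbs
 * (dn_indblkshift - SPA_BLKPTRSHIFT) is the log2 number of block pointers
 * per indirect block, then scaled by the data block size.
 */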
static uint64_t
blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
    const zbookmark_phys_t *zb)
{
	if (dnp == NULL) {
		ASSERT(zb->zb_level < 0);
		if (zb->zb_object == 0)
			return (zb->zb_blkid);
		return (zb->zb_blkid * BP_GET_LSIZE(bp));
	}

	ASSERT(zb->zb_level >= 0);

	return ((zb->zb_blkid <<
	    (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
}

static void
snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
    boolean_t bp_freed)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
	int i;

	if (dump_opt['b'] >= 6) {
		snprintf_blkptr(blkbuf, buflen, bp);
		if (bp_freed) {
			(void) snprintf(blkbuf + strlen(blkbuf),
			    buflen - strlen(blkbuf), " %s", "FREE");
		}
		return;
	}

	if (BP_IS_EMBEDDED(bp)) {
		(void) sprintf(blkbuf,
		    "EMBEDDED et=%u %llxL/%llxP B=%llu",
		    (int)BPE_GET_ETYPE(bp),
		    (u_longlong_t)BPE_GET_LSIZE(bp),
		    (u_longlong_t)BPE_GET_PSIZE(bp),
		    (u_longlong_t)bp->blk_birth);
		return;
	}

	blkbuf[0] = '\0';

	for (i = 0; i < ndvas; i++)
		(void) snprintf(blkbuf + strlen(blkbuf),
		    buflen - strlen(blkbuf), "%llu:%llx:%llx ",
		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));

	if (BP_IS_HOLE(bp)) {
		(void) snprintf(blkbuf + strlen(blkbuf),
		    buflen - strlen(blkbuf),
		    "%llxL B=%llu",
		    (u_longlong_t)BP_GET_LSIZE(bp),
		    (u_longlong_t)bp->blk_birth);
	} else {
		(void) snprintf(blkbuf + strlen(blkbuf),
		    buflen - strlen(blkbuf),
		    "%llxL/%llxP F=%llu B=%llu/%llu",
		    (u_longlong_t)BP_GET_LSIZE(bp),
		    (u_longlong_t)BP_GET_PSIZE(bp),
		    (u_longlong_t)BP_GET_FILL(bp),
		    (u_longlong_t)bp->blk_birth,
		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
		if (bp_freed)
			(void) snprintf(blkbuf + strlen(blkbuf),
			    buflen - strlen(blkbuf), " %s", "FREE");
	}
}

static void
print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb,
    const dnode_phys_t *dnp)
{
	char blkbuf[BP_SPRINTF_LEN];
	int l;

	if (!BP_IS_EMBEDDED(bp)) {
		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
	}

	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));

	ASSERT(zb->zb_level >= 0);

	for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
		if (l == zb->zb_level) {
			(void) printf("L%llx", (u_longlong_t)zb->zb_level);
		} else {
			(void) printf(" ");
		}
	}

	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE);
	(void) printf("%s\n", blkbuf);
}

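/*
 * Recursively walk and print the tree of block pointers below bp, reading
 * each indirect block through the ARC.  As a consistency check, the fill
 * counts of the children must sum to the parent's fill count.
 */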
1601 static int
1602 visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
1603 blkptr_t *bp, const zbookmark_phys_t *zb)
1604 {
1605 int err = 0;
1606
1607 if (bp->blk_birth == 0)
1608 return (0);
1609
1610 print_indirect(bp, zb, dnp);
1611
1612 if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
1613 arc_flags_t flags = ARC_FLAG_WAIT;
1614 int i;
1615 blkptr_t *cbp;
1616 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
1617 arc_buf_t *buf;
1618 uint64_t fill = 0;
1619 ASSERT(!BP_IS_REDACTED(bp));
1620
1621 err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
1622 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
1623 if (err)
1624 return (err);
1625 ASSERT(buf->b_data);
1626
1627 /* recursively visit blocks below this */
1628 cbp = buf->b_data;
1629 for (i = 0; i < epb; i++, cbp++) {
1630 zbookmark_phys_t czb;
1631
1632 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
1633 zb->zb_level - 1,
1634 zb->zb_blkid * epb + i);
1635 err = visit_indirect(spa, dnp, cbp, &czb);
1636 if (err)
1637 break;
1638 fill += BP_GET_FILL(cbp);
1639 }
1640 if (!err)
1641 ASSERT3U(fill, ==, BP_GET_FILL(bp));
1642 arc_buf_destroy(buf, &buf);
1643 }
1644
1645 return (err);
1646 }
1647
1648 /*ARGSUSED*/
1649 static void
1650 dump_indirect(dnode_t *dn)
1651 {
1652 dnode_phys_t *dnp = dn->dn_phys;
1653 int j;
1654 zbookmark_phys_t czb;
1655
1656 (void) printf("Indirect blocks:\n");
1657
1658 SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
1659 dn->dn_object, dnp->dn_nlevels - 1, 0);
1660 for (j = 0; j < dnp->dn_nblkptr; j++) {
1661 czb.zb_blkid = j;
1662 (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
1663 &dnp->dn_blkptr[j], &czb);
1664 }
1665
1666 (void) printf("\n");
1667 }
1668
1669 /*ARGSUSED*/
1670 static void
1671 dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
1672 {
1673 dsl_dir_phys_t *dd = data;
1674 time_t crtime;
1675 char nice[32];
1676
1677 /* make sure nicenum has enough space */
1678 CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ);
1679
1680 if (dd == NULL)
1681 return;
1682
1683 ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
1684
1685 crtime = dd->dd_creation_time;
1686 (void) printf("\t\tcreation_time = %s", ctime(&crtime));
1687 (void) printf("\t\thead_dataset_obj = %llu\n",
1688 (u_longlong_t)dd->dd_head_dataset_obj);
1689 (void) printf("\t\tparent_dir_obj = %llu\n",
1690 (u_longlong_t)dd->dd_parent_obj);
1691 (void) printf("\t\torigin_obj = %llu\n",
1692 (u_longlong_t)dd->dd_origin_obj);
1693 (void) printf("\t\tchild_dir_zapobj = %llu\n",
1694 (u_longlong_t)dd->dd_child_dir_zapobj);
1695 zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice));
1696 (void) printf("\t\tused_bytes = %s\n", nice);
1697 zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice));
1698 (void) printf("\t\tcompressed_bytes = %s\n", nice);
1699 zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice));
1700 (void) printf("\t\tuncompressed_bytes = %s\n", nice);
1701 zdb_nicenum(dd->dd_quota, nice, sizeof (nice));
1702 (void) printf("\t\tquota = %s\n", nice);
1703 zdb_nicenum(dd->dd_reserved, nice, sizeof (nice));
1704 (void) printf("\t\treserved = %s\n", nice);
1705 (void) printf("\t\tprops_zapobj = %llu\n",
1706 (u_longlong_t)dd->dd_props_zapobj);
1707 (void) printf("\t\tdeleg_zapobj = %llu\n",
1708 (u_longlong_t)dd->dd_deleg_zapobj);
1709 (void) printf("\t\tflags = %llx\n",
1710 (u_longlong_t)dd->dd_flags);
1711
1712 #define DO(which) \
1713 zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \
1714 sizeof (nice)); \
1715 (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
1716 DO(HEAD);
1717 DO(SNAP);
1718 DO(CHILD);
1719 DO(CHILD_RSRV);
1720 DO(REFRSRV);
1721 #undef DO
1722 (void) printf("\t\tclones = %llu\n",
1723 (u_longlong_t)dd->dd_clones);
1724 }
1725
1726 /*ARGSUSED*/
1727 static void
1728 dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
1729 {
1730 dsl_dataset_phys_t *ds = data;
1731 time_t crtime;
1732 char used[32], compressed[32], uncompressed[32], unique[32];
1733 char blkbuf[BP_SPRINTF_LEN];
1734
1735 /* make sure nicenum has enough space */
1736 CTASSERT(sizeof (used) >= NN_NUMBUF_SZ);
1737 CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ);
1738 CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ);
1739 CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ);
1740
1741 if (ds == NULL)
1742 return;
1743
1744 ASSERT(size == sizeof (*ds));
1745 crtime = ds->ds_creation_time;
1746 zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used));
1747 zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed));
1748 zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed,
1749 sizeof (uncompressed));
1750 zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique));
1751 snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);
1752
1753 (void) printf("\t\tdir_obj = %llu\n",
1754 (u_longlong_t)ds->ds_dir_obj);
1755 (void) printf("\t\tprev_snap_obj = %llu\n",
1756 (u_longlong_t)ds->ds_prev_snap_obj);
1757 (void) printf("\t\tprev_snap_txg = %llu\n",
1758 (u_longlong_t)ds->ds_prev_snap_txg);
1759 (void) printf("\t\tnext_snap_obj = %llu\n",
1760 (u_longlong_t)ds->ds_next_snap_obj);
1761 (void) printf("\t\tsnapnames_zapobj = %llu\n",
1762 (u_longlong_t)ds->ds_snapnames_zapobj);
1763 (void) printf("\t\tnum_children = %llu\n",
1764 (u_longlong_t)ds->ds_num_children);
1765 (void) printf("\t\tuserrefs_obj = %llu\n",
1766 (u_longlong_t)ds->ds_userrefs_obj);
1767 (void) printf("\t\tcreation_time = %s", ctime(&crtime));
1768 (void) printf("\t\tcreation_txg = %llu\n",
1769 (u_longlong_t)ds->ds_creation_txg);
1770 (void) printf("\t\tdeadlist_obj = %llu\n",
1771 (u_longlong_t)ds->ds_deadlist_obj);
1772 (void) printf("\t\tused_bytes = %s\n", used);
1773 (void) printf("\t\tcompressed_bytes = %s\n", compressed);
1774 (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
1775 (void) printf("\t\tunique = %s\n", unique);
1776 (void) printf("\t\tfsid_guid = %llu\n",
1777 (u_longlong_t)ds->ds_fsid_guid);
1778 (void) printf("\t\tguid = %llu\n",
1779 (u_longlong_t)ds->ds_guid);
1780 (void) printf("\t\tflags = %llx\n",
1781 (u_longlong_t)ds->ds_flags);
1782 (void) printf("\t\tnext_clones_obj = %llu\n",
1783 (u_longlong_t)ds->ds_next_clones_obj);
1784 (void) printf("\t\tprops_obj = %llu\n",
1785 (u_longlong_t)ds->ds_props_obj);
1786 (void) printf("\t\tbp = %s\n", blkbuf);
1787 }
1788
1789 /* ARGSUSED */
1790 static int
1791 dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1792 {
1793 char blkbuf[BP_SPRINTF_LEN];
1794
1795 if (bp->blk_birth != 0) {
1796 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
1797 (void) printf("\t%s\n", blkbuf);
1798 }
1799 return (0);
1800 }
1801
1802 static void
1803 dump_bptree(objset_t *os, uint64_t obj, const char *name)
1804 {
1805 char bytes[32];
1806 bptree_phys_t *bt;
1807 dmu_buf_t *db;
1808
1809 /* make sure nicenum has enough space */
1810 CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
1811
1812 if (dump_opt['d'] < 3)
1813 return;
1814
1815 VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
1816 bt = db->db_data;
1817 zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes));
1818 (void) printf("\n %s: %llu datasets, %s\n",
1819 name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
1820 dmu_buf_rele(db, FTAG);
1821
1822 if (dump_opt['d'] < 5)
1823 return;
1824
1825 (void) printf("\n");
1826
1827 (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
1828 }
1829
1830 /* ARGSUSED */
1831 static int
1832 dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
1833 {
1834 char blkbuf[BP_SPRINTF_LEN];
1835
1836 ASSERT(bp->blk_birth != 0);
1837 snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed);
1838 (void) printf("\t%s\n", blkbuf);
1839 return (0);
1840 }
1841
1842 static void
1843 dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
1844 {
1845 char bytes[32];
1846 char comp[32];
1847 char uncomp[32];
1848 uint64_t i;
1849
1850 /* make sure nicenum has enough space */
1851 CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
1852 CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
1853 CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
1854
1855 if (dump_opt['d'] < 3)
1856 return;
1857
1858 zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes));
1859 if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
1860 zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
1861 zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
1862 if (bpo->bpo_havefreed) {
1863 (void) printf(" %*s: object %llu, %llu local "
1864 "blkptrs, %llu freed, %llu subobjs in object %llu, "
1865 "%s (%s/%s comp)\n",
1866 indent * 8, name,
1867 (u_longlong_t)bpo->bpo_object,
1868 (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
1869 (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
1870 (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
1871 (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
1872 bytes, comp, uncomp);
1873 } else {
1874 (void) printf(" %*s: object %llu, %llu local "
1875 "blkptrs, %llu subobjs in object %llu, "
1876 "%s (%s/%s comp)\n",
1877 indent * 8, name,
1878 (u_longlong_t)bpo->bpo_object,
1879 (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
1880 (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
1881 (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
1882 bytes, comp, uncomp);
1883 }
1884
1885 for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
1886 uint64_t subobj;
1887 bpobj_t subbpo;
1888 int error;
1889 VERIFY0(dmu_read(bpo->bpo_os,
1890 bpo->bpo_phys->bpo_subobjs,
1891 i * sizeof (subobj), sizeof (subobj), &subobj, 0));
1892 error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
1893 if (error != 0) {
1894 (void) printf("ERROR %u while trying to open "
1895 "subobj id %llu\n",
1896 error, (u_longlong_t)subobj);
1897 continue;
1898 }
1899 dump_full_bpobj(&subbpo, "subobj", indent + 1);
1900 bpobj_close(&subbpo);
1901 }
1902 } else {
1903 if (bpo->bpo_havefreed) {
1904 (void) printf(" %*s: object %llu, %llu blkptrs, "
1905 "%llu freed, %s\n",
1906 indent * 8, name,
1907 (u_longlong_t)bpo->bpo_object,
1908 (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
1909 (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
1910 bytes);
1911 } else {
1912 (void) printf(" %*s: object %llu, %llu blkptrs, "
1913 "%s\n",
1914 indent * 8, name,
1915 (u_longlong_t)bpo->bpo_object,
1916 (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
1917 bytes);
1918 }
1919 }
1920
1921 if (dump_opt['d'] < 5)
1922 return;
1923
1925 if (indent == 0) {
1926 (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
1927 (void) printf("\n");
1928 }
1929 }
1930
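/*
 * Look up a bookmark by name and print its phys fields.  When
 * print_redact is set and the bookmark has a redaction object, also
 * print the redaction progress and snapshot list; when print_list is
 * additionally set, dump the full redaction list.  Returns 0 on
 * success or an errno on failure.
 */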
1931 static int
1932 dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact,
1933 boolean_t print_list)
1934 {
1935 int err = 0;
1936 zfs_bookmark_phys_t prop;
1937 objset_t *mos = dp->dp_spa->spa_meta_objset;
1938 err = dsl_bookmark_lookup(dp, name, NULL, &prop);
1939
1940 if (err != 0) {
1941 return (err);
1942 }
1943
1944 (void) printf("\t#%s: ", strchr(name, '#') + 1);
1945 (void) printf("{guid: %llx creation_txg: %llu creation_time: "
1946 "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid,
1947 (u_longlong_t)prop.zbm_creation_txg,
1948 (u_longlong_t)prop.zbm_creation_time,
1949 (u_longlong_t)prop.zbm_redaction_obj);
1950
1951 IMPLY(print_list, print_redact);
1952 if (!print_redact || prop.zbm_redaction_obj == 0)
1953 return (0);
1954
1955 redaction_list_t *rl;
1956 VERIFY0(dsl_redaction_list_hold_obj(dp,
1957 prop.zbm_redaction_obj, FTAG, &rl));
1958
1959 redaction_list_phys_t *rlp = rl->rl_phys;
1960 (void) printf("\tRedacted:\n\t\tProgress: ");
1961 if (rlp->rlp_last_object != UINT64_MAX ||
1962 rlp->rlp_last_blkid != UINT64_MAX) {
1963 (void) printf("%llu %llu (incomplete)\n",
1964 (u_longlong_t)rlp->rlp_last_object,
1965 (u_longlong_t)rlp->rlp_last_blkid);
1966 } else {
1967 (void) printf("complete\n");
1968 }
1969 (void) printf("\t\tSnapshots: [");
1970 for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) {
1971 if (i > 0)
1972 (void) printf(", ");
1973 (void) printf("%0llu",
1974 (u_longlong_t)rlp->rlp_snaps[i]);
1975 }
1976 (void) printf("]\n\t\tLength: %llu\n",
1977 (u_longlong_t)rlp->rlp_num_entries);
1978
1979 if (!print_list) {
1980 dsl_redaction_list_rele(rl, FTAG);
1981 return (0);
1982 }
1983
1984 if (rlp->rlp_num_entries == 0) {
1985 dsl_redaction_list_rele(rl, FTAG);
1986 (void) printf("\t\tRedaction List: []\n\n");
1987 return (0);
1988 }
1989
1990 redact_block_phys_t *rbp_buf;
1991 uint64_t size;
1992 dmu_object_info_t doi;
1993
1994 VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi));
1995 size = doi.doi_max_offset;
1996 rbp_buf = kmem_alloc(size, KM_SLEEP);
1997
1998 err = dmu_read(mos, prop.zbm_redaction_obj, 0, size,
1999 rbp_buf, 0);
2000 if (err != 0) {
2001 dsl_redaction_list_rele(rl, FTAG);
2002 kmem_free(rbp_buf, size);
2003 return (err);
2004 }
2005
2006 (void) printf("\t\tRedaction List: [{object: %llx, offset: "
2007 "%llx, blksz: %x, count: %llx}",
2008 (u_longlong_t)rbp_buf[0].rbp_object,
2009 (u_longlong_t)rbp_buf[0].rbp_blkid,
2010 (uint_t)(redact_block_get_size(&rbp_buf[0])),
2011 (u_longlong_t)redact_block_get_count(&rbp_buf[0]));
2012
2013 for (size_t i = 1; i < rlp->rlp_num_entries; i++) {
2014 (void) printf(",\n\t\t{object: %llx, offset: %llx, "
2015 "blksz: %x, count: %llx}",
2016 (u_longlong_t)rbp_buf[i].rbp_object,
2017 (u_longlong_t)rbp_buf[i].rbp_blkid,
2018 (uint_t)(redact_block_get_size(&rbp_buf[i])),
2019 (u_longlong_t)redact_block_get_count(&rbp_buf[i]));
2020 }
2021 dsl_redaction_list_rele(rl, FTAG);
2022 kmem_free(rbp_buf, size);
2023 (void) printf("]\n\n");
2024 return (0);
2025 }
2026
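/*
 * Walk the dataset's bookmark ZAP under the pool config lock and print
 * each bookmark via dump_bookmark().  Verbosity 4 prints the
 * bookmarks, 5 adds redaction details, and 6 adds full redaction
 * lists.
 */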
2027 static void
2028 dump_bookmarks(objset_t *os, int verbosity)
2029 {
2030 zap_cursor_t zc;
2031 zap_attribute_t attr;
2032 dsl_dataset_t *ds = dmu_objset_ds(os);
2033 dsl_pool_t *dp = spa_get_dsl(os->os_spa);
2034 objset_t *mos = os->os_spa->spa_meta_objset;
2035 if (verbosity < 4)
2036 return;
2037 dsl_pool_config_enter(dp, FTAG);
2038
2039 for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj);
2040 zap_cursor_retrieve(&zc, &attr) == 0;
2041 zap_cursor_advance(&zc)) {
2042 char osname[ZFS_MAX_DATASET_NAME_LEN];
2043 char buf[ZFS_MAX_DATASET_NAME_LEN];
2044 dmu_objset_name(os, osname);
2045 VERIFY3S(0, <=, snprintf(buf, sizeof (buf), "%s#%s", osname,
2046 attr.za_name));
2047 (void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6);
2048 }
2049 zap_cursor_fini(&zc);
2050 dsl_pool_config_exit(dp, FTAG);
2051 }
2052
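/*
 * Record a bpobj and all of its sub-bpobjs in the MOS object reference
 * accounting, which is later used to detect leaked MOS objects.
 */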
2053 static void
2054 bpobj_count_refd(bpobj_t *bpo)
2055 {
2056 mos_obj_refd(bpo->bpo_object);
2057
2058 if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
2059 mos_obj_refd(bpo->bpo_phys->bpo_subobjs);
2060 for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
2061 uint64_t subobj;
2062 bpobj_t subbpo;
2063 int error;
2064 VERIFY0(dmu_read(bpo->bpo_os,
2065 bpo->bpo_phys->bpo_subobjs,
2066 i * sizeof (subobj), sizeof (subobj), &subobj, 0));
2067 error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
2068 if (error != 0) {
2069 (void) printf("ERROR %u while trying to open "
2070 "subobj id %llu\n",
2071 error, (u_longlong_t)subobj);
2072 continue;
2073 }
2074 bpobj_count_refd(&subbpo);
2075 bpobj_close(&subbpo);
2076 }
2077 }
2078 }
2079
2080 static int
2081 dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle)
2082 {
2083 spa_t *spa = arg;
2084 uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
2085 if (dle->dle_bpobj.bpo_object != empty_bpobj)
2086 bpobj_count_refd(&dle->dle_bpobj);
2087 return (0);
2088 }
2089
2090 static int
2091 dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle)
2092 {
2093 ASSERT(arg == NULL);
2094 if (dump_opt['d'] >= 5) {
2095 char buf[128];
2096 (void) snprintf(buf, sizeof (buf),
2097 "mintxg %llu -> obj %llu",
2098 (longlong_t)dle->dle_mintxg,
2099 (longlong_t)dle->dle_bpobj.bpo_object);
2100
2101 dump_full_bpobj(&dle->dle_bpobj, buf, 0);
2102 } else {
2103 (void) printf("mintxg %llu -> obj %llu\n",
2104 (longlong_t)dle->dle_mintxg,
2105 (longlong_t)dle->dle_bpobj.bpo_object);
2106 }
2107 return (0);
2108 }
2109
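/*
 * Print a deadlist or livelist summary (space usage and entry count)
 * and, at -dddd and above, each of its entries.  The objects making up
 * the list are also recorded as referenced in the MOS.
 */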
2110 static void
2111 dump_blkptr_list(dsl_deadlist_t *dl, const char *name)
2112 {
2113 char bytes[32];
2114 char comp[32];
2115 char uncomp[32];
2116 char entries[32];
2117 spa_t *spa = dmu_objset_spa(dl->dl_os);
2118 uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
2119
2120 if (dl->dl_oldfmt) {
2121 if (dl->dl_bpobj.bpo_object != empty_bpobj)
2122 bpobj_count_refd(&dl->dl_bpobj);
2123 } else {
2124 mos_obj_refd(dl->dl_object);
2125 dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa);
2126 }
2127
2128 /* make sure nicenum has enough space */
2129 CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
2130 CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
2131 CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
2132 CTASSERT(sizeof (entries) >= NN_NUMBUF_SZ);
2133
2134 if (dump_opt['d'] < 3)
2135 return;
2136
2137 if (dl->dl_oldfmt) {
2138 dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
2139 return;
2140 }
2141
2142 zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
2143 zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
2144 zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
2145 zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries));
2146 (void) printf("\n %s: %s (%s/%s comp), %s entries\n",
2147 name, bytes, comp, uncomp, entries);
2148
2149 if (dump_opt['d'] < 4)
2150 return;
2151
2152 (void) printf("\n");
2153
2154 dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL);
2155 }
2156
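/*
 * Cross-check the livelist's space accounting against the space
 * written since the origin snapshot, as computed by
 * dsl_dataset_space_written().  Returns nonzero on a discrepancy.
 */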
2157 static int
2158 verify_dd_livelist(objset_t *os)
2159 {
2160 uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp;
2161 dsl_pool_t *dp = spa_get_dsl(os->os_spa);
2162 dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
2163
2164 ASSERT(!dmu_objset_is_snapshot(os));
2165 if (!dsl_deadlist_is_open(&dd->dd_livelist))
2166 return (0);
2167 dsl_pool_config_enter(dp, FTAG);
2168 dsl_deadlist_space(&dd->dd_livelist, &ll_used,
2169 &ll_comp, &ll_uncomp);
2170
2171 dsl_dataset_t *origin_ds;
2172 ASSERT(dsl_pool_config_held(dp));
2173 VERIFY0(dsl_dataset_hold_obj(dp,
2174 dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds));
2175 VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset,
2176 &used, &comp, &uncomp));
2177 dsl_dataset_rele(origin_ds, FTAG);
2178 dsl_pool_config_exit(dp, FTAG);
2179 /*
2180 * It's possible that the dataset's uncomp space is larger than the
2181 	 * livelist's because livelists do not track embedded block pointers.
2182 */
2183 if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) {
2184 char nice_used[32], nice_comp[32], nice_uncomp[32];
2185 (void) printf("Discrepancy in space accounting:\n");
2186 zdb_nicenum(used, nice_used, sizeof (nice_used));
2187 zdb_nicenum(comp, nice_comp, sizeof (nice_comp));
2188 zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp));
2189 (void) printf("dir: used %s, comp %s, uncomp %s\n",
2190 nice_used, nice_comp, nice_uncomp);
2191 zdb_nicenum(ll_used, nice_used, sizeof (nice_used));
2192 zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp));
2193 zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp));
2194 (void) printf("livelist: used %s, comp %s, uncomp %s\n",
2195 nice_used, nice_comp, nice_uncomp);
2196 return (1);
2197 }
2198 return (0);
2199 }
2200
2201 static avl_tree_t idx_tree;
2202 static avl_tree_t domain_tree;
2203 static boolean_t fuid_table_loaded;
2204 static objset_t *sa_os = NULL;
2205 static sa_attr_type_t *sa_attr_table = NULL;
2206
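/*
 * Hold an objset by name without owning it (so that redacted datasets
 * can be inspected) and, for unencrypted ZPL datasets, set up the SA
 * attribute table used by the znode dumpers.
 */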
2207 static int
2208 open_objset(const char *path, void *tag, objset_t **osp)
2209 {
2210 int err;
2211 uint64_t sa_attrs = 0;
2212 uint64_t version = 0;
2213
2214 VERIFY3P(sa_os, ==, NULL);
2215 /*
2216 * We can't own an objset if it's redacted. Therefore, we do this
2217 * dance: hold the objset, then acquire a long hold on its dataset, then
2218 * release the pool (which is held as part of holding the objset).
2219 */
2220 err = dmu_objset_hold(path, tag, osp);
2221 if (err != 0) {
2222 (void) fprintf(stderr, "failed to hold dataset '%s': %s\n",
2223 path, strerror(err));
2224 return (err);
2225 }
2226 dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
2227 dsl_pool_rele(dmu_objset_pool(*osp), tag);
2228
2229 if (dmu_objset_type(*osp) == DMU_OST_ZFS && !(*osp)->os_encrypted) {
2230 (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2231 8, 1, &version);
2232 if (version >= ZPL_VERSION_SA) {
2233 (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
2234 8, 1, &sa_attrs);
2235 }
2236 err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
2237 &sa_attr_table);
2238 if (err != 0) {
2239 (void) fprintf(stderr, "sa_setup failed: %s\n",
2240 strerror(err));
2241 dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
2242 dsl_dataset_rele(dmu_objset_ds(*osp), tag);
2243 *osp = NULL;
2244 }
2245 }
2246 sa_os = *osp;
2247
2248 	return (err);
2249 }
2250
2251 static void
2252 close_objset(objset_t *os, void *tag)
2253 {
2254 VERIFY3P(os, ==, sa_os);
2255 if (os->os_sa != NULL)
2256 sa_tear_down(os);
2257 dsl_dataset_long_rele(dmu_objset_ds(os), tag);
2258 dsl_dataset_rele(dmu_objset_ds(os), tag);
2259 sa_attr_table = NULL;
2260 sa_os = NULL;
2261 }
2262
2263 static void
2264 fuid_table_destroy(void)
2265 {
2266 if (fuid_table_loaded) {
2267 zfs_fuid_table_destroy(&idx_tree, &domain_tree);
2268 fuid_table_loaded = B_FALSE;
2269 }
2270 }
2271
2272 /*
2273  * Print uid or gid information.
2274  * For a normal POSIX id, just the id is printed in decimal format.
2275  * For CIFS files with a FUID, the FUID is printed in hex followed by
2276 * the domain-rid string.
2277 */
2278 static void
2279 print_idstr(uint64_t id, const char *id_type)
2280 {
2281 if (FUID_INDEX(id)) {
2282 char *domain;
2283
2284 domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
2285 (void) printf("\t%s %llx [%s-%d]\n", id_type,
2286 (u_longlong_t)id, domain, (int)FUID_RID(id));
2287 } else {
2288 (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id);
2289 }
2291 }
2292
2293 static void
2294 dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
2295 {
2296 uint32_t uid_idx, gid_idx;
2297
2298 uid_idx = FUID_INDEX(uid);
2299 gid_idx = FUID_INDEX(gid);
2300
2301 /* Load domain table, if not already loaded */
2302 if (!fuid_table_loaded && (uid_idx || gid_idx)) {
2303 uint64_t fuid_obj;
2304
2305 /* first find the fuid object. It lives in the master node */
2306 VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
2307 8, 1, &fuid_obj) == 0);
2308 zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
2309 (void) zfs_fuid_table_load(os, fuid_obj,
2310 &idx_tree, &domain_tree);
2311 fuid_table_loaded = B_TRUE;
2312 }
2313
2314 print_idstr(uid, "uid");
2315 print_idstr(gid, "gid");
2316 }
2317
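/*
 * Unpack the nvlist stored in the znode's DXATTR SA attribute and
 * print each xattr name and value, escaping non-printable bytes as
 * octal.
 */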
2318 static void
2319 dump_znode_sa_xattr(sa_handle_t *hdl)
2320 {
2321 nvlist_t *sa_xattr;
2322 nvpair_t *elem = NULL;
2323 int sa_xattr_size = 0;
2324 int sa_xattr_entries = 0;
2325 int error;
2326 char *sa_xattr_packed;
2327
2328 error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size);
2329 if (error || sa_xattr_size == 0)
2330 return;
2331
2332 sa_xattr_packed = malloc(sa_xattr_size);
2333 if (sa_xattr_packed == NULL)
2334 return;
2335
2336 error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR],
2337 sa_xattr_packed, sa_xattr_size);
2338 if (error) {
2339 free(sa_xattr_packed);
2340 return;
2341 }
2342
2343 error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0);
2344 if (error) {
2345 free(sa_xattr_packed);
2346 return;
2347 }
2348
2349 while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL)
2350 sa_xattr_entries++;
2351
2352 (void) printf("\tSA xattrs: %d bytes, %d entries\n\n",
2353 sa_xattr_size, sa_xattr_entries);
2354 while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) {
2355 uchar_t *value;
2356 uint_t cnt, idx;
2357
2358 (void) printf("\t\t%s = ", nvpair_name(elem));
2359 nvpair_value_byte_array(elem, &value, &cnt);
2360 for (idx = 0; idx < cnt; ++idx) {
2361 if (isprint(value[idx]))
2362 (void) putchar(value[idx]);
2363 else
2364 (void) printf("\\%3.3o", value[idx]);
2365 }
2366 (void) putchar('\n');
2367 }
2368
2369 nvlist_free(sa_xattr);
2370 free(sa_xattr_packed);
2371 }
2372
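/*
 * Dump a ZPL znode: its path (when resolvable), ownership, timestamps,
 * mode, size, link count, flags, and any project id, xattr directory,
 * rdev, and SA xattrs present.
 */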
2373 /*ARGSUSED*/
2374 static void
2375 dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
2376 {
2377 char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */
2378 sa_handle_t *hdl;
2379 uint64_t xattr, rdev, gen;
2380 uint64_t uid, gid, mode, fsize, parent, links;
2381 uint64_t pflags;
2382 uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
2383 time_t z_crtime, z_atime, z_mtime, z_ctime;
2384 sa_bulk_attr_t bulk[12];
2385 int idx = 0;
2386 int error;
2387
2388 VERIFY3P(os, ==, sa_os);
2389 if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
2390 (void) printf("Failed to get handle for SA znode\n");
2391 return;
2392 }
2393
2394 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
2395 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
2396 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
2397 &links, 8);
2398 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
2399 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
2400 &mode, 8);
2401 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
2402 NULL, &parent, 8);
2403 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
2404 &fsize, 8);
2405 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
2406 acctm, 16);
2407 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
2408 modtm, 16);
2409 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
2410 crtm, 16);
2411 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
2412 chgtm, 16);
2413 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
2414 &pflags, 8);
2415
2416 if (sa_bulk_lookup(hdl, bulk, idx)) {
2417 (void) sa_handle_destroy(hdl);
2418 return;
2419 }
2420
2421 z_crtime = (time_t)crtm[0];
2422 z_atime = (time_t)acctm[0];
2423 z_mtime = (time_t)modtm[0];
2424 z_ctime = (time_t)chgtm[0];
2425
2426 if (dump_opt['d'] > 4) {
2427 error = zfs_obj_to_path(os, object, path, sizeof (path));
2428 if (error == ESTALE) {
2429 (void) snprintf(path, sizeof (path), "on delete queue");
2430 } else if (error != 0) {
2431 leaked_objects++;
2432 (void) snprintf(path, sizeof (path),
2433 "path not found, possibly leaked");
2434 }
2435 (void) printf("\tpath %s\n", path);
2436 }
2437 dump_uidgid(os, uid, gid);
2438 (void) printf("\tatime %s", ctime(&z_atime));
2439 (void) printf("\tmtime %s", ctime(&z_mtime));
2440 (void) printf("\tctime %s", ctime(&z_ctime));
2441 (void) printf("\tcrtime %s", ctime(&z_crtime));
2442 (void) printf("\tgen %llu\n", (u_longlong_t)gen);
2443 (void) printf("\tmode %llo\n", (u_longlong_t)mode);
2444 (void) printf("\tsize %llu\n", (u_longlong_t)fsize);
2445 (void) printf("\tparent %llu\n", (u_longlong_t)parent);
2446 (void) printf("\tlinks %llu\n", (u_longlong_t)links);
2447 (void) printf("\tpflags %llx\n", (u_longlong_t)pflags);
2448 if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) {
2449 uint64_t projid;
2450
2451 if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid,
2452 sizeof (uint64_t)) == 0)
2453 (void) printf("\tprojid %llu\n", (u_longlong_t)projid);
2454 }
2455 if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
2456 sizeof (uint64_t)) == 0)
2457 (void) printf("\txattr %llu\n", (u_longlong_t)xattr);
2458 if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
2459 sizeof (uint64_t)) == 0)
2460 (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev);
2461 dump_znode_sa_xattr(hdl);
2462 sa_handle_destroy(hdl);
2463 }
2464
2465 /*ARGSUSED*/
2466 static void
2467 dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
2468 {
2469 }
2470
2471 /*ARGSUSED*/
2472 static void
2473 dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
2474 {
2475 }
2476
2477 static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
2478 dump_none, /* unallocated */
2479 dump_zap, /* object directory */
2480 dump_uint64, /* object array */
2481 dump_none, /* packed nvlist */
2482 dump_packed_nvlist, /* packed nvlist size */
2483 dump_none, /* bpobj */
2484 dump_bpobj, /* bpobj header */
2485 dump_none, /* SPA space map header */
2486 dump_none, /* SPA space map */
2487 dump_none, /* ZIL intent log */
2488 dump_dnode, /* DMU dnode */
2489 dump_dmu_objset, /* DMU objset */
2490 dump_dsl_dir, /* DSL directory */
2491 dump_zap, /* DSL directory child map */
2492 dump_zap, /* DSL dataset snap map */
2493 dump_zap, /* DSL props */
2494 dump_dsl_dataset, /* DSL dataset */
2495 dump_znode, /* ZFS znode */
2496 dump_acl, /* ZFS V0 ACL */
2497 dump_uint8, /* ZFS plain file */
2498 dump_zpldir, /* ZFS directory */
2499 dump_zap, /* ZFS master node */
2500 dump_zap, /* ZFS delete queue */
2501 dump_uint8, /* zvol object */
2502 dump_zap, /* zvol prop */
2503 dump_uint8, /* other uint8[] */
2504 dump_uint64, /* other uint64[] */
2505 dump_zap, /* other ZAP */
2506 dump_zap, /* persistent error log */
2507 dump_uint8, /* SPA history */
2508 dump_history_offsets, /* SPA history offsets */
2509 dump_zap, /* Pool properties */
2510 dump_zap, /* DSL permissions */
2511 dump_acl, /* ZFS ACL */
2512 dump_uint8, /* ZFS SYSACL */
2513 dump_none, /* FUID nvlist */
2514 dump_packed_nvlist, /* FUID nvlist size */
2515 dump_zap, /* DSL dataset next clones */
2516 dump_zap, /* DSL scrub queue */
2517 dump_zap, /* ZFS user/group/project used */
2518 dump_zap, /* ZFS user/group/project quota */
2519 dump_zap, /* snapshot refcount tags */
2520 dump_ddt_zap, /* DDT ZAP object */
2521 dump_zap, /* DDT statistics */
2522 dump_znode, /* SA object */
2523 dump_zap, /* SA Master Node */
2524 dump_sa_attrs, /* SA attribute registration */
2525 dump_sa_layouts, /* SA attribute layouts */
2526 dump_zap, /* DSL scrub translations */
2527 dump_none, /* fake dedup BP */
2528 dump_zap, /* deadlist */
2529 dump_none, /* deadlist hdr */
2530 dump_zap, /* dsl clones */
2531 dump_bpobj_subobjs, /* bpobj subobjs */
2532 dump_unknown, /* Unknown type, must be last */
2533 };
2534
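/*
 * Print a one-line summary of the object (sizes, fill, type).  At
 * higher verbosity also print the dnode flags, the bonus buffer via
 * the object_viewer table, the indirect blocks, and the list of
 * allocated segments; encrypted bonus buffers and objects are noted
 * as encrypted instead of being dumped.
 */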
2535 static void
2536 dump_object(objset_t *os, uint64_t object, int verbosity,
2537 boolean_t *print_header, uint64_t *dnode_slots_used)
2538 {
2539 dmu_buf_t *db = NULL;
2540 dmu_object_info_t doi;
2541 dnode_t *dn;
2542 boolean_t dnode_held = B_FALSE;
2543 void *bonus = NULL;
2544 size_t bsize = 0;
2545 char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32];
2546 char bonus_size[32];
2547 char aux[50];
2548 int error;
2549
2550 /* make sure nicenum has enough space */
2551 CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ);
2552 CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ);
2553 CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
2554 CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
2555 	CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (dnsize) >= NN_NUMBUF_SZ);
2556
2557 if (*print_header) {
2558 (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n",
2559 "Object", "lvl", "iblk", "dblk", "dsize", "dnsize",
2560 "lsize", "%full", "type");
2561 		*print_header = B_FALSE;
2562 }
2563
2564 if (object == 0) {
2565 dn = DMU_META_DNODE(os);
2566 dmu_object_info_from_dnode(dn, &doi);
2567 } else {
2568 /*
2569 * Encrypted datasets will have sensitive bonus buffers
2570 * encrypted. Therefore we cannot hold the bonus buffer and
2571 * must hold the dnode itself instead.
2572 */
2573 error = dmu_object_info(os, object, &doi);
2574 if (error)
2575 fatal("dmu_object_info() failed, errno %u", error);
2576
2577 if (os->os_encrypted &&
2578 DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) {
2579 error = dnode_hold(os, object, FTAG, &dn);
2580 if (error)
2581 fatal("dnode_hold() failed, errno %u", error);
2582 dnode_held = B_TRUE;
2583 } else {
2584 error = dmu_bonus_hold(os, object, FTAG, &db);
2585 if (error)
2586 fatal("dmu_bonus_hold(%llu) failed, errno %u",
2587 object, error);
2588 bonus = db->db_data;
2589 bsize = db->db_size;
2590 dn = DB_DNODE((dmu_buf_impl_t *)db);
2591 }
2592 }
2593
2594 if (dnode_slots_used)
2595 *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;
2596
2597 zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
2598 zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
2599 zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
2600 zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));
2601 zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));
2602 zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize));
2603 	(void) snprintf(fill, sizeof (fill), "%6.2f", 100.0 * doi.doi_fill_count *
2604 	    doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
2605 	    doi.doi_max_offset);
2606
2607 aux[0] = '\0';
2608
2609 if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
2610 (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
2611 " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum));
2612 }
2613
2614 if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
2615 (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
2616 " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress));
2617 }
2618
2619 (void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n",
2620 (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
2621 asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux);
2622
2623 if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
2624 (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n",
2625 "", "", "", "", "", "", bonus_size, "bonus",
2626 zdb_ot_name(doi.doi_bonus_type));
2627 }
2628
2629 if (verbosity >= 4) {
2630 (void) printf("\tdnode flags: %s%s%s%s\n",
2631 (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
2632 "USED_BYTES " : "",
2633 (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
2634 "USERUSED_ACCOUNTED " : "",
2635 (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ?
2636 "USEROBJUSED_ACCOUNTED " : "",
2637 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
2638 "SPILL_BLKPTR" : "");
2639 (void) printf("\tdnode maxblkid: %llu\n",
2640 (longlong_t)dn->dn_phys->dn_maxblkid);
2641
2642 if (!dnode_held) {
2643 object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os,
2644 object, bonus, bsize);
2645 } else {
2646 (void) printf("\t\t(bonus encrypted)\n");
2647 }
2648
2649 if (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type)) {
2650 object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object,
2651 NULL, 0);
2652 } else {
2653 (void) printf("\t\t(object encrypted)\n");
2654 }
2655
2656 *print_header = B_TRUE;
2657 }
2658
2659 if (verbosity >= 5)
2660 dump_indirect(dn);
2661
2662 if (verbosity >= 5) {
2663 /*
2664 * Report the list of segments that comprise the object.
2665 */
2666 uint64_t start = 0;
2667 uint64_t end;
2668 uint64_t blkfill = 1;
2669 int minlvl = 1;
2670
2671 if (dn->dn_type == DMU_OT_DNODE) {
2672 minlvl = 0;
2673 blkfill = DNODES_PER_BLOCK;
2674 }
2675
2676 for (;;) {
2677 char segsize[32];
2678 /* make sure nicenum has enough space */
2679 CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ);
2680 error = dnode_next_offset(dn,
2681 0, &start, minlvl, blkfill, 0);
2682 if (error)
2683 break;
2684 end = start;
2685 error = dnode_next_offset(dn,
2686 DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
2687 zdb_nicenum(end - start, segsize, sizeof (segsize));
2688 (void) printf("\t\tsegment [%016llx, %016llx)"
2689 " size %5s\n", (u_longlong_t)start,
2690 (u_longlong_t)end, segsize);
2691 if (error)
2692 break;
2693 start = end;
2694 }
2695 }
2696
2697 if (db != NULL)
2698 dmu_buf_rele(db, FTAG);
2699 if (dnode_held)
2700 dnode_rele(dn, FTAG);
2701 }
2702
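/*
 * Record the MOS objects referenced by a dsl_dir in the MOS object
 * reference accounting.
 */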
2703 static void
2704 count_dir_mos_objects(dsl_dir_t *dd)
2705 {
2706 mos_obj_refd(dd->dd_object);
2707 mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj);
2708 mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj);
2709 mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj);
2710 mos_obj_refd(dsl_dir_phys(dd)->dd_clones);
2711
2712 /*
2713 * The dd_crypto_obj can be referenced by multiple dsl_dir's.
2714 * Ignore the references after the first one.
2715 */
2716 mos_obj_refd_multiple(dd->dd_crypto_obj);
2717 }
2718
2719 static void
2720 count_ds_mos_objects(dsl_dataset_t *ds)
2721 {
2722 mos_obj_refd(ds->ds_object);
2723 mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj);
2724 mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj);
2725 mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj);
2726 mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj);
2727 mos_obj_refd(ds->ds_bookmarks_obj);
2728
2729 if (!dsl_dataset_is_snapshot(ds)) {
2730 count_dir_mos_objects(ds->ds_dir);
2731 }
2732 }
2733
2734 static const char *objset_types[DMU_OST_NUMTYPES] = {
2735 "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
2736
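/*
 * Top-level objset dump: print the dataset header line, then the
 * intent log, deadlists and livelists, bookmarks, and every object in
 * the objset.  The object count is cross-checked against the fill
 * count of the objset's root block pointer.
 */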
2737 static void
2738 dump_objset(objset_t *os)
2739 {
2740 dmu_objset_stats_t dds;
2741 uint64_t object, object_count;
2742 uint64_t refdbytes, usedobjs, scratch;
2743 char numbuf[32];
2744 char blkbuf[BP_SPRINTF_LEN + 20];
2745 char osname[ZFS_MAX_DATASET_NAME_LEN];
2746 const char *type = "UNKNOWN";
2747 int verbosity = dump_opt['d'];
2748 boolean_t print_header;
2749 unsigned i;
2750 int error;
2751 uint64_t total_slots_used = 0;
2752 uint64_t max_slot_used = 0;
2753 uint64_t dnode_slots;
2754
2755 /* make sure nicenum has enough space */
2756 CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ);
2757
2758 dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
2759 dmu_objset_fast_stat(os, &dds);
2760 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
2761
2762 print_header = B_TRUE;
2763
2764 if (dds.dds_type < DMU_OST_NUMTYPES)
2765 type = objset_types[dds.dds_type];
2766
2767 if (dds.dds_type == DMU_OST_META) {
2768 dds.dds_creation_txg = TXG_INITIAL;
2769 usedobjs = BP_GET_FILL(os->os_rootbp);
2770 refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
2771 dd_used_bytes;
2772 } else {
2773 dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
2774 }
2775
2776 ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
2777
2778 zdb_nicenum(refdbytes, numbuf, sizeof (numbuf));
2779
2780 if (verbosity >= 4) {
2781 (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
2782 (void) snprintf_blkptr(blkbuf + strlen(blkbuf),
2783 sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
2784 } else {
2785 blkbuf[0] = '\0';
2786 }
2787
2788 dmu_objset_name(os, osname);
2789
2790 (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
2791 "%s, %llu objects%s%s\n",
2792 osname, type, (u_longlong_t)dmu_objset_id(os),
2793 (u_longlong_t)dds.dds_creation_txg,
2794 numbuf, (u_longlong_t)usedobjs, blkbuf,
2795 (dds.dds_inconsistent) ? " (inconsistent)" : "");
2796
2797 if (zopt_objects != 0) {
2798 for (i = 0; i < zopt_objects; i++) {
2799 dump_object(os, zopt_object[i], verbosity,
2800 &print_header, NULL);
2801 }
2802 (void) printf("\n");
2803 return;
2804 }
2805
2806 if (dump_opt['i'] != 0 || verbosity >= 2)
2807 dump_intent_log(dmu_objset_zil(os));
2808
2809 if (dmu_objset_ds(os) != NULL) {
2810 dsl_dataset_t *ds = dmu_objset_ds(os);
2811 dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
2812 if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
2813 !dmu_objset_is_snapshot(os)) {
2814 dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist");
2815 if (verify_dd_livelist(os) != 0)
2816 fatal("livelist is incorrect");
2817 }
2818
2819 if (dsl_dataset_remap_deadlist_exists(ds)) {
2820 (void) printf("ds_remap_deadlist:\n");
2821 dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist");
2822 }
2823 count_ds_mos_objects(ds);
2824 }
2825
2826 if (dmu_objset_ds(os) != NULL)
2827 dump_bookmarks(os, verbosity);
2828
2829 if (verbosity < 2)
2830 return;
2831
2832 if (BP_IS_HOLE(os->os_rootbp))
2833 return;
2834
2835 dump_object(os, 0, verbosity, &print_header, NULL);
2836 object_count = 0;
2837 if (DMU_USERUSED_DNODE(os) != NULL &&
2838 DMU_USERUSED_DNODE(os)->dn_type != 0) {
2839 dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,
2840 NULL);
2841 dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,
2842 NULL);
2843 }
2844
2845 if (DMU_PROJECTUSED_DNODE(os) != NULL &&
2846 DMU_PROJECTUSED_DNODE(os)->dn_type != 0)
2847 dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity,
2848 &print_header, NULL);
2849
2850 object = 0;
2851 while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
2852 dump_object(os, object, verbosity, &print_header, &dnode_slots);
2853 object_count++;
2854 total_slots_used += dnode_slots;
2855 max_slot_used = object + dnode_slots - 1;
2856 }
2857
2858 (void) printf("\n");
2859
2860 (void) printf(" Dnode slots:\n");
2861 (void) printf("\tTotal used: %10llu\n",
2862 (u_longlong_t)total_slots_used);
2863 (void) printf("\tMax used: %10llu\n",
2864 (u_longlong_t)max_slot_used);
2865 (void) printf("\tPercent empty: %10lf\n",
2866 (double)(max_slot_used - total_slots_used)*100 /
2867 (double)max_slot_used);
2868 (void) printf("\n");
2869
2870 if (error != ESRCH) {
2871 (void) fprintf(stderr, "dmu_object_next() = %d\n", error);
2872 abort();
2873 }
2874
2875 ASSERT3U(object_count, ==, usedobjs);
2876
2877 if (leaked_objects != 0) {
2878 (void) printf("%d potentially leaked objects detected\n",
2879 leaked_objects);
2880 leaked_objects = 0;
2881 }
2882 }
2883
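/*
 * Print the fields of an uberblock, including the MMP values when the
 * MMP magic is valid; with -uuuu the root block pointer is printed as
 * well.
 */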
2884 static void
2885 dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
2886 {
2887 time_t timestamp = ub->ub_timestamp;
2888
2889 (void) printf("%s", header ? header : "");
2890 (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
2891 (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
2892 (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
2893 (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
2894 (void) printf("\ttimestamp = %llu UTC = %s",
2895 (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
2896
2897 (void) printf("\tmmp_magic = %016llx\n",
2898 (u_longlong_t)ub->ub_mmp_magic);
2899 if (MMP_VALID(ub)) {
2900 (void) printf("\tmmp_delay = %0llu\n",
2901 (u_longlong_t)ub->ub_mmp_delay);
2902 if (MMP_SEQ_VALID(ub))
2903 (void) printf("\tmmp_seq = %u\n",
2904 (unsigned int) MMP_SEQ(ub));
2905 if (MMP_FAIL_INT_VALID(ub))
2906 (void) printf("\tmmp_fail = %u\n",
2907 (unsigned int) MMP_FAIL_INT(ub));
2908 if (MMP_INTERVAL_VALID(ub))
2909 (void) printf("\tmmp_write = %u\n",
2910 (unsigned int) MMP_INTERVAL(ub));
2911 /* After MMP_* to make summarize_uberblock_mmp cleaner */
2912 (void) printf("\tmmp_valid = %x\n",
2913 (unsigned int) ub->ub_mmp_config & 0xFF);
2914 }
2915
2916 if (dump_opt['u'] >= 4) {
2917 char blkbuf[BP_SPRINTF_LEN];
2918 snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
2919 (void) printf("\trootbp = %s\n", blkbuf);
2920 }
2921 (void) printf("\tcheckpoint_txg = %llu\n",
2922 (u_longlong_t)ub->ub_checkpoint_txg);
2923 (void) printf("%s", footer ? footer : "");
2924 }
2925
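/*
 * Dump the packed nvlist configuration stored in the MOS config
 * object.
 */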
2926 static void
2927 dump_config(spa_t *spa)
2928 {
2929 dmu_buf_t *db;
2930 size_t nvsize = 0;
2931 int error = 0;
2932
2934 error = dmu_bonus_hold(spa->spa_meta_objset,
2935 spa->spa_config_object, FTAG, &db);
2936
2937 if (error == 0) {
2938 nvsize = *(uint64_t *)db->db_data;
2939 dmu_buf_rele(db, FTAG);
2940
2941 (void) printf("\nMOS Configuration:\n");
2942 dump_packed_nvlist(spa->spa_meta_objset,
2943 spa->spa_config_object, (void *)&nvsize, 1);
2944 } else {
2945 		(void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d\n",
2946 (u_longlong_t)spa->spa_config_object, error);
2947 }
2948 }
2949
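/*
 * Read a pool cache file (a packed nvlist, as in zpool.cache), unpack
 * it, and dump its contents.
 */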
2950 static void
2951 dump_cachefile(const char *cachefile)
2952 {
2953 int fd;
2954 struct stat64 statbuf;
2955 char *buf;
2956 nvlist_t *config;
2957
2958 if ((fd = open64(cachefile, O_RDONLY)) < 0) {
2959 (void) printf("cannot open '%s': %s\n", cachefile,
2960 strerror(errno));
2961 exit(1);
2962 }
2963
2964 if (fstat64(fd, &statbuf) != 0) {
2965 (void) printf("failed to stat '%s': %s\n", cachefile,
2966 strerror(errno));
2967 exit(1);
2968 }
2969
2970 if ((buf = malloc(statbuf.st_size)) == NULL) {
2971 (void) fprintf(stderr, "failed to allocate %llu bytes\n",
2972 (u_longlong_t)statbuf.st_size);
2973 exit(1);
2974 }
2975
2976 if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
2977 (void) fprintf(stderr, "failed to read %llu bytes\n",
2978 (u_longlong_t)statbuf.st_size);
2979 exit(1);
2980 }
2981
2982 (void) close(fd);
2983
2984 if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
2985 (void) fprintf(stderr, "failed to unpack nvlist\n");
2986 exit(1);
2987 }
2988
2989 free(buf);
2990
2991 dump_nvlist(config, 0);
2992
2993 nvlist_free(config);
2994 }
2995
2996 /*
2997 * ZFS label nvlist stats
2998 */
2999 typedef struct zdb_nvl_stats {
3000 int zns_list_count;
3001 int zns_leaf_count;
3002 size_t zns_leaf_largest;
3003 size_t zns_leaf_total;
3004 nvlist_t *zns_string;
3005 nvlist_t *zns_uint64;
3006 nvlist_t *zns_boolean;
3007 } zdb_nvl_stats_t;
3008
3009 static void
3010 collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats)
3011 {
3012 nvlist_t *list, **array;
3013 nvpair_t *nvp = NULL;
3014 char *name;
3015 uint_t i, items;
3016
3017 stats->zns_list_count++;
3018
3019 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3020 name = nvpair_name(nvp);
3021
3022 switch (nvpair_type(nvp)) {
3023 case DATA_TYPE_STRING:
3024 fnvlist_add_string(stats->zns_string, name,
3025 fnvpair_value_string(nvp));
3026 break;
3027 case DATA_TYPE_UINT64:
3028 fnvlist_add_uint64(stats->zns_uint64, name,
3029 fnvpair_value_uint64(nvp));
3030 break;
3031 case DATA_TYPE_BOOLEAN:
3032 fnvlist_add_boolean(stats->zns_boolean, name);
3033 break;
3034 case DATA_TYPE_NVLIST:
3035 if (nvpair_value_nvlist(nvp, &list) == 0)
3036 collect_nvlist_stats(list, stats);
3037 break;
3038 case DATA_TYPE_NVLIST_ARRAY:
3039 if (nvpair_value_nvlist_array(nvp, &array, &items) != 0)
3040 break;
3041
3042 for (i = 0; i < items; i++) {
3043 collect_nvlist_stats(array[i], stats);
3044
3045 /* collect stats on leaf vdev */
3046 if (strcmp(name, "children") == 0) {
3047 size_t size;
3048
3049 (void) nvlist_size(array[i], &size,
3050 NV_ENCODE_XDR);
3051 stats->zns_leaf_total += size;
3052 if (size > stats->zns_leaf_largest)
3053 stats->zns_leaf_largest = size;
3054 stats->zns_leaf_count++;
3055 }
3056 }
3057 break;
3058 default:
3059 (void) printf("skip type %d!\n", (int)nvpair_type(nvp));
3060 }
3061 }
3062 }
3063
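/*
 * Print a breakdown of the label nvlist's size by pair type (integers,
 * strings, booleans, and nvlist overhead) and, at -lll, estimate how
 * many additional leaf vdevs would still fit in the remaining label
 * space.
 */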
3064 static void
3065 dump_nvlist_stats(nvlist_t *nvl, size_t cap)
3066 {
3067 zdb_nvl_stats_t stats = { 0 };
3068 size_t size, sum = 0, total;
3069 size_t noise;
3070
3071 	/* requires nvlists that allow non-unique names for stat collection */
3072 VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0));
3073 VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0));
3074 VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0));
3075 VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR));
3076
3077 (void) printf("\n\nZFS Label NVList Config Stats:\n");
3078
3079 VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR));
3080 (void) printf(" %d bytes used, %d bytes free (using %4.1f%%)\n\n",
3081 (int)total, (int)(cap - total), 100.0 * total / cap);
3082
3083 collect_nvlist_stats(nvl, &stats);
3084
3085 VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR));
3086 size -= noise;
3087 sum += size;
3088 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:",
3089 (int)fnvlist_num_pairs(stats.zns_uint64),
3090 (int)size, 100.0 * size / total);
3091
3092 VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR));
3093 size -= noise;
3094 sum += size;
3095 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "strings:",
3096 (int)fnvlist_num_pairs(stats.zns_string),
3097 (int)size, 100.0 * size / total);
3098
3099 VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR));
3100 size -= noise;
3101 sum += size;
3102 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:",
3103 (int)fnvlist_num_pairs(stats.zns_boolean),
3104 (int)size, 100.0 * size / total);
3105
3106 size = total - sum; /* treat remainder as nvlist overhead */
3107 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:",
3108 stats.zns_list_count, (int)size, 100.0 * size / total);
3109
3110 if (stats.zns_leaf_count > 0) {
3111 size_t average = stats.zns_leaf_total / stats.zns_leaf_count;
3112
3113 (void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:",
3114 stats.zns_leaf_count, (int)average);
3115 (void) printf("%24d bytes largest\n",
3116 (int)stats.zns_leaf_largest);
3117
3118 if (dump_opt['l'] >= 3 && average > 0)
3119 (void) printf(" space for %d additional leaf vdevs\n",
3120 (int)((cap - total) / average));
3121 }
3122 (void) printf("\n");
3123
3124 nvlist_free(stats.zns_string);
3125 nvlist_free(stats.zns_uint64);
3126 nvlist_free(stats.zns_boolean);
3127 }
3128
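/*
 * Checksum records track which of the labels held identical nvlist
 * configs and uberblocks, so that duplicate contents are printed only
 * once, for the first label that contains them.
 */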
3129 typedef struct cksum_record {
3130 zio_cksum_t cksum;
3131 boolean_t labels[VDEV_LABELS];
3132 avl_node_t link;
3133 } cksum_record_t;
3134
3135 static int
3136 cksum_record_compare(const void *x1, const void *x2)
3137 {
3138 const cksum_record_t *l = (cksum_record_t *)x1;
3139 const cksum_record_t *r = (cksum_record_t *)x2;
3140 int arraysize = ARRAY_SIZE(l->cksum.zc_word);
3141 int difference;
3142
3143 for (int i = 0; i < arraysize; i++) {
3144 difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]);
3145 if (difference)
3146 break;
3147 }
3148
3149 return (difference);
3150 }
3151
3152 static cksum_record_t *
3153 cksum_record_alloc(zio_cksum_t *cksum, int l)
3154 {
3155 cksum_record_t *rec;
3156
3157 rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL);
3158 rec->cksum = *cksum;
3159 rec->labels[l] = B_TRUE;
3160
3161 return (rec);
3162 }
3163
3164 static cksum_record_t *
3165 cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum)
3166 {
3167 cksum_record_t lookup = { .cksum = *cksum };
3168 avl_index_t where;
3169
3170 return (avl_find(tree, &lookup, &where));
3171 }
3172
3173 static cksum_record_t *
3174 cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l)
3175 {
3176 cksum_record_t *rec;
3177
3178 rec = cksum_record_lookup(tree, cksum);
3179 if (rec) {
3180 rec->labels[l] = B_TRUE;
3181 } else {
3182 rec = cksum_record_alloc(cksum, l);
3183 avl_add(tree, rec);
3184 }
3185
3186 return (rec);
3187 }
3188
3189 static int
3190 first_label(cksum_record_t *rec)
3191 {
3192 for (int i = 0; i < VDEV_LABELS; i++)
3193 if (rec->labels[i])
3194 return (i);
3195
3196 return (-1);
3197 }
3198
3199 static void
3200 print_label_numbers(const char *prefix, cksum_record_t *rec)
3201 {
3202 printf("%s", prefix);
3203 for (int i = 0; i < VDEV_LABELS; i++)
3204 if (rec->labels[i] == B_TRUE)
3205 printf("%d ", i);
3206 printf("\n");
3207 }
3208
3209 #define MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT)
3210
3211 typedef struct zdb_label {
3212 vdev_label_t label;
3213 nvlist_t *config_nv;
3214 cksum_record_t *config;
3215 cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT];
3216 boolean_t header_printed;
3217 boolean_t read_failed;
3218 } zdb_label_t;
3219
3220 static void
3221 print_label_header(zdb_label_t *label, int l)
3222 {
3224 if (dump_opt['q'])
3225 return;
3226
3227 if (label->header_printed == B_TRUE)
3228 return;
3229
3230 (void) printf("------------------------------------\n");
3231 (void) printf("LABEL %d\n", l);
3232 (void) printf("------------------------------------\n");
3233
3234 label->header_printed = B_TRUE;
3235 }
3236
3237 static void
3238 dump_config_from_label(zdb_label_t *label, size_t buflen, int l)
3239 {
3240 if (dump_opt['q'])
3241 return;
3242
3243 if ((dump_opt['l'] < 3) && (first_label(label->config) != l))
3244 return;
3245
3246 print_label_header(label, l);
3247 dump_nvlist(label->config_nv, 4);
3248 print_label_numbers(" labels = ", label->config);
3249
3250 if (dump_opt['l'] >= 2)
3251 dump_nvlist_stats(label->config_nv, buflen);
3252 }
3253
3254 #define ZDB_MAX_UB_HEADER_SIZE 32
3255
3256 static void
3257 dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num)
3258 {
3260 vdev_t vd;
3261 char header[ZDB_MAX_UB_HEADER_SIZE];
3262
3263 vd.vdev_ashift = ashift;
3264 vd.vdev_top = &vd;
3265
3266 for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {
3267 uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);
3268 uberblock_t *ub = (void *)((char *)&label->label + uoff);
3269 cksum_record_t *rec = label->uberblocks[i];
3270
3271 if (rec == NULL) {
3272 if (dump_opt['u'] >= 2) {
3273 print_label_header(label, label_num);
3274 (void) printf(" Uberblock[%d] invalid\n", i);
3275 }
3276 continue;
3277 }
3278
3279 if ((dump_opt['u'] < 3) && (first_label(rec) != label_num))
3280 continue;
3281
3282 if ((dump_opt['u'] < 4) &&
3283 (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay &&
3284 (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL))
3285 continue;
3286
3287 print_label_header(label, label_num);
3288 (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
3289 " Uberblock[%d]\n", i);
3290 dump_uberblock(ub, header, "");
3291 print_label_numbers(" labels = ", rec);
3292 }
3293 }
3294
3295 static char curpath[PATH_MAX];
3296
3297 /*
3298  * Iterate through the path components, recursively passing the
3299  * current component's object and the remaining path until we find
3300  * the object for the last component.
3301 */
3302 static int
3303 dump_path_impl(objset_t *os, uint64_t obj, char *name)
3304 {
3305 int err;
3306 boolean_t header = B_TRUE;
3307 uint64_t child_obj;
3308 char *s;
3309 dmu_buf_t *db;
3310 dmu_object_info_t doi;
3311
3312 if ((s = strchr(name, '/')) != NULL)
3313 *s = '\0';
3314 err = zap_lookup(os, obj, name, 8, 1, &child_obj);
3315
3316 (void) strlcat(curpath, name, sizeof (curpath));
3317
3318 if (err != 0) {
3319 (void) fprintf(stderr, "failed to lookup %s: %s\n",
3320 curpath, strerror(err));
3321 return (err);
3322 }
3323
3324 child_obj = ZFS_DIRENT_OBJ(child_obj);
3325 err = sa_buf_hold(os, child_obj, FTAG, &db);
3326 if (err != 0) {
3327 (void) fprintf(stderr,
3328 "failed to get SA dbuf for obj %llu: %s\n",
3329 (u_longlong_t)child_obj, strerror(err));
3330 return (EINVAL);
3331 }
3332 dmu_object_info_from_db(db, &doi);
3333 sa_buf_rele(db, FTAG);
3334
3335 if (doi.doi_bonus_type != DMU_OT_SA &&
3336 doi.doi_bonus_type != DMU_OT_ZNODE) {
3337 (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",
3338 doi.doi_bonus_type, (u_longlong_t)child_obj);
3339 return (EINVAL);
3340 }
3341
3342 if (dump_opt['v'] > 6) {
3343 (void) printf("obj=%llu %s type=%d bonustype=%d\n",
3344 (u_longlong_t)child_obj, curpath, doi.doi_type,
3345 doi.doi_bonus_type);
3346 }
3347
3348 (void) strlcat(curpath, "/", sizeof (curpath));
3349
3350 switch (doi.doi_type) {
3351 case DMU_OT_DIRECTORY_CONTENTS:
3352 if (s != NULL && *(s + 1) != '\0')
3353 return (dump_path_impl(os, child_obj, s + 1));
3354 /*FALLTHROUGH*/
3355 case DMU_OT_PLAIN_FILE_CONTENTS:
3356 dump_object(os, child_obj, dump_opt['v'], &header, NULL);
3357 return (0);
3358 default:
3359 		(void) fprintf(stderr, "object %llu has non-file/directory "
3360 		    "type %d\n", (u_longlong_t)child_obj, doi.doi_type);
3361 break;
3362 }
3363
3364 return (EINVAL);
3365 }
3366
3367 /*
3368 * Dump the blocks for the object specified by path inside the dataset.
3369 */
3370 static int
3371 dump_path(char *ds, char *path)
3372 {
3373 int err;
3374 objset_t *os;
3375 uint64_t root_obj;
3376
3377 err = open_objset(ds, FTAG, &os);
3378 if (err != 0)
3379 return (err);
3380
3381 err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj);
3382 if (err != 0) {
3383 (void) fprintf(stderr, "can't lookup root znode: %s\n",
3384 strerror(err));
3385 close_objset(os, FTAG);
3386 return (EINVAL);
3387 }
3388
3389 (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds);
3390
3391 err = dump_path_impl(os, root_obj, path);
3392
3393 close_objset(os, FTAG);
3394 return (err);
3395 }
3396
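/*
 * Read all VDEV_LABELS labels from a device, deduplicate their configs
 * and uberblocks through checksummed AVL trees, and dump them.
 * Returns 0 on success, 1 if any label could not be read or unpacked,
 * and 2 if no config was found at all.
 */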
3397 static int
3398 dump_label(const char *dev)
3399 {
3400 char path[MAXPATHLEN];
3401 zdb_label_t labels[VDEV_LABELS];
3402 uint64_t psize, ashift;
3403 struct stat64 statbuf;
3404 boolean_t config_found = B_FALSE;
3405 boolean_t error = B_FALSE;
3406 avl_tree_t config_tree;
3407 avl_tree_t uberblock_tree;
3408 void *node, *cookie;
3409 int fd;
3410
3411 bzero(labels, sizeof (labels));
3412
3413 /*
3414 	 * If we were given an absolute path, use it as is.
3415 	 * Otherwise, if the provided vdev name doesn't point to a file,
3416 * try prepending expected disk paths and partition numbers.
3417 */
3418 (void) strlcpy(path, dev, sizeof (path));
3419 if (dev[0] != '/' && stat64(path, &statbuf) != 0) {
3420 int error;
3421
3422 error = zfs_resolve_shortname(dev, path, MAXPATHLEN);
3423 if (error == 0 && zfs_dev_is_whole_disk(path)) {
3424 if (zfs_append_partition(path, MAXPATHLEN) == -1)
3425 error = ENOENT;
3426 }
3427
3428 if (error || (stat64(path, &statbuf) != 0)) {
3429 (void) printf("failed to find device %s, try "
3430 "specifying absolute path instead\n", dev);
3431 return (1);
3432 }
3433 }
3434
3435 if ((fd = open64(path, O_RDONLY)) < 0) {
3436 (void) printf("cannot open '%s': %s\n", path, strerror(errno));
3437 exit(1);
3438 }
3439
3440 if (fstat64_blk(fd, &statbuf) != 0) {
3441 (void) printf("failed to stat '%s': %s\n", path,
3442 strerror(errno));
3443 (void) close(fd);
3444 exit(1);
3445 }
3446
3447 if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0)
3448 (void) printf("failed to invalidate cache '%s' : %s\n", path,
3449 strerror(errno));
3450
3451 avl_create(&config_tree, cksum_record_compare,
3452 sizeof (cksum_record_t), offsetof(cksum_record_t, link));
3453 avl_create(&uberblock_tree, cksum_record_compare,
3454 sizeof (cksum_record_t), offsetof(cksum_record_t, link));
3455
3456 psize = statbuf.st_size;
3457 psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
3458 ashift = SPA_MINBLOCKSHIFT;
3459
3460 /*
3461 	 * 1. Read the label from disk.
3462 	 * 2. Unpack the configuration and insert it in the config tree.
3463 	 * 3. Traverse all uberblocks and insert them in the uberblock tree.
3464 */
3465 for (int l = 0; l < VDEV_LABELS; l++) {
3466 zdb_label_t *label = &labels[l];
3467 char *buf = label->label.vl_vdev_phys.vp_nvlist;
3468 size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);
3469 nvlist_t *config;
3470 cksum_record_t *rec;
3471 zio_cksum_t cksum;
3472 vdev_t vd;
3473
3474 if (pread64(fd, &label->label, sizeof (label->label),
3475 vdev_label_offset(psize, l, 0)) != sizeof (label->label)) {
3476 if (!dump_opt['q'])
3477 (void) printf("failed to read label %d\n", l);
3478 label->read_failed = B_TRUE;
3479 error = B_TRUE;
3480 continue;
3481 }
3482
3483 label->read_failed = B_FALSE;
3484
3485 if (nvlist_unpack(buf, buflen, &config, 0) == 0) {
3486 nvlist_t *vdev_tree = NULL;
3487 size_t size;
3488
3489 if ((nvlist_lookup_nvlist(config,
3490 ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
3491 (nvlist_lookup_uint64(vdev_tree,
3492 ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
3493 ashift = SPA_MINBLOCKSHIFT;
3494
3495 if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0)
3496 size = buflen;
3497
3498 fletcher_4_native_varsize(buf, size, &cksum);
3499 rec = cksum_record_insert(&config_tree, &cksum, l);
3500
3501 label->config = rec;
3502 label->config_nv = config;
3503 config_found = B_TRUE;
3504 } else {
3505 error = B_TRUE;
3506 }
3507
3508 vd.vdev_ashift = ashift;
3509 vd.vdev_top = &vd;
3510
3511 for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {
3512 uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);
3513 uberblock_t *ub = (void *)((char *)label + uoff);
3514
3515 if (uberblock_verify(ub))
3516 continue;
3517
3518 fletcher_4_native_varsize(ub, sizeof (*ub), &cksum);
3519 rec = cksum_record_insert(&uberblock_tree, &cksum, l);
3520
3521 label->uberblocks[i] = rec;
3522 }
3523 }
3524
3525 /*
3526 * Dump the label and uberblocks.
3527 */
3528 for (int l = 0; l < VDEV_LABELS; l++) {
3529 zdb_label_t *label = &labels[l];
3530 size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);
3531
3532 if (label->read_failed == B_TRUE)
3533 continue;
3534
3535 if (label->config_nv) {
3536 dump_config_from_label(label, buflen, l);
3537 } else {
3538 if (!dump_opt['q'])
3539 (void) printf("failed to unpack label %d\n", l);
3540 }
3541
3542 if (dump_opt['u'])
3543 dump_label_uberblocks(label, ashift, l);
3544
3545 nvlist_free(label->config_nv);
3546 }
3547
3548 cookie = NULL;
3549 while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL)
3550 umem_free(node, sizeof (cksum_record_t));
3551
3552 cookie = NULL;
3553 while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL)
3554 umem_free(node, sizeof (cksum_record_t));
3555
3556 avl_destroy(&config_tree);
3557 avl_destroy(&uberblock_tree);
3558
3559 (void) close(fd);
3560
3561 return (config_found == B_FALSE ? 2 :
3562 (error == B_TRUE ? 1 : 0));
3563 }
3564
3565 static uint64_t dataset_feature_count[SPA_FEATURES];
3566 static uint64_t global_feature_count[SPA_FEATURES];
3567 static uint64_t remap_deadlist_count = 0;
3568
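/*
 * Per-dataset callback: tally feature usage (including redaction
 * bookmarks, written bookmarks, and livelists) and remap deadlists,
 * then dump the objset itself.
 */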
3569 /*ARGSUSED*/
3570 static int
3571 dump_one_objset(const char *dsname, void *arg)
3572 {
3573 int error;
3574 objset_t *os;
3575 spa_feature_t f;
3576
3577 error = open_objset(dsname, FTAG, &os);
3578 if (error != 0)
3579 return (0);
3580
3581 for (f = 0; f < SPA_FEATURES; f++) {
3582 if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f))
3583 continue;
3584 ASSERT(spa_feature_table[f].fi_flags &
3585 ZFEATURE_FLAG_PER_DATASET);
3586 dataset_feature_count[f]++;
3587 }
3588
3589 if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) {
3590 remap_deadlist_count++;
3591 }
3592
3593 for (dsl_bookmark_node_t *dbn =
3594 avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL;
3595 dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) {
3596 mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj);
3597 if (dbn->dbn_phys.zbm_redaction_obj != 0)
3598 global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS]++;
3599 if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)
3600 global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++;
3601 }
3602
3603 if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) &&
3604 !dmu_objset_is_snapshot(os)) {
3605 global_feature_count[SPA_FEATURE_LIVELIST]++;
3606 }
3607
3608 dump_objset(os);
3609 close_objset(os, FTAG);
3610 fuid_table_destroy();
3611 return (0);
3612 }
3613
3614 /*
3615 * Block statistics.
3616 */
3617 #define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
3618 typedef struct zdb_blkstats {
3619 uint64_t zb_asize;
3620 uint64_t zb_lsize;
3621 uint64_t zb_psize;
3622 uint64_t zb_count;
3623 uint64_t zb_gangs;
3624 uint64_t zb_ditto_samevdev;
3625 uint64_t zb_ditto_same_ms;
3626 uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
3627 } zdb_blkstats_t;
3628
3629 /*
3630 * Extended object types to report deferred frees and dedup auto-ditto blocks.
3631 */
3632 #define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0)
3633 #define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1)
3634 #define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2)
3635 #define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3)
3636
3637 static const char *zdb_ot_extname[] = {
3638 "deferred free",
3639 "dedup ditto",
3640 "other",
3641 "Total",
3642 };
3643
3644 #define ZB_TOTAL DN_MAX_LEVELS
3645
3646 typedef struct zdb_cb {
3647 zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
3648 uint64_t zcb_removing_size;
3649 uint64_t zcb_checkpoint_size;
3650 uint64_t zcb_dedup_asize;
3651 uint64_t zcb_dedup_blocks;
3652 uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
3653 uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
3654 [BPE_PAYLOAD_SIZE + 1];
3655 uint64_t zcb_start;
3656 hrtime_t zcb_lastprint;
3657 uint64_t zcb_totalasize;
3658 uint64_t zcb_errors[256];
3659 int zcb_readfails;
3660 int zcb_haderrors;
3661 spa_t *zcb_spa;
3662 uint32_t **zcb_vd_obsolete_counts;
3663 } zdb_cb_t;
3664
3665 /* test if two DVA offsets from the same vdev are within the same metaslab */
3666 static boolean_t
3667 same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2)
3668 {
3669 vdev_t *vd = vdev_lookup_top(spa, vdev);
3670 uint64_t ms_shift = vd->vdev_ms_shift;
3671
3672 return ((off1 >> ms_shift) == (off2 >> ms_shift));
3673 }
3674
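/*
 * Account one block in the traversal statistics: per-level/per-type
 * byte and block counts, the psize histogram, gang and same-vdev/
 * same-metaslab ditto counts, embedded-block histograms, and dedup
 * reference counts.  Unless -L was given, the block is also claimed so
 * that leaked blocks can be detected later.
 */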
3675 static void
3676 zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
3677 dmu_object_type_t type)
3678 {
3679 uint64_t refcnt = 0;
3680 int i;
3681
3682 ASSERT(type < ZDB_OT_TOTAL);
3683
3684 if (zilog && zil_bp_tree_add(zilog, bp) != 0)
3685 return;
3686
3687 spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
3688
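	/*
	 * Count the block four ways: under its specific level and type,
	 * under its level with all types, under its type at all levels,
	 * and in the overall total.
	 */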
3689 for (i = 0; i < 4; i++) {
3690 int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
3691 int t = (i & 1) ? type : ZDB_OT_TOTAL;
3692 int equal;
3693 zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
3694
3695 zb->zb_asize += BP_GET_ASIZE(bp);
3696 zb->zb_lsize += BP_GET_LSIZE(bp);
3697 zb->zb_psize += BP_GET_PSIZE(bp);
3698 zb->zb_count++;
3699
3700 /*
3701 * The histogram is only big enough to record blocks up to
3702 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
3703 * "other", bucket.
3704 */
3705 unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
3706 idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
3707 zb->zb_psize_histogram[idx]++;
3708
3709 zb->zb_gangs += BP_COUNT_GANG(bp);
3710
3711 switch (BP_GET_NDVAS(bp)) {
3712 case 2:
3713 if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
3714 DVA_GET_VDEV(&bp->blk_dva[1])) {
3715 zb->zb_ditto_samevdev++;
3716
3717 if (same_metaslab(zcb->zcb_spa,
3718 DVA_GET_VDEV(&bp->blk_dva[0]),
3719 DVA_GET_OFFSET(&bp->blk_dva[0]),
3720 DVA_GET_OFFSET(&bp->blk_dva[1])))
3721 zb->zb_ditto_same_ms++;
3722 }
3723 break;
3724 case 3:
3725 equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
3726 DVA_GET_VDEV(&bp->blk_dva[1])) +
3727 (DVA_GET_VDEV(&bp->blk_dva[0]) ==
3728 DVA_GET_VDEV(&bp->blk_dva[2])) +
3729 (DVA_GET_VDEV(&bp->blk_dva[1]) ==
3730 DVA_GET_VDEV(&bp->blk_dva[2]));
3731 if (equal != 0) {
3732 zb->zb_ditto_samevdev++;
3733
3734 if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
3735 DVA_GET_VDEV(&bp->blk_dva[1]) &&
3736 same_metaslab(zcb->zcb_spa,
3737 DVA_GET_VDEV(&bp->blk_dva[0]),
3738 DVA_GET_OFFSET(&bp->blk_dva[0]),
3739 DVA_GET_OFFSET(&bp->blk_dva[1])))
3740 zb->zb_ditto_same_ms++;
3741 else if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
3742 DVA_GET_VDEV(&bp->blk_dva[2]) &&
3743 same_metaslab(zcb->zcb_spa,
3744 DVA_GET_VDEV(&bp->blk_dva[0]),
3745 DVA_GET_OFFSET(&bp->blk_dva[0]),
3746 DVA_GET_OFFSET(&bp->blk_dva[2])))
3747 zb->zb_ditto_same_ms++;
3748 else if (DVA_GET_VDEV(&bp->blk_dva[1]) ==
3749 DVA_GET_VDEV(&bp->blk_dva[2]) &&
3750 same_metaslab(zcb->zcb_spa,
3751 DVA_GET_VDEV(&bp->blk_dva[1]),
3752 DVA_GET_OFFSET(&bp->blk_dva[1]),
3753 DVA_GET_OFFSET(&bp->blk_dva[2])))
3754 zb->zb_ditto_same_ms++;
3755 }
3756 break;
3757 }
3758 }
3759
3760 spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG);
3761
3762 if (BP_IS_EMBEDDED(bp)) {
3763 zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
3764 zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
3765 [BPE_GET_PSIZE(bp)]++;
3766 return;
3767 }
3768
3769 if (dump_opt['L'])
3770 return;
3771
3772 if (BP_GET_DEDUP(bp)) {
3773 ddt_t *ddt;
3774 ddt_entry_t *dde;
3775
3776 ddt = ddt_select(zcb->zcb_spa, bp);
3777 ddt_enter(ddt);
3778 dde = ddt_lookup(ddt, bp, B_FALSE);
3779
3780 if (dde == NULL) {
3781 refcnt = 0;
3782 } else {
3783 ddt_phys_t *ddp = ddt_phys_select(dde, bp);
3784 ddt_phys_decref(ddp);
3785 refcnt = ddp->ddp_refcnt;
3786 if (ddt_phys_total_refcnt(dde) == 0)
3787 ddt_remove(ddt, dde);
3788 }
3789 ddt_exit(ddt);
3790 }
3791
3792 VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
3793 refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
3794 bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
3795 }
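
/*
 * Sketch of the four-bucket accumulation done by the i < 4 loop in
 * zdb_count_block() above: each block updates its exact [level][type]
 * cell plus the per-level, per-type, and grand totals, so no separate
 * summing pass is needed later. (Hypothetical helper, compiled out.)
 */
#if 0
static void
zdb_count_block_buckets(zdb_cb_t *zcb, int level, int type)
{
	zcb->zcb_type[level][type].zb_count++;		  /* exact cell */
	zcb->zcb_type[level][ZDB_OT_TOTAL].zb_count++;	  /* level total */
	zcb->zcb_type[ZB_TOTAL][type].zb_count++;	  /* type total */
	zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_count++; /* grand total */
}
#endif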
3796
3797 static void
3798 zdb_blkptr_done(zio_t *zio)
3799 {
3800 spa_t *spa = zio->io_spa;
3801 blkptr_t *bp = zio->io_bp;
3802 int ioerr = zio->io_error;
3803 zdb_cb_t *zcb = zio->io_private;
3804 zbookmark_phys_t *zb = &zio->io_bookmark;
3805
3806 abd_free(zio->io_abd);
3807
3808 mutex_enter(&spa->spa_scrub_lock);
3809 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
3810 cv_broadcast(&spa->spa_scrub_io_cv);
3811
3812 if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3813 char blkbuf[BP_SPRINTF_LEN];
3814
3815 zcb->zcb_haderrors = 1;
3816 zcb->zcb_errors[ioerr]++;
3817
3818 if (dump_opt['b'] >= 2)
3819 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
3820 else
3821 blkbuf[0] = '\0';
3822
3823 (void) printf("zdb_blkptr_cb: "
3824 "Got error %d reading "
3825 "<%llu, %llu, %lld, %llx> %s -- skipping\n",
3826 ioerr,
3827 (u_longlong_t)zb->zb_objset,
3828 (u_longlong_t)zb->zb_object,
3829 (u_longlong_t)zb->zb_level,
3830 (u_longlong_t)zb->zb_blkid,
3831 blkbuf);
3832 }
3833 mutex_exit(&spa->spa_scrub_lock);
3834 }
3835
3836 static int
3837 zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
3838 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
3839 {
3840 zdb_cb_t *zcb = arg;
3841 dmu_object_type_t type;
3842 boolean_t is_metadata;
3843
3844 if (zb->zb_level == ZB_DNODE_LEVEL)
3845 return (0);
3846
3847 if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
3848 char blkbuf[BP_SPRINTF_LEN];
3849 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
3850 (void) printf("objset %llu object %llu "
3851 "level %lld offset 0x%llx %s\n",
3852 (u_longlong_t)zb->zb_objset,
3853 (u_longlong_t)zb->zb_object,
3854 (longlong_t)zb->zb_level,
3855 (u_longlong_t)blkid2offset(dnp, bp, zb),
3856 blkbuf);
3857 }
3858
3859 if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
3860 return (0);
3861
3862 type = BP_GET_TYPE(bp);
3863
3864 zdb_count_block(zcb, zilog, bp,
3865 (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);
3866
3867 is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
3868
3869 if (!BP_IS_EMBEDDED(bp) &&
3870 (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
3871 size_t size = BP_GET_PSIZE(bp);
3872 abd_t *abd = abd_alloc(size, B_FALSE);
3873 int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
3874
3875 /* If it's an intent log block, failure is expected. */
3876 if (zb->zb_level == ZB_ZIL_LEVEL)
3877 flags |= ZIO_FLAG_SPECULATIVE;
3878
3879 mutex_enter(&spa->spa_scrub_lock);
3880 while (spa->spa_load_verify_bytes > max_inflight_bytes)
3881 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
3882 spa->spa_load_verify_bytes += size;
3883 mutex_exit(&spa->spa_scrub_lock);
3884
3885 zio_nowait(zio_read(NULL, spa, bp, abd, size,
3886 zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
3887 }
3888
3889 zcb->zcb_readfails = 0;
3890
3891 /* only call gethrtime() every 100 blocks */
3892 static int iters;
3893 if (++iters > 100)
3894 iters = 0;
3895 else
3896 return (0);
3897
3898 if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
3899 uint64_t now = gethrtime();
3900 char buf[10];
3901 uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
3902 int kb_per_sec =
3903 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
3904 int sec_remaining =
3905 (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;
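/*
 * Note: bytes per elapsed millisecond is used as a stand-in for
 * KB/s; the two differ only by the 1000-vs-1024 factor (~2.4%),
 * which is close enough for a progress estimate.
 */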
3906
3907 /* make sure nicenum has enough space */
3908 CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ);
3909
3910 zfs_nicebytes(bytes, buf, sizeof (buf));
3911 (void) fprintf(stderr,
3912 "\r%5s completed (%4dMB/s) "
3913 "estimated time remaining: %uhr %02umin %02usec ",
3914 buf, kb_per_sec / 1024,
3915 sec_remaining / 60 / 60,
3916 sec_remaining / 60 % 60,
3917 sec_remaining % 60);
3918
3919 zcb->zcb_lastprint = now;
3920 }
3921
3922 return (0);
3923 }
3924
3925 static void
3926 zdb_leak(void *arg, uint64_t start, uint64_t size)
3927 {
3928 vdev_t *vd = arg;
3929
3930 (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
3931 (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
3932 }
3933
3934 static metaslab_ops_t zdb_metaslab_ops = {
3935 NULL /* alloc */
3936 };
3937
3938 typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme,
3939 uint64_t txg, void *arg);
3940
3941 typedef struct unflushed_iter_cb_arg {
3942 spa_t *uic_spa;
3943 uint64_t uic_txg;
3944 void *uic_arg;
3945 zdb_log_sm_cb_t uic_cb;
3946 } unflushed_iter_cb_arg_t;
3947
3948 static int
3949 iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)
3950 {
3951 unflushed_iter_cb_arg_t *uic = arg;
3952 return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg));
3953 }
3954
3955 static void
3956 iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg)
3957 {
3958 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
3959 return;
3960
3961 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3962 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
3963 sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
3964 space_map_t *sm = NULL;
3965 VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
3966 sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
3967
3968 unflushed_iter_cb_arg_t uic = {
3969 .uic_spa = spa,
3970 .uic_txg = sls->sls_txg,
3971 .uic_arg = arg,
3972 .uic_cb = cb
3973 };
3974
3975 VERIFY0(space_map_iterate(sm, space_map_length(sm),
3976 iterate_through_spacemap_logs_cb, &uic));
3977 space_map_close(sm);
3978 }
3979 spa_config_exit(spa, SCL_CONFIG, FTAG);
3980 }
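
/*
 * Example zdb_log_sm_cb_t callback (hypothetical, compiled out):
 * counts unflushed log entries, showing how the trampoline above
 * delivers (spa, sme, txg, arg) to the caller's function.
 */
#if 0
/* ARGSUSED */
static int
count_log_sm_entries_cb(spa_t *spa, space_map_entry_t *sme,
    uint64_t txg, void *arg)
{
	uint64_t *count = arg;

	(*count)++;
	return (0);
}

/*
 * Usage:
 *	uint64_t n = 0;
 *	iterate_through_spacemap_logs(spa, count_log_sm_entries_cb, &n);
 */
#endif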
3981
3982 /* ARGSUSED */
3983 static int
3984 load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme,
3985 uint64_t txg, void *arg)
3986 {
3987 spa_vdev_removal_t *svr = arg;
3988
3989 uint64_t offset = sme->sme_offset;
3990 uint64_t size = sme->sme_run;
3991
3992 /* skip vdevs we don't care about */
3993 if (sme->sme_vdev != svr->svr_vdev_id)
3994 return (0);
3995
3996 vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev);
3997 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3998 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
3999
4000 if (txg < metaslab_unflushed_txg(ms))
4001 return (0);
4002
4003 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
4004 ASSERT(vim != NULL);
4005 if (offset >= vdev_indirect_mapping_max_offset(vim))
4006 return (0);
4007
4008 if (sme->sme_type == SM_ALLOC)
4009 range_tree_add(svr->svr_allocd_segs, offset, size);
4010 else
4011 range_tree_remove(svr->svr_allocd_segs, offset, size);
4012
4013 return (0);
4014 }
4015
4016 /* ARGSUSED */
4017 static void
4018 claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
4019 uint64_t size, void *arg)
4020 {
4021 /*
4022 * This callback was called through a remap from
4023 * a device being removed. Therefore, the vdev that
4024 * this callback is applied to is a concrete
4025 * vdev.
4026 */
4027 ASSERT(vdev_is_concrete(vd));
4028
4029 VERIFY0(metaslab_claim_impl(vd, offset, size,
4030 spa_min_claim_txg(vd->vdev_spa)));
4031 }
4032
4033 static void
4034 claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
4035 {
4036 vdev_t *vd = arg;
4037
4038 vdev_indirect_ops.vdev_op_remap(vd, offset, size,
4039 claim_segment_impl_cb, NULL);
4040 }
4041
4042 /*
4043 * After accounting for all allocated blocks that are directly referenced,
4044 * we might have missed a reference to a block from a partially complete
4045 * (and thus unused) indirect mapping object. We perform a secondary pass
4046 * through the metaslabs we have already mapped and claim the destination
4047 * blocks.
4048 */
4049 static void
4050 zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
4051 {
4052 if (dump_opt['L'])
4053 return;
4054
4055 if (spa->spa_vdev_removal == NULL)
4056 return;
4057
4058 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4059
4060 spa_vdev_removal_t *svr = spa->spa_vdev_removal;
4061 vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
4062 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
4063
4064 ASSERT0(range_tree_space(svr->svr_allocd_segs));
4065
4066 range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
4067 for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
4068 metaslab_t *msp = vd->vdev_ms[msi];
4069
4070 if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
4071 break;
4072
4073 ASSERT0(range_tree_space(allocs));
4074 if (msp->ms_sm != NULL)
4075 VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC));
4076 range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs);
4077 }
4078 range_tree_destroy(allocs);
4079
4080 iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr);
4081
4082 /*
4083 * Clear everything past what has been synced,
4084 * because we have not allocated mappings for
4085 * it yet.
4086 */
4087 range_tree_clear(svr->svr_allocd_segs,
4088 vdev_indirect_mapping_max_offset(vim),
4089 vd->vdev_asize - vdev_indirect_mapping_max_offset(vim));
4090
4091 zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs);
4092 range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);
4093
4094 spa_config_exit(spa, SCL_CONFIG, FTAG);
4095 }
4096
4097 /* ARGSUSED */
4098 static int
4099 increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
4100 dmu_tx_t *tx)
4101 {
4102 zdb_cb_t *zcb = arg;
4103 spa_t *spa = zcb->zcb_spa;
4104 vdev_t *vd;
4105 const dva_t *dva = &bp->blk_dva[0];
4106
4107 ASSERT(!bp_freed);
4108 ASSERT(!dump_opt['L']);
4109 ASSERT3U(BP_GET_NDVAS(bp), ==, 1);
4110
4111 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
4112 vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva));
4113 ASSERT3P(vd, !=, NULL);
4114 spa_config_exit(spa, SCL_VDEV, FTAG);
4115
4116 ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
4117 ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);
4118
4119 vdev_indirect_mapping_increment_obsolete_count(
4120 vd->vdev_indirect_mapping,
4121 DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),
4122 zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
4123
4124 return (0);
4125 }
4126
4127 static uint32_t *
4128 zdb_load_obsolete_counts(vdev_t *vd)
4129 {
4130 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
4131 spa_t *spa = vd->vdev_spa;
4132 spa_condensing_indirect_phys_t *scip =
4133 &spa->spa_condensing_indirect_phys;
4134 uint64_t obsolete_sm_object;
4135 uint32_t *counts;
4136
4137 VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
4138 EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL);
4139 counts = vdev_indirect_mapping_load_obsolete_counts(vim);
4140 if (vd->vdev_obsolete_sm != NULL) {
4141 vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
4142 vd->vdev_obsolete_sm);
4143 }
4144 if (scip->scip_vdev == vd->vdev_id &&
4145 scip->scip_prev_obsolete_sm_object != 0) {
4146 space_map_t *prev_obsolete_sm = NULL;
4147 VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
4148 scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
4149 vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
4150 prev_obsolete_sm);
4151 space_map_close(prev_obsolete_sm);
4152 }
4153 return (counts);
4154 }
4155
4156 static void
4157 zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
4158 {
4159 ddt_bookmark_t ddb;
4160 ddt_entry_t dde;
4161 int error;
4162 int p;
4163
4164 ASSERT(!dump_opt['L']);
4165
4166 bzero(&ddb, sizeof (ddb));
4167 while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
4168 blkptr_t blk;
4169 ddt_phys_t *ddp = dde.dde_phys;
4170
4171 if (ddb.ddb_class == DDT_CLASS_UNIQUE)
4172 return;
4173
4174 ASSERT(ddt_phys_total_refcnt(&dde) > 1);
4175
4176 for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
4177 if (ddp->ddp_phys_birth == 0)
4178 continue;
4179 ddt_bp_create(ddb.ddb_checksum,
4180 &dde.dde_key, ddp, &blk);
4181 if (p == DDT_PHYS_DITTO) {
4182 zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
4183 } else {
4184 zcb->zcb_dedup_asize +=
4185 BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
4186 zcb->zcb_dedup_blocks++;
4187 }
4188 }
4189 ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
4190 ddt_enter(ddt);
4191 VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
4192 ddt_exit(ddt);
4193 }
4194
4195 ASSERT(error == ENOENT);
4196 }
4197
4198 typedef struct checkpoint_sm_exclude_entry_arg {
4199 vdev_t *cseea_vd;
4200 uint64_t cseea_checkpoint_size;
4201 } checkpoint_sm_exclude_entry_arg_t;
4202
4203 static int
4204 checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
4205 {
4206 checkpoint_sm_exclude_entry_arg_t *cseea = arg;
4207 vdev_t *vd = cseea->cseea_vd;
4208 metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
4209 uint64_t end = sme->sme_offset + sme->sme_run;
4210
4211 ASSERT(sme->sme_type == SM_FREE);
4212
4213 /*
4214 * Since the vdev_checkpoint_sm exists in the vdev level
4215 * and the ms_sm space maps exist in the metaslab level,
4216 * an entry in the checkpoint space map could theoretically
4217 * cross the boundaries of the metaslab that it belongs to.
4218 *
4219 * In reality, because of the way that we populate and
4220 * manipulate the checkpoint's space maps currently,
4221 * there shouldn't be any entries that cross metaslabs.
4222 * Hence the assertion below.
4223 *
4224 * That said, there is no fundamental requirement that
4225 * the checkpoint's space map entries should not cross
4226 * metaslab boundaries. So if needed we could add code
4227 * that handles metaslab-crossing segments in the future.
4228 */
4229 VERIFY3U(sme->sme_offset, >=, ms->ms_start);
4230 VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
4231
4232 /*
4233 * By removing the entry from the allocated segments we
4234 * also verify that the entry is there to begin with.
4235 */
4236 mutex_enter(&ms->ms_lock);
4237 range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
4238 mutex_exit(&ms->ms_lock);
4239
4240 cseea->cseea_checkpoint_size += sme->sme_run;
4241 return (0);
4242 }
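
/*
 * Worked example for the bounds checks above (assumed geometry, not
 * from a real pool): with 512MB metaslabs, an entry with sme_offset
 * 0x30001000 and sme_run 0x2000 maps to metaslab 1 (ms_start
 * 0x20000000, ms_size 0x20000000); 0x30001000 >= ms_start and
 * 0x30003000 <= ms_start + ms_size, so both VERIFYs pass.
 */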
4243
4244 static void
4245 zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
4246 {
4247 spa_t *spa = vd->vdev_spa;
4248 space_map_t *checkpoint_sm = NULL;
4249 uint64_t checkpoint_sm_obj;
4250
4251 /*
4252 * If there is no vdev_top_zap, we are in a pool whose
4253 * version predates the pool checkpoint feature.
4254 */
4255 if (vd->vdev_top_zap == 0)
4256 return;
4257
4258 /*
4259 * If there is no reference of the vdev_checkpoint_sm in
4260 * the vdev_top_zap, then one of the following scenarios
4261 * is true:
4262 *
4263 * 1] There is no checkpoint
4264 * 2] There is a checkpoint, but no checkpointed blocks
4265 * have been freed yet
4266 * 3] The current vdev is indirect
4267 *
4268 * In these cases we return immediately.
4269 */
4270 if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
4271 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
4272 return;
4273
4274 VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
4275 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
4276 &checkpoint_sm_obj));
4277
4278 checkpoint_sm_exclude_entry_arg_t cseea;
4279 cseea.cseea_vd = vd;
4280 cseea.cseea_checkpoint_size = 0;
4281
4282 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
4283 checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
4284
4285 VERIFY0(space_map_iterate(checkpoint_sm,
4286 space_map_length(checkpoint_sm),
4287 checkpoint_sm_exclude_entry_cb, &cseea));
4288 space_map_close(checkpoint_sm);
4289
4290 zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
4291 }
4292
4293 static void
4294 zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
4295 {
4296 ASSERT(!dump_opt['L']);
4297
4298 vdev_t *rvd = spa->spa_root_vdev;
4299 for (uint64_t c = 0; c < rvd->vdev_children; c++) {
4300 ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
4301 zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
4302 }
4303 }
4304
4305 static int
4306 count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme,
4307 uint64_t txg, void *arg)
4308 {
4309 int64_t *ualloc_space = arg;
4310
4311 uint64_t offset = sme->sme_offset;
4312 uint64_t vdev_id = sme->sme_vdev;
4313
4314 vdev_t *vd = vdev_lookup_top(spa, vdev_id);
4315 if (!vdev_is_concrete(vd))
4316 return (0);
4317
4318 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
4319 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
4320
4321 if (txg < metaslab_unflushed_txg(ms))
4322 return (0);
4323
4324 if (sme->sme_type == SM_ALLOC)
4325 *ualloc_space += sme->sme_run;
4326 else
4327 *ualloc_space -= sme->sme_run;
4328
4329 return (0);
4330 }
4331
4332 static int64_t
4333 get_unflushed_alloc_space(spa_t *spa)
4334 {
4335 if (dump_opt['L'])
4336 return (0);
4337
4338 int64_t ualloc_space = 0;
4339 iterate_through_spacemap_logs(spa, count_unflushed_space_cb,
4340 &ualloc_space);
4341 return (ualloc_space);
4342 }
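
/*
 * The value computed above is a signed net: unflushed SM_ALLOC runs
 * minus unflushed SM_FREE runs. For example (hypothetical numbers),
 * logs holding 3M of allocations and 1M of frees yield +2M; a pool
 * that has only freed since its last flush can legitimately return
 * a negative value, hence the int64_t.
 */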
4343
4344 static int
4345 load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg)
4346 {
4347 maptype_t *uic_maptype = arg;
4348
4349 uint64_t offset = sme->sme_offset;
4350 uint64_t size = sme->sme_run;
4351 uint64_t vdev_id = sme->sme_vdev;
4352
4353 vdev_t *vd = vdev_lookup_top(spa, vdev_id);
4354
4355 /* skip indirect vdevs */
4356 if (!vdev_is_concrete(vd))
4357 return (0);
4358
4359 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
4360
4361 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
4362 ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE);
4363
4364 if (txg < metaslab_unflushed_txg(ms))
4365 return (0);
4366
4367 if (*uic_maptype == sme->sme_type)
4368 range_tree_add(ms->ms_allocatable, offset, size);
4369 else
4370 range_tree_remove(ms->ms_allocatable, offset, size);
4371
4372 return (0);
4373 }
4374
4375 static void
4376 load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype)
4377 {
4378 iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype);
4379 }
4380
4381 static void
4382 load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
4383 {
4384 vdev_t *rvd = spa->spa_root_vdev;
4385 for (uint64_t i = 0; i < rvd->vdev_children; i++) {
4386 vdev_t *vd = rvd->vdev_child[i];
4387
4388 ASSERT3U(i, ==, vd->vdev_id);
4389
4390 if (vd->vdev_ops == &vdev_indirect_ops)
4391 continue;
4392
4393 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
4394 metaslab_t *msp = vd->vdev_ms[m];
4395
4396 (void) fprintf(stderr,
4397 "\rloading concrete vdev %llu, "
4398 "metaslab %llu of %llu ...",
4399 (longlong_t)vd->vdev_id,
4400 (longlong_t)msp->ms_id,
4401 (longlong_t)vd->vdev_ms_count);
4402
4403 mutex_enter(&msp->ms_lock);
4404 range_tree_vacate(msp->ms_allocatable, NULL, NULL);
4405
4406 /*
4407 * We don't want to spend the CPU manipulating the
4408 * size-ordered tree, so clear the range_tree ops.
4409 */
4410 msp->ms_allocatable->rt_ops = NULL;
4411
4412 if (msp->ms_sm != NULL) {
4413 VERIFY0(space_map_load(msp->ms_sm,
4414 msp->ms_allocatable, maptype));
4415 }
4416 if (!msp->ms_loaded)
4417 msp->ms_loaded = B_TRUE;
4418 mutex_exit(&msp->ms_lock);
4419 }
4420 }
4421
4422 load_unflushed_to_ms_allocatables(spa, maptype);
4423 }
4424
4425 /*
4426 * vim_idxp is an in-out parameter which (for indirect vdevs) is the
4427 * index in vim_entries that has the first entry in this metaslab.
4428 * On return, it will be set to the first entry after this metaslab.
4429 */
4430 static void
4431 load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
4432 uint64_t *vim_idxp)
4433 {
4434 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
4435
4436 mutex_enter(&msp->ms_lock);
4437 range_tree_vacate(msp->ms_allocatable, NULL, NULL);
4438
4439 /*
4440 * We don't want to spend the CPU manipulating the
4441 * size-ordered tree, so clear the range_tree ops.
4442 */
4443 msp->ms_allocatable->rt_ops = NULL;
4444
4445 for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
4446 (*vim_idxp)++) {
4447 vdev_indirect_mapping_entry_phys_t *vimep =
4448 &vim->vim_entries[*vim_idxp];
4449 uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
4450 uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
4451 ASSERT3U(ent_offset, >=, msp->ms_start);
4452 if (ent_offset >= msp->ms_start + msp->ms_size)
4453 break;
4454
4455 /*
4456 * Mappings do not cross metaslab boundaries,
4457 * because we create them by walking the metaslabs.
4458 */
4459 ASSERT3U(ent_offset + ent_len, <=,
4460 msp->ms_start + msp->ms_size);
4461 range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
4462 }
4463
4464 if (!msp->ms_loaded)
4465 msp->ms_loaded = B_TRUE;
4466 mutex_exit(&msp->ms_lock);
4467 }
4468
4469 static void
4470 zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
4471 {
4472 ASSERT(!dump_opt['L']);
4473
4474 vdev_t *rvd = spa->spa_root_vdev;
4475 for (uint64_t c = 0; c < rvd->vdev_children; c++) {
4476 vdev_t *vd = rvd->vdev_child[c];
4477
4478 ASSERT3U(c, ==, vd->vdev_id);
4479
4480 if (vd->vdev_ops != &vdev_indirect_ops)
4481 continue;
4482
4483 /*
4484 * Note: we don't check for mapping leaks on
4485 * removing vdevs because their ms_allocatable's
4486 * are used to look for leaks in allocated space.
4487 */
4488 zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);
4489
4490 /*
4491 * Normally, indirect vdevs don't have any
4492 * metaslabs. We want to set them up for
4493 * zio_claim().
4494 */
4495 VERIFY0(vdev_metaslab_init(vd, 0));
4496
4497 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
4498 uint64_t vim_idx = 0;
4499 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
4500
4501 (void) fprintf(stderr,
4502 "\rloading indirect vdev %llu, "
4503 "metaslab %llu of %llu ...",
4504 (longlong_t)vd->vdev_id,
4505 (longlong_t)vd->vdev_ms[m]->ms_id,
4506 (longlong_t)vd->vdev_ms_count);
4507
4508 load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
4509 &vim_idx);
4510 }
4511 ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
4512 }
4513 }
4514
4515 static void
4516 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
4517 {
4518 zcb->zcb_spa = spa;
4519
4520 if (dump_opt['L'])
4521 return;
4522
4523 dsl_pool_t *dp = spa->spa_dsl_pool;
4524 vdev_t *rvd = spa->spa_root_vdev;
4525
4526 /*
4527 * We are going to be changing the meaning of the metaslab's
4528 * ms_allocatable. Ensure that the allocator doesn't try to
4529 * use the tree.
4530 */
4531 spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
4532 spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
4533
4534 zcb->zcb_vd_obsolete_counts =
4535 umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
4536 UMEM_NOFAIL);
4537
4538 /*
4539 * For leak detection, we overload the ms_allocatable trees
4540 * to contain allocated segments instead of free segments.
4541 * As a result, we can't use the normal metaslab_load/unload
4542 * interfaces.
4543 */
4544 zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
4545 load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
4546
4547 /*
4548 * On load_concrete_ms_allocatable_trees() we loaded all the
4549 * allocated entries from the ms_sm to the ms_allocatable for
4550 * each metaslab. If the pool has a checkpoint or is in the
4551 * middle of discarding a checkpoint, some of these blocks
4552 * may have been freed but their ms_sm may not have been
4553 * updated because they are referenced by the checkpoint. In
4554 * order to avoid false-positives during leak-detection, we
4555 * go through the vdev's checkpoint space map and exclude all
4556 * its entries from their relevant ms_allocatable.
4557 *
4558 * We also aggregate the space held by the checkpoint and add
4559 * it to zcb_checkpoint_size.
4560 *
4561 * Note that at this point we are also verifying that all the
4562 * entries on the checkpoint_sm are marked as allocated in
4563 * the ms_sm of their relevant metaslab.
4564 * [see comment in checkpoint_sm_exclude_entry_cb()]
4565 */
4566 zdb_leak_init_exclude_checkpoint(spa, zcb);
4567 ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
4568
4569 /* for cleaner progress output */
4570 (void) fprintf(stderr, "\n");
4571
4572 if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
4573 ASSERT(spa_feature_is_enabled(spa,
4574 SPA_FEATURE_DEVICE_REMOVAL));
4575 (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
4576 increment_indirect_mapping_cb, zcb, NULL);
4577 }
4578
4579 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4580 zdb_ddt_leak_init(spa, zcb);
4581 spa_config_exit(spa, SCL_CONFIG, FTAG);
4582 }
4583
4584 static boolean_t
4585 zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
4586 {
4587 boolean_t leaks = B_FALSE;
4588 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
4589 uint64_t total_leaked = 0;
4590 boolean_t are_precise = B_FALSE;
4591
4592 ASSERT(vim != NULL);
4593
4594 for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
4595 vdev_indirect_mapping_entry_phys_t *vimep =
4596 &vim->vim_entries[i];
4597 uint64_t obsolete_bytes = 0;
4598 uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
4599 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
4600
4601 /*
4602 * This is not very efficient but it's easy to
4603 * verify correctness.
4604 */
4605 for (uint64_t inner_offset = 0;
4606 inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
4607 inner_offset += 1 << vd->vdev_ashift) {
4608 if (range_tree_contains(msp->ms_allocatable,
4609 offset + inner_offset, 1 << vd->vdev_ashift)) {
4610 obsolete_bytes += 1 << vd->vdev_ashift;
4611 }
4612 }
4613
4614 int64_t bytes_leaked = obsolete_bytes -
4615 zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];
4616 ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,
4617 zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);
4618
4619 VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
4620 if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) {
4621 (void) printf("obsolete indirect mapping count "
4622 "mismatch on %llu:%llx:%llx : %llx bytes leaked\n",
4623 (u_longlong_t)vd->vdev_id,
4624 (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
4625 (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
4626 (u_longlong_t)bytes_leaked);
4627 }
4628 total_leaked += ABS(bytes_leaked);
4629 }
4630
4631 VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
4632 if (!are_precise && total_leaked > 0) {
4633 int pct_leaked = total_leaked * 100 /
4634 vdev_indirect_mapping_bytes_mapped(vim);
4635 (void) printf("cannot verify obsolete indirect mapping "
4636 "counts of vdev %llu because precise feature was not "
4637 "enabled when it was removed: %d%% (%llx bytes) of mapping"
4638 "unreferenced\n",
4639 (u_longlong_t)vd->vdev_id, pct_leaked,
4640 (u_longlong_t)total_leaked);
4641 } else if (total_leaked > 0) {
4642 (void) printf("obsolete indirect mapping count mismatch "
4643 "for vdev %llu -- %llx total bytes mismatched\n",
4644 (u_longlong_t)vd->vdev_id,
4645 (u_longlong_t)total_leaked);
4646 leaks |= B_TRUE;
4647 }
4648
4649 vdev_indirect_mapping_free_obsolete_counts(vim,
4650 zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
4651 zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
4652
4653 return (leaks);
4654 }
4655
4656 static boolean_t
4657 zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
4658 {
4659 if (dump_opt['L'])
4660 return (B_FALSE);
4661
4662 boolean_t leaks = B_FALSE;
4663 vdev_t *rvd = spa->spa_root_vdev;
4664 for (unsigned c = 0; c < rvd->vdev_children; c++) {
4665 vdev_t *vd = rvd->vdev_child[c];
4666 ASSERTV(metaslab_group_t *mg = vd->vdev_mg);
4667
4668 if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
4669 leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
4670 }
4671
4672 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
4673 metaslab_t *msp = vd->vdev_ms[m];
4674 ASSERT3P(mg, ==, msp->ms_group);
4675
4676 /*
4677 * ms_allocatable has been overloaded
4678 * to contain allocated segments. Now that
4679 * we finished traversing all blocks, any
4680 * block that remains in the ms_allocatable
4681 * represents an allocated block that we
4682 * did not claim during the traversal.
4683 * Claimed blocks would have been removed
4684 * from the ms_allocatable. For indirect
4685 * vdevs, space remaining in the tree
4686 * represents parts of the mapping that are
4687 * not referenced, which is not a bug.
4688 */
4689 if (vd->vdev_ops == &vdev_indirect_ops) {
4690 range_tree_vacate(msp->ms_allocatable,
4691 NULL, NULL);
4692 } else {
4693 range_tree_vacate(msp->ms_allocatable,
4694 zdb_leak, vd);
4695 }
4696 if (msp->ms_loaded) {
4697 msp->ms_loaded = B_FALSE;
4698 }
4699 }
4700 }
4701
4702 umem_free(zcb->zcb_vd_obsolete_counts,
4703 rvd->vdev_children * sizeof (uint32_t *));
4704 zcb->zcb_vd_obsolete_counts = NULL;
4705
4706 return (leaks);
4707 }
4708
4709 /* ARGSUSED */
4710 static int
4711 count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
4712 {
4713 zdb_cb_t *zcb = arg;
4714
4715 if (dump_opt['b'] >= 5) {
4716 char blkbuf[BP_SPRINTF_LEN];
4717 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
4718 (void) printf("[%s] %s\n",
4719 "deferred free", blkbuf);
4720 }
4721 zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
4722 return (0);
4723 }
4724
4725 /*
4726 * Iterate over livelists which have been destroyed by the user but
4727 * are still present in the MOS, waiting to be freed
4728 */
4729 typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);
4730
4731 static void
4732 iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg)
4733 {
4734 objset_t *mos = spa->spa_meta_objset;
4735 uint64_t zap_obj;
4736 int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
4737 DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
4738 if (err == ENOENT)
4739 return;
4740 ASSERT0(err);
4741
4742 zap_cursor_t zc;
4743 zap_attribute_t attr;
4744 dsl_deadlist_t ll;
4745 /* NULL out os prior to dsl_deadlist_open in case it's garbage */
4746 ll.dl_os = NULL;
4747 for (zap_cursor_init(&zc, mos, zap_obj);
4748 zap_cursor_retrieve(&zc, &attr) == 0;
4749 (void) zap_cursor_advance(&zc)) {
4750 dsl_deadlist_open(&ll, mos, attr.za_first_integer);
4751 func(&ll, arg);
4752 dsl_deadlist_close(&ll);
4753 }
4754 zap_cursor_fini(&zc);
4755 }
4756
4757 static int
4758 bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
4759 dmu_tx_t *tx)
4760 {
4761 ASSERT(!bp_freed);
4762 return (count_block_cb(arg, bp, tx));
4763 }
4764
4765 static int
4766 livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle)
4767 {
4768 zdb_cb_t *zbc = args;
4769 bplist_t blks;
4770 bplist_create(&blks);
4771 /* determine which blocks have been alloc'd but not freed */
4772 VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL));
4773 /* count those blocks */
4774 (void) bplist_iterate(&blks, count_block_cb, zbc, NULL);
4775 bplist_destroy(&blks);
4776 return (0);
4777 }
4778
4779 static void
4780 livelist_count_blocks(dsl_deadlist_t *ll, void *arg)
4781 {
4782 dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg);
4783 }
4784
4785 /*
4786 * Count the blocks in the livelists that have been destroyed by the user
4787 * but haven't yet been freed.
4788 */
4789 static void
4790 deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc)
4791 {
4792 iterate_deleted_livelists(spa, livelist_count_blocks, zbc);
4793 }
4794
4795 static void
4796 dump_livelist_cb(dsl_deadlist_t *ll, void *arg)
4797 {
4798 ASSERT3P(arg, ==, NULL);
4799 global_feature_count[SPA_FEATURE_LIVELIST]++;
4800 dump_blkptr_list(ll, "Deleted Livelist");
4801 }
4802
4803 /*
4804 * Print out, register object references to, and increment feature counts for
4805 * livelists that have been destroyed by the user but haven't yet been freed.
4806 */
4807 static void
4808 deleted_livelists_dump_mos(spa_t *spa)
4809 {
4810 uint64_t zap_obj;
4811 objset_t *mos = spa->spa_meta_objset;
4812 int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
4813 DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
4814 if (err == ENOENT)
4815 return;
4816 mos_obj_refd(zap_obj);
4817 iterate_deleted_livelists(spa, dump_livelist_cb, NULL);
4818 }
4819
4820 static int
4821 dump_block_stats(spa_t *spa)
4822 {
4823 zdb_cb_t zcb;
4824 zdb_blkstats_t *zb, *tzb;
4825 uint64_t norm_alloc, norm_space, total_alloc, total_found;
4826 int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
4827 TRAVERSE_NO_DECRYPT | TRAVERSE_HARD;
4828 boolean_t leaks = B_FALSE;
4829 int e, c, err;
4830 bp_embedded_type_t i;
4831
4833 (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
4834 (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
4835 (dump_opt['c'] == 1) ? "metadata " : "",
4836 dump_opt['c'] ? "checksums " : "",
4837 (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
4838 !dump_opt['L'] ? "nothing leaked " : "");
4839
4840 /*
4841 * When leak detection is enabled we load all space maps as SM_ALLOC
4842 * maps, then traverse the pool claiming each block we discover. If
4843 * the pool is perfectly consistent, the segment trees will be empty
4844 * when we're done. Anything left over is a leak; any block we can't
4845 * claim (because it's not part of any space map) is a double
4846 * allocation, reference to a freed block, or an unclaimed log block.
4847 *
4848 * When leak detection is disabled (-L option) we still traverse the
4849 * pool claiming each block we discover, but we skip opening any space
4850 * maps.
4851 */
4852 bzero(&zcb, sizeof (zdb_cb_t));
4853 zdb_leak_init(spa, &zcb);
4854
4855 /*
4856 * If there's a deferred-free bplist, process that first.
4857 */
4858 (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
4859 bpobj_count_block_cb, &zcb, NULL);
4860
4861 if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
4862 (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
4863 bpobj_count_block_cb, &zcb, NULL);
4864 }
4865
4866 zdb_claim_removing(spa, &zcb);
4867
4868 if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
4869 VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
4870 spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
4871 &zcb, NULL));
4872 }
4873
4874 deleted_livelists_count_blocks(spa, &zcb);
4875
4876 if (dump_opt['c'] > 1)
4877 flags |= TRAVERSE_PREFETCH_DATA;
4878
4879 zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
4880 zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
4881 zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
4882 zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
4883 err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
4884
4885 /*
4886 * If we've traversed the data blocks then we need to wait for those
4887 * I/Os to complete. We leverage "The Godfather" zio to wait on
4888 * all async I/Os to complete.
4889 */
4890 if (dump_opt['c']) {
4891 for (c = 0; c < max_ncpus; c++) {
4892 (void) zio_wait(spa->spa_async_zio_root[c]);
4893 spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL,
4894 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
4895 ZIO_FLAG_GODFATHER);
4896 }
4897 }
4898 ASSERT0(spa->spa_load_verify_bytes);
4899
4900 /*
4901 * Done after zio_wait() since zcb_haderrors is modified in
4902 * zdb_blkptr_done()
4903 */
4904 zcb.zcb_haderrors |= err;
4905
4906 if (zcb.zcb_haderrors) {
4907 (void) printf("\nError counts:\n\n");
4908 (void) printf("\t%5s %s\n", "errno", "count");
4909 for (e = 0; e < 256; e++) {
4910 if (zcb.zcb_errors[e] != 0) {
4911 (void) printf("\t%5d %llu\n",
4912 e, (u_longlong_t)zcb.zcb_errors[e]);
4913 }
4914 }
4915 }
4916
4917 /*
4918 * Report any leaked segments.
4919 */
4920 leaks |= zdb_leak_fini(spa, &zcb);
4921
4922 tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
4923
4924 norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
4925 norm_space = metaslab_class_get_space(spa_normal_class(spa));
4926
4927 total_alloc = norm_alloc +
4928 metaslab_class_get_alloc(spa_log_class(spa)) +
4929 metaslab_class_get_alloc(spa_special_class(spa)) +
4930 metaslab_class_get_alloc(spa_dedup_class(spa)) +
4931 get_unflushed_alloc_space(spa);
4932 total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
4933 zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
4934
4935 if (total_found == total_alloc && !dump_opt['L']) {
4936 (void) printf("\n\tNo leaks (block sum matches space"
4937 " maps exactly)\n");
4938 } else if (!dump_opt['L']) {
4939 (void) printf("block traversal size %llu != alloc %llu "
4940 "(%s %lld)\n",
4941 (u_longlong_t)total_found,
4942 (u_longlong_t)total_alloc,
4943 (dump_opt['L']) ? "unreachable" : "leaked",
4944 (longlong_t)(total_alloc - total_found));
4945 leaks = B_TRUE;
4946 }
4947
4948 if (tzb->zb_count == 0)
4949 return (2);
4950
4951 (void) printf("\n");
4952 (void) printf("\t%-16s %14llu\n", "bp count:",
4953 (u_longlong_t)tzb->zb_count);
4954 (void) printf("\t%-16s %14llu\n", "ganged count:",
4955 (longlong_t)tzb->zb_gangs);
4956 (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:",
4957 (u_longlong_t)tzb->zb_lsize,
4958 (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
4959 (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n",
4960 "bp physical:", (u_longlong_t)tzb->zb_psize,
4961 (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
4962 (double)tzb->zb_lsize / tzb->zb_psize);
4963 (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n",
4964 "bp allocated:", (u_longlong_t)tzb->zb_asize,
4965 (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
4966 (double)tzb->zb_lsize / tzb->zb_asize);
4967 (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n",
4968 "bp deduped:", (u_longlong_t)zcb.zcb_dedup_asize,
4969 (u_longlong_t)zcb.zcb_dedup_blocks,
4970 (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
4971 (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:",
4972 (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
4973
4974 if (spa_special_class(spa)->mc_rotor != NULL) {
4975 uint64_t alloc = metaslab_class_get_alloc(
4976 spa_special_class(spa));
4977 uint64_t space = metaslab_class_get_space(
4978 spa_special_class(spa));
4979
4980 (void) printf("\t%-16s %14llu used: %5.2f%%\n",
4981 "Special class", (u_longlong_t)alloc,
4982 100.0 * alloc / space);
4983 }
4984
4985 if (spa_dedup_class(spa)->mc_rotor != NULL) {
4986 uint64_t alloc = metaslab_class_get_alloc(
4987 spa_dedup_class(spa));
4988 uint64_t space = metaslab_class_get_space(
4989 spa_dedup_class(spa));
4990
4991 (void) printf("\t%-16s %14llu used: %5.2f%%\n",
4992 "Dedup class", (u_longlong_t)alloc,
4993 100.0 * alloc / space);
4994 }
4995
4996 for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
4997 if (zcb.zcb_embedded_blocks[i] == 0)
4998 continue;
4999 (void) printf("\n");
5000 (void) printf("\tadditional, non-pointer bps of type %u: "
5001 "%10llu\n",
5002 i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
5003
5004 if (dump_opt['b'] >= 3) {
5005 (void) printf("\t number of (compressed) bytes: "
5006 "number of bps\n");
5007 dump_histogram(zcb.zcb_embedded_histogram[i],
5008 sizeof (zcb.zcb_embedded_histogram[i]) /
5009 sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
5010 }
5011 }
5012
5013 if (tzb->zb_ditto_samevdev != 0) {
5014 (void) printf("\tDittoed blocks on same vdev: %llu\n",
5015 (longlong_t)tzb->zb_ditto_samevdev);
5016 }
5017 if (tzb->zb_ditto_same_ms != 0) {
5018 (void) printf("\tDittoed blocks in same metaslab: %llu\n",
5019 (longlong_t)tzb->zb_ditto_same_ms);
5020 }
5021
5022 for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) {
5023 vdev_t *vd = spa->spa_root_vdev->vdev_child[v];
5024 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
5025
5026 if (vim == NULL) {
5027 continue;
5028 }
5029
5030 char mem[32];
5031 zdb_nicenum(vdev_indirect_mapping_num_entries(vim),
5032 mem, vdev_indirect_mapping_size(vim));
5033
5034 (void) printf("\tindirect vdev id %llu has %llu segments "
5035 "(%s in memory)\n",
5036 (longlong_t)vd->vdev_id,
5037 (longlong_t)vdev_indirect_mapping_num_entries(vim), mem);
5038 }
5039
5040 if (dump_opt['b'] >= 2) {
5041 int l, t, level;
5042 (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
5043 "\t avg\t comp\t%%Total\tType\n");
5044
5045 for (t = 0; t <= ZDB_OT_TOTAL; t++) {
5046 char csize[32], lsize[32], psize[32], asize[32];
5047 char avg[32], gang[32];
5048 const char *typename;
5049
5050 /* make sure nicenum has enough space */
5051 CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ);
5052 CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
5053 CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ);
5054 CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
5055 CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ);
5056 CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ);
5057
5058 if (t < DMU_OT_NUMTYPES)
5059 typename = dmu_ot[t].ot_name;
5060 else
5061 typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
5062
5063 if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
5064 (void) printf("%6s\t%5s\t%5s\t%5s"
5065 "\t%5s\t%5s\t%6s\t%s\n",
5066 "-",
5067 "-",
5068 "-",
5069 "-",
5070 "-",
5071 "-",
5072 "-",
5073 typename);
5074 continue;
5075 }
5076
5077 for (l = ZB_TOTAL - 1; l >= -1; l--) {
5078 level = (l == -1 ? ZB_TOTAL : l);
5079 zb = &zcb.zcb_type[level][t];
5080
5081 if (zb->zb_asize == 0)
5082 continue;
5083
5084 if (dump_opt['b'] < 3 && level != ZB_TOTAL)
5085 continue;
5086
5087 if (level == 0 && zb->zb_asize ==
5088 zcb.zcb_type[ZB_TOTAL][t].zb_asize)
5089 continue;
5090
5091 zdb_nicenum(zb->zb_count, csize,
5092 sizeof (csize));
5093 zdb_nicenum(zb->zb_lsize, lsize,
5094 sizeof (lsize));
5095 zdb_nicenum(zb->zb_psize, psize,
5096 sizeof (psize));
5097 zdb_nicenum(zb->zb_asize, asize,
5098 sizeof (asize));
5099 zdb_nicenum(zb->zb_asize / zb->zb_count, avg,
5100 sizeof (avg));
5101 zdb_nicenum(zb->zb_gangs, gang, sizeof (gang));
5102
5103 (void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
5104 "\t%5.2f\t%6.2f\t",
5105 csize, lsize, psize, asize, avg,
5106 (double)zb->zb_lsize / zb->zb_psize,
5107 100.0 * zb->zb_asize / tzb->zb_asize);
5108
5109 if (level == ZB_TOTAL)
5110 (void) printf("%s\n", typename);
5111 else
5112 (void) printf(" L%d %s\n",
5113 level, typename);
5114
5115 if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
5116 (void) printf("\t number of ganged "
5117 "blocks: %s\n", gang);
5118 }
5119
5120 if (dump_opt['b'] >= 4) {
5121 (void) printf("psize "
5122 "(in 512-byte sectors): "
5123 "number of blocks\n");
5124 dump_histogram(zb->zb_psize_histogram,
5125 PSIZE_HISTO_SIZE, 0);
5126 }
5127 }
5128 }
5129 }
5130
5131 (void) printf("\n");
5132
5133 if (leaks)
5134 return (2);
5135
5136 if (zcb.zcb_haderrors)
5137 return (3);
5138
5139 return (0);
5140 }
5141
5142 typedef struct zdb_ddt_entry {
5143 ddt_key_t zdde_key;
5144 uint64_t zdde_ref_blocks;
5145 uint64_t zdde_ref_lsize;
5146 uint64_t zdde_ref_psize;
5147 uint64_t zdde_ref_dsize;
5148 avl_node_t zdde_node;
5149 } zdb_ddt_entry_t;
5150
5151 /* ARGSUSED */
5152 static int
5153 zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
5154 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
5155 {
5156 avl_tree_t *t = arg;
5157 avl_index_t where;
5158 zdb_ddt_entry_t *zdde, zdde_search;
5159
5160 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
5161 BP_IS_EMBEDDED(bp))
5162 return (0);
5163
5164 if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
5165 (void) printf("traversing objset %llu, %llu objects, "
5166 "%lu blocks so far\n",
5167 (u_longlong_t)zb->zb_objset,
5168 (u_longlong_t)BP_GET_FILL(bp),
5169 avl_numnodes(t));
5170 }
5171
5172 if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
5173 BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
5174 return (0);
5175
5176 ddt_key_fill(&zdde_search.zdde_key, bp);
5177
5178 zdde = avl_find(t, &zdde_search, &where);
5179
5180 if (zdde == NULL) {
5181 zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
5182 zdde->zdde_key = zdde_search.zdde_key;
5183 avl_insert(t, zdde, where);
5184 }
5185
5186 zdde->zdde_ref_blocks += 1;
5187 zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
5188 zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
5189 zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
5190
5191 return (0);
5192 }
5193
5194 static void
5195 dump_simulated_ddt(spa_t *spa)
5196 {
5197 avl_tree_t t;
5198 void *cookie = NULL;
5199 zdb_ddt_entry_t *zdde;
5200 ddt_histogram_t ddh_total;
5201 ddt_stat_t dds_total;
5202
5203 bzero(&ddh_total, sizeof (ddh_total));
5204 bzero(&dds_total, sizeof (dds_total));
5205 avl_create(&t, ddt_entry_compare,
5206 sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
5207
5208 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5209
5210 (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
5211 TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t);
5212
5213 spa_config_exit(spa, SCL_CONFIG, FTAG);
5214
5215 while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
5216 ddt_stat_t dds;
5217 uint64_t refcnt = zdde->zdde_ref_blocks;
5218 ASSERT(refcnt != 0);
5219
5220 dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
5221 dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
5222 dds.dds_psize = zdde->zdde_ref_psize / refcnt;
5223 dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
5224
5225 dds.dds_ref_blocks = zdde->zdde_ref_blocks;
5226 dds.dds_ref_lsize = zdde->zdde_ref_lsize;
5227 dds.dds_ref_psize = zdde->zdde_ref_psize;
5228 dds.dds_ref_dsize = zdde->zdde_ref_dsize;
5229
5230 ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
5231 &dds, 0);
5232
5233 umem_free(zdde, sizeof (*zdde));
5234 }
5235
5236 avl_destroy(&t);
5237
5238 ddt_histogram_stat(&dds_total, &ddh_total);
5239
5240 (void) printf("Simulated DDT histogram:\n");
5241
5242 zpool_dump_ddt(&dds_total, &ddh_total);
5243
5244 dump_dedup_ratio(&dds_total);
5245 }
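
/*
 * Bucketing note for the histogram fill above: highbit64(refcnt) - 1
 * places refcnt 1 in ddh_stat[0], refcnts 2-3 in ddh_stat[1], 4-7 in
 * ddh_stat[2], and in general bucket i holds reference counts in
 * [2^i, 2^(i+1)). (Worked example only; no additional zdb logic.)
 */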
5246
5247 static int
5248 verify_device_removal_feature_counts(spa_t *spa)
5249 {
5250 uint64_t dr_feature_refcount = 0;
5251 uint64_t oc_feature_refcount = 0;
5252 uint64_t indirect_vdev_count = 0;
5253 uint64_t precise_vdev_count = 0;
5254 uint64_t obsolete_counts_object_count = 0;
5255 uint64_t obsolete_sm_count = 0;
5256 uint64_t obsolete_counts_count = 0;
5257 uint64_t scip_count = 0;
5258 uint64_t obsolete_bpobj_count = 0;
5259 int ret = 0;
5260
5261 spa_condensing_indirect_phys_t *scip =
5262 &spa->spa_condensing_indirect_phys;
5263 if (scip->scip_next_mapping_object != 0) {
5264 vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev];
5265 ASSERT(scip->scip_prev_obsolete_sm_object != 0);
5266 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
5267
5268 (void) printf("Condensing indirect vdev %llu: new mapping "
5269 "object %llu, prev obsolete sm %llu\n",
5270 (u_longlong_t)scip->scip_vdev,
5271 (u_longlong_t)scip->scip_next_mapping_object,
5272 (u_longlong_t)scip->scip_prev_obsolete_sm_object);
5273 if (scip->scip_prev_obsolete_sm_object != 0) {
5274 space_map_t *prev_obsolete_sm = NULL;
5275 VERIFY0(space_map_open(&prev_obsolete_sm,
5276 spa->spa_meta_objset,
5277 scip->scip_prev_obsolete_sm_object,
5278 0, vd->vdev_asize, 0));
5279 dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
5280 (void) printf("\n");
5281 space_map_close(prev_obsolete_sm);
5282 }
5283
5284 scip_count += 2;
5285 }
5286
5287 for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
5288 vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
5289 vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
5290
5291 if (vic->vic_mapping_object != 0) {
5292 ASSERT(vd->vdev_ops == &vdev_indirect_ops ||
5293 vd->vdev_removing);
5294 indirect_vdev_count++;
5295
5296 if (vd->vdev_indirect_mapping->vim_havecounts) {
5297 obsolete_counts_count++;
5298 }
5299 }
5300
5301 boolean_t are_precise;
5302 VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
5303 if (are_precise) {
5304 ASSERT(vic->vic_mapping_object != 0);
5305 precise_vdev_count++;
5306 }
5307
5308 uint64_t obsolete_sm_object;
5309 VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
5310 if (obsolete_sm_object != 0) {
5311 ASSERT(vic->vic_mapping_object != 0);
5312 obsolete_sm_count++;
5313 }
5314 }
5315
5316 (void) feature_get_refcount(spa,
5317 &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL],
5318 &dr_feature_refcount);
5319 (void) feature_get_refcount(spa,
5320 &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS],
5321 &oc_feature_refcount);
5322
5323 if (dr_feature_refcount != indirect_vdev_count) {
5324 ret = 1;
5325 (void) printf("Number of indirect vdevs (%llu) " \
5326 "does not match feature count (%llu)\n",
5327 (u_longlong_t)indirect_vdev_count,
5328 (u_longlong_t)dr_feature_refcount);
5329 } else {
5330 (void) printf("Verified device_removal feature refcount " \
5331 "of %llu is correct\n",
5332 (u_longlong_t)dr_feature_refcount);
5333 }
5334
5335 if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
5336 DMU_POOL_OBSOLETE_BPOBJ) == 0) {
5337 obsolete_bpobj_count++;
5338 }
5339
5341 obsolete_counts_object_count = precise_vdev_count;
5342 obsolete_counts_object_count += obsolete_sm_count;
5343 obsolete_counts_object_count += obsolete_counts_count;
5344 obsolete_counts_object_count += scip_count;
5345 obsolete_counts_object_count += obsolete_bpobj_count;
5346 obsolete_counts_object_count += remap_deadlist_count;
5347
5348 if (oc_feature_refcount != obsolete_counts_object_count) {
5349 ret = 1;
5350 (void) printf("Number of obsolete counts objects (%llu) " \
5351 "does not match feature count (%llu)\n",
5352 (u_longlong_t)obsolete_counts_object_count,
5353 (u_longlong_t)oc_feature_refcount);
5354 (void) printf("pv:%llu os:%llu oc:%llu sc:%llu "
5355 "ob:%llu rd:%llu\n",
5356 (u_longlong_t)precise_vdev_count,
5357 (u_longlong_t)obsolete_sm_count,
5358 (u_longlong_t)obsolete_counts_count,
5359 (u_longlong_t)scip_count,
5360 (u_longlong_t)obsolete_bpobj_count,
5361 (u_longlong_t)remap_deadlist_count);
5362 } else {
5363 (void) printf("Verified indirect_refcount feature refcount " \
5364 "of %llu is correct\n",
5365 (u_longlong_t)oc_feature_refcount);
5366 }
5367 return (ret);
5368 }
5369
5370 static void
5371 zdb_set_skip_mmp(char *target)
5372 {
5373 spa_t *spa;
5374
5375 /*
5376 * Disable the activity check to allow examination of
5377 * active pools.
5378 */
5379 mutex_enter(&spa_namespace_lock);
5380 if ((spa = spa_lookup(target)) != NULL) {
5381 spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP;
5382 }
5383 mutex_exit(&spa_namespace_lock);
5384 }
5385
5386 #define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
5387 /*
5388 * Import the checkpointed state of the pool specified by the target
5389 * parameter as readonly. The function also accepts a pool config
5390 * as an optional parameter, else it attempts to infer the config by
5391 * the name of the target pool.
5392 *
5393 * Note that the checkpointed state's pool name will be the name of
5394 * the original pool with the above suffix appended to it. In addition,
5395 * if the target is not a pool name (e.g. a path to a dataset) then
5396 * the new_path parameter is populated with the updated path to
5397 * reflect the fact that we are looking into the checkpointed state.
5398 *
5399 * The function returns a newly-allocated copy of the name of the
5400 * pool containing the checkpointed state. When this copy is no
5401 * longer needed it should be freed with free(3C). Same thing
5402 * applies to the new_path parameter if allocated.
5403 */
5404 static char *
5405 import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
5406 {
5407 int error = 0;
5408 char *poolname, *bogus_name = NULL;
5409
5410 /* If the target is not a pool, then extract the pool name */
5411 char *path_start = strchr(target, '/');
5412 if (path_start != NULL) {
5413 size_t poolname_len = path_start - target;
5414 poolname = strndup(target, poolname_len);
5415 } else {
5416 poolname = target;
5417 }
5418
5419 if (cfg == NULL) {
5420 zdb_set_skip_mmp(poolname);
5421 error = spa_get_stats(poolname, &cfg, NULL, 0);
5422 if (error != 0) {
5423 fatal("Tried to read config of pool \"%s\" but "
5424 "spa_get_stats() failed with error %d\n",
5425 poolname, error);
5426 }
5427 }
5428
5429 if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1)
5430 return (NULL);
5431 fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
5432
5433 error = spa_import(bogus_name, cfg, NULL,
5434 ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT |
5435 ZFS_IMPORT_SKIP_MMP);
5436 if (error != 0) {
5437 fatal("Tried to import pool \"%s\" but spa_import() failed "
5438 "with error %d\n", bogus_name, error);
5439 }
5440
5441 if (new_path != NULL && path_start != NULL) {
5442 if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
5443 free(poolname);
5445 return (NULL);
5446 }
5447 }
5448
5449 if (target != poolname)
5450 free(poolname);
5451
5452 return (bogus_name);
5453 }
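
/*
 * Example use of import_checkpointed_state() (hypothetical caller and
 * target, compiled out): per the comment above, both returned
 * allocations belong to the caller.
 */
#if 0
static void
import_checkpoint_example(void)
{
	char target[] = "tank/fs";	/* hypothetical dataset path */
	char *new_path = NULL;
	char *ckpool;

	ckpool = import_checkpointed_state(target, NULL, &new_path);
	if (ckpool != NULL) {
		/* ... examine the checkpointed state ... */
		free(ckpool);
		free(new_path);
	}
}
#endif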
5454
5455 typedef struct verify_checkpoint_sm_entry_cb_arg {
5456 vdev_t *vcsec_vd;
5457
5458 /* the following fields are only used for printing progress */
5459 uint64_t vcsec_entryid;
5460 uint64_t vcsec_num_entries;
5461 } verify_checkpoint_sm_entry_cb_arg_t;
5462
5463 #define ENTRIES_PER_PROGRESS_UPDATE 10000
5464
5465 static int
5466 verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
5467 {
5468 verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
5469 vdev_t *vd = vcsec->vcsec_vd;
5470 metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
5471 uint64_t end = sme->sme_offset + sme->sme_run;
5472
5473 ASSERT(sme->sme_type == SM_FREE);
5474
5475 if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
5476 (void) fprintf(stderr,
5477 "\rverifying vdev %llu, space map entry %llu of %llu ...",
5478 (longlong_t)vd->vdev_id,
5479 (longlong_t)vcsec->vcsec_entryid,
5480 (longlong_t)vcsec->vcsec_num_entries);
5481 }
5482 vcsec->vcsec_entryid++;
5483
5484 /*
5485 * See comment in checkpoint_sm_exclude_entry_cb()
5486 */
5487 VERIFY3U(sme->sme_offset, >=, ms->ms_start);
5488 VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
5489
5490 /*
5491 * The entries in the vdev_checkpoint_sm should be marked as
5492 * allocated in the checkpointed state of the pool, therefore
5493 * their respective ms_allocatable trees should not contain them.
5494 */
5495 mutex_enter(&ms->ms_lock);
5496 range_tree_verify_not_present(ms->ms_allocatable,
5497 sme->sme_offset, sme->sme_run);
5498 mutex_exit(&ms->ms_lock);
5499
5500 return (0);
5501 }
5502
5503 /*
5504 * Verify that all segments in the vdev_checkpoint_sm are allocated
5505 * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
5506 * ms_allocatable).
5507 *
5508 * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
5509 * each vdev in the current state of the pool to the metaslab space maps
5510 * (ms_sm) of the checkpointed state of the pool.
5511 *
5512 * Note that the function changes the state of the ms_allocatable
5513 * trees of the current spa_t. The entries of these ms_allocatable
5514 * trees are cleared out and then repopulated with the free
5515 * entries of their respective ms_sm space maps.
5516 */
5517 static void
5518 verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
5519 {
5520 vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
5521 vdev_t *current_rvd = current->spa_root_vdev;
5522
5523 load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);
5524
5525 for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {
5526 vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];
5527 vdev_t *current_vd = current_rvd->vdev_child[c];
5528
5529 space_map_t *checkpoint_sm = NULL;
5530 uint64_t checkpoint_sm_obj;
5531
5532 if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
5533 /*
5534 * Since we don't allow device removal in a pool
5535 * that has a checkpoint, we expect that all removed
5536 * vdevs were removed from the pool before the
5537 * checkpoint.
5538 */
5539 ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
5540 continue;
5541 }
5542
5543 /*
5544 * If the checkpoint space map doesn't exist, then nothing
5545 * here is checkpointed so there's nothing to verify.
5546 */
5547 if (current_vd->vdev_top_zap == 0 ||
5548 zap_contains(spa_meta_objset(current),
5549 current_vd->vdev_top_zap,
5550 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
5551 continue;
5552
5553 VERIFY0(zap_lookup(spa_meta_objset(current),
5554 current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
5555 sizeof (uint64_t), 1, &checkpoint_sm_obj));
5556
5557 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
5558 checkpoint_sm_obj, 0, current_vd->vdev_asize,
5559 current_vd->vdev_ashift));
5560
5561 verify_checkpoint_sm_entry_cb_arg_t vcsec;
5562 vcsec.vcsec_vd = ckpoint_vd;
5563 vcsec.vcsec_entryid = 0;
5564 vcsec.vcsec_num_entries =
5565 space_map_length(checkpoint_sm) / sizeof (uint64_t);
5566 VERIFY0(space_map_iterate(checkpoint_sm,
5567 space_map_length(checkpoint_sm),
5568 verify_checkpoint_sm_entry_cb, &vcsec));
5569 if (dump_opt['m'] > 3)
5570 dump_spacemap(current->spa_meta_objset, checkpoint_sm);
5571 space_map_close(checkpoint_sm);
5572 }
5573
5574 /*
5575 * If we've added vdevs since we took the checkpoint, ensure
5576 * that their checkpoint space maps are empty.
5577 */
5578 if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
5579 for (uint64_t c = ckpoint_rvd->vdev_children;
5580 c < current_rvd->vdev_children; c++) {
5581 vdev_t *current_vd = current_rvd->vdev_child[c];
5582 ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL);
5583 }
5584 }
5585
5586 /* for cleaner progress output */
5587 (void) fprintf(stderr, "\n");
5588 }
5589
5590 /*
5591 * Verifies that all space that's allocated in the checkpoint is
5592 * still allocated in the current version, by checking that everything
5593 * in the checkpoint's ms_allocatable (which holds allocated space,
5594 * not allocatable/free space) is not present in current's ms_allocatable.
5595 *
5596 * Note that the function changes the state of the ms_allocatable
5597 * trees of both spas when called. The entries of all ms_allocatable
5598 * trees are cleared out and then repopulated from their respective
5599 * ms_sm space maps. In the checkpointed state we load the allocated
5600 * entries, and in the current state we load the free entries.
5601 */
5602 static void
5603 verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
5604 {
5605 vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
5606 vdev_t *current_rvd = current->spa_root_vdev;
5607
5608 load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
5609 load_concrete_ms_allocatable_trees(current, SM_FREE);
5610
5611 for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
5612 vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
5613 vdev_t *current_vd = current_rvd->vdev_child[i];
5614
5615 if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
5616 /*
5617 * See comment in verify_checkpoint_vdev_spacemaps()
5618 */
5619 ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
5620 continue;
5621 }
5622
5623 for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
5624 metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
5625 metaslab_t *current_msp = current_vd->vdev_ms[m];
5626
5627 (void) fprintf(stderr,
5628 "\rverifying vdev %llu of %llu, "
5629 "metaslab %llu of %llu ...",
5630 (longlong_t)current_vd->vdev_id,
5631 (longlong_t)current_rvd->vdev_children,
5632 (longlong_t)current_vd->vdev_ms[m]->ms_id,
5633 (longlong_t)current_vd->vdev_ms_count);
5634
5635 /*
5636 * We walk through the ms_allocatable trees that
5637 * are loaded with the allocated blocks from the
5638 * ms_sm spacemaps of the checkpoint. For each
5639 * one of these ranges we ensure that none of them
5640 * exists in the ms_allocatable trees of the
5641 * current state which are loaded with the ranges
5642 * that are currently free.
5643 *
5644 * This way we ensure that none of the blocks that
5645 * are part of the checkpoint were freed by mistake.
5646 */
5647 range_tree_walk(ckpoint_msp->ms_allocatable,
5648 (range_tree_func_t *)range_tree_verify_not_present,
5649 current_msp->ms_allocatable);
5650 }
5651 }
5652
5653 /* for cleaner progress output */
5654 (void) fprintf(stderr, "\n");
5655 }
5656
5657 static void
5658 verify_checkpoint_blocks(spa_t *spa)
5659 {
5660 ASSERT(!dump_opt['L']);
5661
5662 spa_t *checkpoint_spa;
5663 char *checkpoint_pool;
5664 nvlist_t *config = NULL;
5665 int error = 0;
5666
5667 /*
5668 * We import the checkpointed state of the pool (under a different
5669 * name) so we can do verification on it against the current state
5670 * of the pool.
5671 */
5672 checkpoint_pool = import_checkpointed_state(spa->spa_name, config,
5673 NULL);
5674 ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
5675
5676 error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
5677 if (error != 0) {
5678 fatal("Tried to open pool \"%s\" but spa_open() failed with "
5679 "error %d\n", checkpoint_pool, error);
5680 }
5681
5682 /*
5683 * Ensure that ranges in the checkpoint space maps of each vdev
5684 * are allocated according to the checkpointed state's metaslab
5685 * space maps.
5686 */
5687 verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
5688
5689 /*
5690 * Ensure that allocated ranges in the checkpoint's metaslab
5691 * space maps remain allocated in the metaslab space maps of
5692 * the current state.
5693 */
5694 verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);
5695
5696 /*
5697 * Once we are done, we get rid of the checkpointed state.
5698 */
5699 spa_close(checkpoint_spa, FTAG);
5700 free(checkpoint_pool);
5701 }
5702
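/*
 * Dump the contents of each vdev's checkpoint space map. Used by
 * verify_checkpoint() when it detects a partially discarded
 * checkpoint (see the ENOENT case there).
 */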
5703 static void
5704 dump_leftover_checkpoint_blocks(spa_t *spa)
5705 {
5706 vdev_t *rvd = spa->spa_root_vdev;
5707
5708 for (uint64_t i = 0; i < rvd->vdev_children; i++) {
5709 vdev_t *vd = rvd->vdev_child[i];
5710
5711 space_map_t *checkpoint_sm = NULL;
5712 uint64_t checkpoint_sm_obj;
5713
5714 if (vd->vdev_top_zap == 0)
5715 continue;
5716
5717 if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
5718 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
5719 continue;
5720
5721 VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
5722 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
5723 sizeof (uint64_t), 1, &checkpoint_sm_obj));
5724
5725 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
5726 checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
5727 dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
5728 space_map_close(checkpoint_sm);
5729 }
5730 }
5731
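/*
 * Top-level checkpoint check: look up the checkpointed uberblock in
 * the MOS and, unless -L was given, cross-check the checkpoint space
 * maps against an import of the checkpointed pool state.
 */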
5732 static int
5733 verify_checkpoint(spa_t *spa)
5734 {
5735 uberblock_t checkpoint;
5736 int error;
5737
5738 if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
5739 return (0);
5740
5741 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
5742 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
5743 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
5744
5745 if (error == ENOENT && !dump_opt['L']) {
5746 /*
5747 * If the feature is active but the uberblock is missing
5748 * then we must be in the middle of discarding the
5749 * checkpoint.
5750 */
5751 (void) printf("\nPartially discarded checkpoint "
5752 "state found:\n");
5753 if (dump_opt['m'] > 3)
5754 dump_leftover_checkpoint_blocks(spa);
5755 return (0);
5756 } else if (error != 0) {
5757 (void) printf("lookup error %d when looking for "
5758 "checkpointed uberblock in MOS\n", error);
5759 return (error);
5760 }
5761 dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");
5762
5763 if (checkpoint.ub_checkpoint_txg == 0) {
5764 (void) printf("\nub_checkpoint_txg not set in checkpointed "
5765 "uberblock\n");
5766 error = 3;
5767 }
5768
5769 if (error == 0 && !dump_opt['L'])
5770 verify_checkpoint_blocks(spa);
5771
5772 return (error);
5773 }
5774
5775 /* ARGSUSED */
5776 static void
5777 mos_leaks_cb(void *arg, uint64_t start, uint64_t size)
5778 {
5779 for (uint64_t i = start; i < start + size; i++) {
5780 (void) printf("MOS object %llu referenced but not allocated\n",
5781 (u_longlong_t)i);
5782 }
5783 }
5784
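/*
 * Record a reference to a MOS object. Each object is expected to be
 * referenced at most once; use mos_obj_refd_multiple() for objects
 * that may legitimately be referenced from more than one place.
 */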
5785 static void
5786 mos_obj_refd(uint64_t obj)
5787 {
5788 if (obj != 0 && mos_refd_objs != NULL)
5789 range_tree_add(mos_refd_objs, obj, 1);
5790 }
5791
5792 /*
5793 * Call on a MOS object that may already have been referenced.
5794 */
5795 static void
5796 mos_obj_refd_multiple(uint64_t obj)
5797 {
5798 if (obj != 0 && mos_refd_objs != NULL &&
5799 !range_tree_contains(mos_refd_objs, obj, 1))
5800 range_tree_add(mos_refd_objs, obj, 1);
5801 }
5802
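/*
 * Reference the unflushed-txgs object hanging off the vdev's
 * top-level ZAP (VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS), if present.
 */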
5803 static void
5804 mos_leak_vdev_top_zap(vdev_t *vd)
5805 {
5806 uint64_t ms_flush_data_obj;
5807 int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
5808 vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
5809 sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj);
5810 if (error == ENOENT)
5811 return;
5812 ASSERT0(error);
5813
5814 mos_obj_refd(ms_flush_data_obj);
5815 }
5816
5817 static void
5818 mos_leak_vdev(vdev_t *vd)
5819 {
5820 mos_obj_refd(vd->vdev_dtl_object);
5821 mos_obj_refd(vd->vdev_ms_array);
5822 mos_obj_refd(vd->vdev_indirect_config.vic_births_object);
5823 mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object);
5824 mos_obj_refd(vd->vdev_leaf_zap);
5825 if (vd->vdev_checkpoint_sm != NULL)
5826 mos_obj_refd(vd->vdev_checkpoint_sm->sm_object);
5827 if (vd->vdev_indirect_mapping != NULL) {
5828 mos_obj_refd(vd->vdev_indirect_mapping->
5829 vim_phys->vimp_counts_object);
5830 }
5831 if (vd->vdev_obsolete_sm != NULL)
5832 mos_obj_refd(vd->vdev_obsolete_sm->sm_object);
5833
5834 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
5835 metaslab_t *ms = vd->vdev_ms[m];
5836 mos_obj_refd(space_map_object(ms->ms_sm));
5837 }
5838
5839 if (vd->vdev_top_zap != 0) {
5840 mos_obj_refd(vd->vdev_top_zap);
5841 mos_leak_vdev_top_zap(vd);
5842 }
5843
5844 for (uint64_t c = 0; c < vd->vdev_children; c++) {
5845 mos_leak_vdev(vd->vdev_child[c]);
5846 }
5847 }
5848
5849 static void
5850 mos_leak_log_spacemaps(spa_t *spa)
5851 {
5852 uint64_t spacemap_zap;
5853 int error = zap_lookup(spa_meta_objset(spa),
5854 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP,
5855 sizeof (spacemap_zap), 1, &spacemap_zap);
5856 if (error == ENOENT)
5857 return;
5858 ASSERT0(error);
5859
5860 mos_obj_refd(spacemap_zap);
5861 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
5862 sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls))
5863 mos_obj_refd(sls->sls_sm_obj);
5864 }
5865
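/*
 * Cross-check the MOS: every object zdb saw referenced must be
 * allocated, and every allocated object must have been referenced.
 * Returns 0 if the two sets match and 2 if a leaked object or a
 * stray reference was found.
 */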
5866 static int
5867 dump_mos_leaks(spa_t *spa)
5868 {
5869 int rv = 0;
5870 objset_t *mos = spa->spa_meta_objset;
5871 dsl_pool_t *dp = spa->spa_dsl_pool;
5872
5873 /* Visit and mark all referenced objects in the MOS */
5874
5875 mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT);
5876 mos_obj_refd(spa->spa_pool_props_object);
5877 mos_obj_refd(spa->spa_config_object);
5878 mos_obj_refd(spa->spa_ddt_stat_object);
5879 mos_obj_refd(spa->spa_feat_desc_obj);
5880 mos_obj_refd(spa->spa_feat_enabled_txg_obj);
5881 mos_obj_refd(spa->spa_feat_for_read_obj);
5882 mos_obj_refd(spa->spa_feat_for_write_obj);
5883 mos_obj_refd(spa->spa_history);
5884 mos_obj_refd(spa->spa_errlog_last);
5885 mos_obj_refd(spa->spa_errlog_scrub);
5886 mos_obj_refd(spa->spa_all_vdev_zaps);
5887 mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj);
5888 mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj);
5889 mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj);
5890 bpobj_count_refd(&spa->spa_deferred_bpobj);
5891 mos_obj_refd(dp->dp_empty_bpobj);
5892 bpobj_count_refd(&dp->dp_obsolete_bpobj);
5893 bpobj_count_refd(&dp->dp_free_bpobj);
5894 mos_obj_refd(spa->spa_l2cache.sav_object);
5895 mos_obj_refd(spa->spa_spares.sav_object);
5896
5897 if (spa->spa_syncing_log_sm != NULL)
5898 mos_obj_refd(spa->spa_syncing_log_sm->sm_object);
5899 mos_leak_log_spacemaps(spa);
5900
5901 mos_obj_refd(spa->spa_condensing_indirect_phys.
5902 scip_next_mapping_object);
5903 mos_obj_refd(spa->spa_condensing_indirect_phys.
5904 scip_prev_obsolete_sm_object);
5905 if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) {
5906 vdev_indirect_mapping_t *vim =
5907 vdev_indirect_mapping_open(mos,
5908 spa->spa_condensing_indirect_phys.scip_next_mapping_object);
5909 mos_obj_refd(vim->vim_phys->vimp_counts_object);
5910 vdev_indirect_mapping_close(vim);
5911 }
5912 deleted_livelists_dump_mos(spa);
5913
5914 if (dp->dp_origin_snap != NULL) {
5915 dsl_dataset_t *ds;
5916
5917 dsl_pool_config_enter(dp, FTAG);
5918 VERIFY0(dsl_dataset_hold_obj(dp,
5919 dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj,
5920 FTAG, &ds));
5921 count_ds_mos_objects(ds);
5922 dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
5923 dsl_dataset_rele(ds, FTAG);
5924 dsl_pool_config_exit(dp, FTAG);
5925
5926 count_ds_mos_objects(dp->dp_origin_snap);
5927 dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist");
5928 }
5929 count_dir_mos_objects(dp->dp_mos_dir);
5930 if (dp->dp_free_dir != NULL)
5931 count_dir_mos_objects(dp->dp_free_dir);
5932 if (dp->dp_leak_dir != NULL)
5933 count_dir_mos_objects(dp->dp_leak_dir);
5934
5935 mos_leak_vdev(spa->spa_root_vdev);
5936
5937 for (uint64_t class = 0; class < DDT_CLASSES; class++) {
5938 for (uint64_t type = 0; type < DDT_TYPES; type++) {
5939 for (uint64_t cksum = 0;
5940 cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
5941 ddt_t *ddt = spa->spa_ddt[cksum];
5942 mos_obj_refd(ddt->ddt_object[type][class]);
5943 }
5944 }
5945 }
5946
5947 /*
5948 * Visit all allocated objects and make sure they are referenced.
5949 */
5950 uint64_t object = 0;
5951 while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) {
5952 if (range_tree_contains(mos_refd_objs, object, 1)) {
5953 range_tree_remove(mos_refd_objs, object, 1);
5954 } else {
5955 dmu_object_info_t doi;
5956 const char *name;
5957 dmu_object_info(mos, object, &doi);
5958 if (doi.doi_type & DMU_OT_NEWTYPE) {
5959 dmu_object_byteswap_t bswap =
5960 DMU_OT_BYTESWAP(doi.doi_type);
5961 name = dmu_ot_byteswap[bswap].ob_name;
5962 } else {
5963 name = dmu_ot[doi.doi_type].ot_name;
5964 }
5965
5966 (void) printf("MOS object %llu (%s) leaked\n",
5967 (u_longlong_t)object, name);
5968 rv = 2;
5969 }
5970 }
5971 (void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL);
5972 if (!range_tree_is_empty(mos_refd_objs))
5973 rv = 2;
5974 range_tree_vacate(mos_refd_objs, NULL, NULL);
5975 range_tree_destroy(mos_refd_objs);
5976 return (rv);
5977 }
5978
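/*
 * Accumulator for dump_log_spacemap_obsolete_stats(): tracks, per
 * log space map (one per txg) and in total, how many entries are
 * still valid, i.e. not yet made obsolete by a metaslab flush.
 */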
5979 typedef struct log_sm_obsolete_stats_arg {
5980 uint64_t lsos_current_txg;
5981
5982 uint64_t lsos_total_entries;
5983 uint64_t lsos_valid_entries;
5984
5985 uint64_t lsos_sm_entries;
5986 uint64_t lsos_valid_sm_entries;
5987 } log_sm_obsolete_stats_arg_t;
5988
5989 static int
5990 log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme,
5991 uint64_t txg, void *arg)
5992 {
5993 log_sm_obsolete_stats_arg_t *lsos = arg;
5994
5995 uint64_t offset = sme->sme_offset;
5996 uint64_t vdev_id = sme->sme_vdev;
5997
5998 if (lsos->lsos_current_txg == 0) {
5999 /* this is the first log */
6000 lsos->lsos_current_txg = txg;
6001 } else if (lsos->lsos_current_txg < txg) {
6002 /* we just changed log - print stats and reset */
6003 (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
6004 (u_longlong_t)lsos->lsos_valid_sm_entries,
6005 (u_longlong_t)lsos->lsos_sm_entries,
6006 (u_longlong_t)lsos->lsos_current_txg);
6007 lsos->lsos_valid_sm_entries = 0;
6008 lsos->lsos_sm_entries = 0;
6009 lsos->lsos_current_txg = txg;
6010 }
6011 ASSERT3U(lsos->lsos_current_txg, ==, txg);
6012
6013 lsos->lsos_sm_entries++;
6014 lsos->lsos_total_entries++;
6015
6016 vdev_t *vd = vdev_lookup_top(spa, vdev_id);
6017 if (!vdev_is_concrete(vd))
6018 return (0);
6019
6020 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
6021 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
6022
6023 if (txg < metaslab_unflushed_txg(ms))
6024 return (0);
6025 lsos->lsos_valid_sm_entries++;
6026 lsos->lsos_valid_entries++;
6027 return (0);
6028 }
6029
6030 static void
6031 dump_log_spacemap_obsolete_stats(spa_t *spa)
6032 {
6033 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
6034 return;
6035
6036 log_sm_obsolete_stats_arg_t lsos;
6037 bzero(&lsos, sizeof (lsos));
6038
6039 (void) printf("Log Space Map Obsolete Entry Statistics:\n");
6040
6041 iterate_through_spacemap_logs(spa,
6042 log_spacemap_obsolete_stats_cb, &lsos);
6043
6044 /* print stats for latest log */
6045 (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
6046 (u_longlong_t)lsos.lsos_valid_sm_entries,
6047 (u_longlong_t)lsos.lsos_sm_entries,
6048 (u_longlong_t)lsos.lsos_current_txg);
6049
6050 (void) printf("%-8llu valid entries out of %-8llu - total\n\n",
6051 (u_longlong_t)lsos.lsos_valid_entries,
6052 (u_longlong_t)lsos.lsos_total_entries);
6053 }
6054
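/*
 * Top-level pool dump: dispatch on the dump_opt[] flags collected in
 * main(), run the requested verification passes, and exit non-zero
 * if any of them failed.
 */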
6055 static void
6056 dump_zpool(spa_t *spa)
6057 {
6058 dsl_pool_t *dp = spa_get_dsl(spa);
6059 int rc = 0;
6060
6061 if (dump_opt['S']) {
6062 dump_simulated_ddt(spa);
6063 return;
6064 }
6065
6066 if (!dump_opt['e'] && dump_opt['C'] > 1) {
6067 (void) printf("\nCached configuration:\n");
6068 dump_nvlist(spa->spa_config, 8);
6069 }
6070
6071 if (dump_opt['C'])
6072 dump_config(spa);
6073
6074 if (dump_opt['u'])
6075 dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
6076
6077 if (dump_opt['D'])
6078 dump_all_ddts(spa);
6079
6080 if (dump_opt['d'] > 2 || dump_opt['m'])
6081 dump_metaslabs(spa);
6082 if (dump_opt['M'])
6083 dump_metaslab_groups(spa);
6084 if (dump_opt['d'] > 2 || dump_opt['m']) {
6085 dump_log_spacemaps(spa);
6086 dump_log_spacemap_obsolete_stats(spa);
6087 }
6088
6089 if (dump_opt['d'] || dump_opt['i']) {
6090 spa_feature_t f;
6091 mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
6092 0);
6093 dump_objset(dp->dp_meta_objset);
6094
6095 if (dump_opt['d'] >= 3) {
6096 dsl_pool_t *dp = spa->spa_dsl_pool;
6097 dump_full_bpobj(&spa->spa_deferred_bpobj,
6098 "Deferred frees", 0);
6099 if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
6100 dump_full_bpobj(&dp->dp_free_bpobj,
6101 "Pool snapshot frees", 0);
6102 }
6103 if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
6104 ASSERT(spa_feature_is_enabled(spa,
6105 SPA_FEATURE_DEVICE_REMOVAL));
6106 dump_full_bpobj(&dp->dp_obsolete_bpobj,
6107 "Pool obsolete blocks", 0);
6108 }
6109
6110 if (spa_feature_is_active(spa,
6111 SPA_FEATURE_ASYNC_DESTROY)) {
6112 dump_bptree(spa->spa_meta_objset,
6113 dp->dp_bptree_obj,
6114 "Pool dataset frees");
6115 }
6116 dump_dtl(spa->spa_root_vdev, 0);
6117 }
6118
6119 for (spa_feature_t f = 0; f < SPA_FEATURES; f++)
6120 global_feature_count[f] = UINT64_MAX;
6121 global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0;
6122 global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0;
6123 global_feature_count[SPA_FEATURE_LIVELIST] = 0;
6124
6125 (void) dmu_objset_find(spa_name(spa), dump_one_objset,
6126 NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
6127
6128 if (rc == 0 && !dump_opt['L'])
6129 rc = dump_mos_leaks(spa);
6130
6131 for (f = 0; f < SPA_FEATURES; f++) {
6132 uint64_t refcount;
6133
6134 uint64_t *arr;
6135 if (!(spa_feature_table[f].fi_flags &
6136 ZFEATURE_FLAG_PER_DATASET)) {
6137 if (global_feature_count[f] == UINT64_MAX)
6138 continue;
6139 if (!spa_feature_is_enabled(spa, f)) {
6140 ASSERT0(global_feature_count[f]);
6141 continue;
6142 }
6143 arr = global_feature_count;
6144 } else {
6145 if (!spa_feature_is_enabled(spa, f)) {
6146 ASSERT0(dataset_feature_count[f]);
6147 continue;
6148 }
6149 arr = dataset_feature_count;
6150 }
6151 if (feature_get_refcount(spa, &spa_feature_table[f],
6152 &refcount) == ENOTSUP)
6153 continue;
6154 if (arr[f] != refcount) {
6155 (void) printf("%s feature refcount mismatch: "
6156 "%lld consumers != %lld refcount\n",
6157 spa_feature_table[f].fi_uname,
6158 (longlong_t)arr[f], (longlong_t)refcount);
6159 rc = 2;
6160 } else {
6161 (void) printf("Verified %s feature refcount "
6162 "of %llu is correct\n",
6163 spa_feature_table[f].fi_uname,
6164 (longlong_t)refcount);
6165 }
6166 }
6167
6168 if (rc == 0)
6169 rc = verify_device_removal_feature_counts(spa);
6170 }
6171
6172 if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
6173 rc = dump_block_stats(spa);
6174
6175 if (rc == 0)
6176 rc = verify_spacemap_refcounts(spa);
6177
6178 if (dump_opt['s'])
6179 show_pool_stats(spa);
6180
6181 if (dump_opt['h'])
6182 dump_history(spa);
6183
6184 if (rc == 0)
6185 rc = verify_checkpoint(spa);
6186
6187 if (rc != 0) {
6188 dump_debug_buffer();
6189 exit(rc);
6190 }
6191 }
6192
6193 #define ZDB_FLAG_CHECKSUM 0x0001
6194 #define ZDB_FLAG_DECOMPRESS 0x0002
6195 #define ZDB_FLAG_BSWAP 0x0004
6196 #define ZDB_FLAG_GBH 0x0008
6197 #define ZDB_FLAG_INDIRECT 0x0010
6198 #define ZDB_FLAG_PHYS 0x0020
6199 #define ZDB_FLAG_RAW 0x0040
6200 #define ZDB_FLAG_PRINT_BLKPTR 0x0080
6201
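/*
 * flagbits[] maps the single-character option letters accepted in a
 * -R block descriptor to the ZDB_FLAG_* bits above; it is filled in
 * by main() before zdb_read_block() is called.
 */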
6202 static int flagbits[256];
6203
6204 static void
6205 zdb_print_blkptr(blkptr_t *bp, int flags)
6206 {
6207 char blkbuf[BP_SPRINTF_LEN];
6208
6209 if (flags & ZDB_FLAG_BSWAP)
6210 byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
6211
6212 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
6213 (void) printf("%s\n", blkbuf);
6214 }
6215
6216 static void
6217 zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
6218 {
6219 int i;
6220
6221 for (i = 0; i < nbps; i++)
6222 zdb_print_blkptr(&bp[i], flags);
6223 }
6224
6225 static void
6226 zdb_dump_gbh(void *buf, int flags)
6227 {
6228 zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
6229 }
6230
6231 static void
6232 zdb_dump_block_raw(void *buf, uint64_t size, int flags)
6233 {
6234 if (flags & ZDB_FLAG_BSWAP)
6235 byteswap_uint64_array(buf, size);
6236 VERIFY(write(fileno(stdout), buf, size) == size);
6237 }
6238
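/*
 * Print a classic hex + ASCII dump of the buffer: two 64-bit words
 * per line, preceded by the byte offset and followed by the
 * printable characters. An illustrative (made-up) line of output:
 *
 *	000000: 0000000000bab10c 0000000000000af9  ................
 */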
6239 static void
6240 zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
6241 {
6242 uint64_t *d = (uint64_t *)buf;
6243 unsigned nwords = size / sizeof (uint64_t);
6244 int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
6245 unsigned i, j;
6246 const char *hdr;
6247 char *c;
6248
6249
6250 if (do_bswap)
6251 hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8";
6252 else
6253 hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f";
6254
6255 (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr);
6256
6257 #ifdef _LITTLE_ENDIAN
6258 /* correct the endianness */
6259 do_bswap = !do_bswap;
6260 #endif
6261 for (i = 0; i < nwords; i += 2) {
6262 (void) printf("%06llx: %016llx %016llx ",
6263 (u_longlong_t)(i * sizeof (uint64_t)),
6264 (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
6265 (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
6266
6267 c = (char *)&d[i];
6268 for (j = 0; j < 2 * sizeof (uint64_t); j++)
6269 (void) printf("%c", isprint(c[j]) ? c[j] : '.');
6270 (void) printf("\n");
6271 }
6272 }
6273
6274 /*
6275 * There are two acceptable formats:
6276 * leaf_name - For example: c1t0d0 or /tmp/ztest.0a
6277 * child[.child]* - For example: 0.1.1
6278 *
6279 * The second form can be used to specify arbitrary vdevs anywhere
6280 * in the hierarchy. For example, in a pool with a mirror of
6281 * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
6282 */
6283 static vdev_t *
6284 zdb_vdev_lookup(vdev_t *vdev, const char *path)
6285 {
6286 char *s, *p, *q;
6287 unsigned i;
6288
6289 if (vdev == NULL)
6290 return (NULL);
6291
6292 /* First, assume the x.x.x.x format */
6293 i = strtoul(path, &s, 10);
6294 if (s == path || (s && *s != '.' && *s != '\0'))
6295 goto name;
6296 if (i >= vdev->vdev_children)
6297 return (NULL);
6298
6299 vdev = vdev->vdev_child[i];
6300 if (s && *s == '\0')
6301 return (vdev);
6302 return (zdb_vdev_lookup(vdev, s+1));
6303
6304 name:
6305 for (i = 0; i < vdev->vdev_children; i++) {
6306 vdev_t *vc = vdev->vdev_child[i];
6307
6308 if (vc->vdev_path == NULL) {
6309 vc = zdb_vdev_lookup(vc, path);
6310 if (vc == NULL)
6311 continue;
6312 else
6313 return (vc);
6314 }
6315
6316 p = strrchr(vc->vdev_path, '/');
6317 p = p ? p + 1 : vc->vdev_path;
6318 q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
6319
6320 if (strcmp(vc->vdev_path, path) == 0)
6321 return (vc);
6322 if (strcmp(p, path) == 0)
6323 return (vc);
6324 if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
6325 return (vc);
6326 }
6327
6328 return (NULL);
6329 }
6330
6331 /*
6332 * Read a block from a pool and print it out. The pool is given as
6333 * a separate command-line argument (the zdb target); the syntax of
6334 * each block descriptor is:
6335 *
6336 * vdev_specifier:offset:size[:flags]
6337 *
6338 * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
6339 * offset - offset, in hex, in bytes
6340 * size - Amount of data to read, in hex, in bytes
6341 * flags - A string of characters specifying options
6342 * b: Decode a blkptr at given offset within block
6343 * *c: Calculate and display checksums
6344 * d: Decompress data before dumping
6345 * e: Byteswap data before dumping
6346 * g: Display data as a gang block header
6347 * i: Display as an indirect block
6348 * p: Do I/O to physical offset
6349 * r: Dump raw data to stdout
6350 *
6351 * * = not yet implemented
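 *
 * An illustrative invocation (hypothetical pool name and offsets):
 *
 *	zdb -R tank 0:400000:20000:r > /tmp/blk
 *
 * reads 0x20000 bytes at offset 0x400000 of vdev 0 of pool 'tank'
 * and dumps the raw bytes to stdout.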
6352 */
6353 static void
6354 zdb_read_block(char *thing, spa_t *spa)
6355 {
6356 blkptr_t blk, *bp = &blk;
6357 dva_t *dva = bp->blk_dva;
6358 int flags = 0;
6359 uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
6360 zio_t *zio;
6361 vdev_t *vd;
6362 abd_t *pabd;
6363 void *lbuf, *buf;
6364 const char *s, *vdev;
6365 char *p, *dup, *flagstr;
6366 int i, error;
6367 boolean_t borrowed = B_FALSE;
6368
6369 dup = strdup(thing);
6370 s = strtok(dup, ":");
6371 vdev = s ? s : "";
6372 s = strtok(NULL, ":");
6373 offset = strtoull(s ? s : "", NULL, 16);
6374 s = strtok(NULL, ":");
6375 size = strtoull(s ? s : "", NULL, 16);
6376 s = strtok(NULL, ":");
6377 if (s)
6378 flagstr = strdup(s);
6379 else
6380 flagstr = strdup("");
6381
6382 s = NULL;
6383 if (size == 0)
6384 s = "size must not be zero";
6385 if (!IS_P2ALIGNED(size, DEV_BSIZE))
6386 s = "size must be a multiple of sector size";
6387 if (!IS_P2ALIGNED(offset, DEV_BSIZE))
6388 s = "offset must be a multiple of sector size";
6389 if (s) {
6390 (void) printf("Invalid block specifier: %s - %s\n", thing, s);
6391 free(flagstr);
6392 free(dup);
6393 return;
6394 }
6395
6396 for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
6397 for (i = 0; flagstr[i]; i++) {
6398 int bit = flagbits[(uchar_t)flagstr[i]];
6399
6400 if (bit == 0) {
6401 (void) printf("***Invalid flag: %c\n",
6402 flagstr[i]);
6403 continue;
6404 }
6405 flags |= bit;
6406
6407 /* If it's not something with an argument, keep going */
6408 if ((bit & (ZDB_FLAG_CHECKSUM |
6409 ZDB_FLAG_PRINT_BLKPTR)) == 0)
6410 continue;
6411
6412 p = &flagstr[i + 1];
6413 if (bit == ZDB_FLAG_PRINT_BLKPTR) {
6414 blkptr_offset = strtoull(p, &p, 16);
6415 i = p - &flagstr[i + 1];
6416 }
6417 if (*p != ':' && *p != '\0') {
6418 (void) printf("***Invalid flag arg: '%s'\n", s);
6419 free(flagstr);
6420 free(dup);
6421 return;
6422 }
6423 }
6424 }
6425 free(flagstr);
6426
6427 vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
6428 if (vd == NULL) {
6429 (void) printf("***Invalid vdev: %s\n", vdev);
6430 free(dup);
6431 return;
6432 } else {
6433 if (vd->vdev_path)
6434 (void) fprintf(stderr, "Found vdev: %s\n",
6435 vd->vdev_path);
6436 else
6437 (void) fprintf(stderr, "Found vdev type: %s\n",
6438 vd->vdev_ops->vdev_op_type);
6439 }
6440
6441 psize = size;
6442 lsize = size;
6443
6444 pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
6445 lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
6446
6447 BP_ZERO(bp);
6448
6449 DVA_SET_VDEV(&dva[0], vd->vdev_id);
6450 DVA_SET_OFFSET(&dva[0], offset);
6451 DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
6452 DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
6453
6454 BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
6455
6456 BP_SET_LSIZE(bp, lsize);
6457 BP_SET_PSIZE(bp, psize);
6458 BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
6459 BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
6460 BP_SET_TYPE(bp, DMU_OT_NONE);
6461 BP_SET_LEVEL(bp, 0);
6462 BP_SET_DEDUP(bp, 0);
6463 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
6464
6465 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6466 zio = zio_root(spa, NULL, NULL, 0);
6467
6468 if (vd == vd->vdev_top) {
6469 /*
6470 * Treat this as a normal block read.
6471 */
6472 zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
6473 ZIO_PRIORITY_SYNC_READ,
6474 ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
6475 } else {
6476 /*
6477 * Treat this as a vdev child I/O.
6478 */
6479 zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
6480 psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
6481 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
6482 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
6483 ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL,
6484 NULL, NULL));
6485 }
6486
6487 error = zio_wait(zio);
6488 spa_config_exit(spa, SCL_STATE, FTAG);
6489
6490 if (error) {
6491 (void) printf("Read of %s failed, error: %d\n", thing, error);
6492 goto out;
6493 }
6494
6495 if (flags & ZDB_FLAG_DECOMPRESS) {
6496 /*
6497 * We don't know how the data was compressed, so just try
6498 * every decompress function at every inflated blocksize.
6499 */
6500 enum zio_compress c;
6501 void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
6502
6503 /*
6504 * XXX - On the one hand, with SPA_MAXBLOCKSIZE at 16MB,
6505 * this could take a while and we should let the user know
6506 * we are not stuck. On the other hand, printing progress
6507 * info gets old after a while. What to do?
6508 */
6509 for (lsize = psize + SPA_MINBLOCKSIZE;
6510 lsize <= SPA_MAXBLOCKSIZE; lsize += SPA_MINBLOCKSIZE) {
6511 for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
6512 /*
6513 * ZLE can easily decompress a non-ZLE stream, so
6514 * provide an option to disable it.
6515 */
6516 if (c == ZIO_COMPRESS_ZLE &&
6517 getenv("ZDB_NO_ZLE"))
6518 continue;
6519
6520 (void) fprintf(stderr,
6521 "Trying %05llx -> %05llx (%s)\n",
6522 (u_longlong_t)psize, (u_longlong_t)lsize,
6523 zio_compress_table[c].ci_name);
6524
6525 /*
6526 * We randomize lbuf2, and decompress to both
6527 * lbuf and lbuf2. This way, we will know if
6528 * the decompression filled exactly lsize bytes.
6529 */
6530 VERIFY0(random_get_pseudo_bytes(lbuf2, lsize));
6531
6532 if (zio_decompress_data(c, pabd,
6533 lbuf, psize, lsize) == 0 &&
6534 zio_decompress_data(c, pabd,
6535 lbuf2, psize, lsize) == 0 &&
6536 bcmp(lbuf, lbuf2, lsize) == 0)
6537 break;
6538 }
6539 if (c != ZIO_COMPRESS_FUNCTIONS)
6540 break;
6541 }
6542 umem_free(lbuf2, SPA_MAXBLOCKSIZE);
6543
6544 if (lsize > SPA_MAXBLOCKSIZE) {
6545 (void) printf("Decompress of %s failed\n", thing);
6546 goto out;
6547 }
6548 buf = lbuf;
6549 size = lsize;
6550 } else {
6551 size = psize;
6552 buf = abd_borrow_buf_copy(pabd, size);
6553 borrowed = B_TRUE;
6554 }
6555
6556 if (flags & ZDB_FLAG_PRINT_BLKPTR)
6557 zdb_print_blkptr((blkptr_t *)(void *)
6558 ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
6559 else if (flags & ZDB_FLAG_RAW)
6560 zdb_dump_block_raw(buf, size, flags);
6561 else if (flags & ZDB_FLAG_INDIRECT)
6562 zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t),
6563 flags);
6564 else if (flags & ZDB_FLAG_GBH)
6565 zdb_dump_gbh(buf, flags);
6566 else
6567 zdb_dump_block(thing, buf, size, flags);
6568
6569 if (borrowed)
6570 abd_return_buf_copy(pabd, buf, size);
6571
6572 out:
6573 abd_free(pabd);
6574 umem_free(lbuf, SPA_MAXBLOCKSIZE);
6575 free(dup);
6576 }
6577
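/*
 * Decode and dump the payload of an embedded block pointer, given on
 * the command line (-E) as sixteen colon-separated hexadecimal words
 * (the raw blkptr_t contents).
 */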
6578 static void
6579 zdb_embedded_block(char *thing)
6580 {
6581 blkptr_t bp;
6582 unsigned long long *words = (void *)&bp;
6583 char *buf;
6584 int err;
6585
6586 bzero(&bp, sizeof (bp));
6587 err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
6588 "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
6589 words + 0, words + 1, words + 2, words + 3,
6590 words + 4, words + 5, words + 6, words + 7,
6591 words + 8, words + 9, words + 10, words + 11,
6592 words + 12, words + 13, words + 14, words + 15);
6593 if (err != 16) {
6594 (void) fprintf(stderr, "invalid input format\n");
6595 exit(1);
6596 }
6597 ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
6598 buf = malloc(SPA_MAXBLOCKSIZE);
6599 if (buf == NULL) {
6600 (void) fprintf(stderr, "out of memory\n");
6601 exit(1);
6602 }
6603 err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
6604 if (err != 0) {
6605 (void) fprintf(stderr, "decode failed: %u\n", err);
6606 exit(1);
6607 }
6608 zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
6609 free(buf);
6610 }
6611
6612 int
6613 main(int argc, char **argv)
6614 {
6615 int c;
6616 struct rlimit rl = { 1024, 1024 };
6617 spa_t *spa = NULL;
6618 objset_t *os = NULL;
6619 int dump_all = 1;
6620 int verbose = 0;
6621 int error = 0;
6622 char **searchdirs = NULL;
6623 int nsearch = 0;
6624 char *target, *target_pool;
6625 nvlist_t *policy = NULL;
6626 uint64_t max_txg = UINT64_MAX;
6627 int flags = ZFS_IMPORT_MISSING_LOG;
6628 int rewind = ZPOOL_NEVER_REWIND;
6629 char *spa_config_path_env;
6630 boolean_t target_is_spa = B_TRUE;
6631 nvlist_t *cfg = NULL;
6632
6633 (void) setrlimit(RLIMIT_NOFILE, &rl);
6634 (void) enable_extended_FILE_stdio(-1, -1);
6635
6636 dprintf_setup(&argc, argv);
6637
6638 /*
6639 * If the SPA_CONFIG_PATH environment variable is set, it overrides
6640 * the default spa_config_path setting. If the -U flag is specified,
6641 * it overrides this environment variable setting in turn.
6642 */
6643 spa_config_path_env = getenv("SPA_CONFIG_PATH");
6644 if (spa_config_path_env != NULL)
6645 spa_config_path = spa_config_path_env;
6646
6647 /*
6648 * For performance reasons, we set this tunable down. We do so before
6649 * the arg parsing section so that the user can override this value if
6650 * they choose.
6651 */
6652 zfs_btree_verify_intensity = 3;
6653
6654 while ((c = getopt(argc, argv,
6655 "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XY")) != -1) {
6656 switch (c) {
6657 case 'b':
6658 case 'c':
6659 case 'C':
6660 case 'd':
6661 case 'D':
6662 case 'E':
6663 case 'G':
6664 case 'h':
6665 case 'i':
6666 case 'l':
6667 case 'm':
6668 case 'M':
6669 case 'O':
6670 case 'R':
6671 case 's':
6672 case 'S':
6673 case 'u':
6674 dump_opt[c]++;
6675 dump_all = 0;
6676 break;
6677 case 'A':
6678 case 'e':
6679 case 'F':
6680 case 'k':
6681 case 'L':
6682 case 'P':
6683 case 'q':
6684 case 'X':
6685 dump_opt[c]++;
6686 break;
6687 case 'Y':
6688 zfs_reconstruct_indirect_combinations_max = INT_MAX;
6689 zfs_deadman_enabled = 0;
6690 break;
6691 /* NB: Sort single match options below. */
6692 case 'I':
6693 max_inflight_bytes = strtoull(optarg, NULL, 0);
6694 if (max_inflight_bytes == 0) {
6695 (void) fprintf(stderr, "maximum number "
6696 "of inflight bytes must be greater "
6697 "than 0\n");
6698 usage();
6699 }
6700 break;
6701 case 'o':
6702 error = set_global_var(optarg);
6703 if (error != 0)
6704 usage();
6705 break;
6706 case 'p':
6707 if (searchdirs == NULL) {
6708 searchdirs = umem_alloc(sizeof (char *),
6709 UMEM_NOFAIL);
6710 } else {
6711 char **tmp = umem_alloc((nsearch + 1) *
6712 sizeof (char *), UMEM_NOFAIL);
6713 bcopy(searchdirs, tmp, nsearch *
6714 sizeof (char *));
6715 umem_free(searchdirs,
6716 nsearch * sizeof (char *));
6717 searchdirs = tmp;
6718 }
6719 searchdirs[nsearch++] = optarg;
6720 break;
6721 case 't':
6722 max_txg = strtoull(optarg, NULL, 0);
6723 if (max_txg < TXG_INITIAL) {
6724 (void) fprintf(stderr, "incorrect txg "
6725 "specified: %s\n", optarg);
6726 usage();
6727 }
6728 break;
6729 case 'U':
6730 spa_config_path = optarg;
6731 if (spa_config_path[0] != '/') {
6732 (void) fprintf(stderr,
6733 "cachefile must be an absolute path "
6734 "(i.e. start with a slash)\n");
6735 usage();
6736 }
6737 break;
6738 case 'v':
6739 verbose++;
6740 break;
6741 case 'V':
6742 flags = ZFS_IMPORT_VERBATIM;
6743 break;
6744 case 'x':
6745 vn_dumpdir = optarg;
6746 break;
6747 default:
6748 usage();
6749 break;
6750 }
6751 }
6752
6753 if (!dump_opt['e'] && searchdirs != NULL) {
6754 (void) fprintf(stderr, "-p option requires use of -e\n");
6755 usage();
6756 }
6757
6758 #if defined(_LP64)
6759 /*
6760 * ZDB does not typically re-read blocks; therefore limit the ARC
6761 * to 256 MB, which can be used entirely for metadata.
6762 */
6763 zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;
6764 #endif
6765
6766 /*
6767 * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
6768 * "zdb -b" uses traversal prefetch which uses async reads.
6769 * For good performance, let several of them be active at once.
6770 */
6771 zfs_vdev_async_read_max_active = 10;
6772
6773 /*
6774 * Disable reference tracking for better performance.
6775 */
6776 reference_tracking_enable = B_FALSE;
6777
6778 /*
6779 * Do not fail spa_load when spa_load_verify fails. This is needed
6780 * to load non-idle pools.
6781 */
6782 spa_load_verify_dryrun = B_TRUE;
6783
6784 kernel_init(SPA_MODE_READ);
6785
6786 if (dump_all)
6787 verbose = MAX(verbose, 1);
6788
6789 for (c = 0; c < 256; c++) {
6790 if (dump_all && strchr("AeEFklLOPRSX", c) == NULL)
6791 dump_opt[c] = 1;
6792 if (dump_opt[c])
6793 dump_opt[c] += verbose;
6794 }
6795
6796 aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2);
6797 zfs_recover = (dump_opt['A'] > 1);
6798
6799 argc -= optind;
6800 argv += optind;
6801
6802 if (argc < 2 && dump_opt['R'])
6803 usage();
6804
6805 if (dump_opt['E']) {
6806 if (argc != 1)
6807 usage();
6808 zdb_embedded_block(argv[0]);
6809 return (0);
6810 }
6811
6812 if (argc < 1) {
6813 if (!dump_opt['e'] && dump_opt['C']) {
6814 dump_cachefile(spa_config_path);
6815 return (0);
6816 }
6817 usage();
6818 }
6819
6820 if (dump_opt['l'])
6821 return (dump_label(argv[0]));
6822
6823 if (dump_opt['O']) {
6824 if (argc != 2)
6825 usage();
6826 dump_opt['v'] = verbose + 3;
6827 return (dump_path(argv[0], argv[1]));
6828 }
6829
6830 if (dump_opt['X'] || dump_opt['F'])
6831 rewind = ZPOOL_DO_REWIND |
6832 (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
6833
6834 if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
6835 nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 ||
6836 nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0)
6837 fatal("internal error: %s", strerror(ENOMEM));
6838
6839 error = 0;
6840 target = argv[0];
6841
6842 if (strpbrk(target, "/@") != NULL) {
6843 size_t targetlen;
6844
6845 target_pool = strdup(target);
6846 *strpbrk(target_pool, "/@") = '\0';
6847
6848 target_is_spa = B_FALSE;
6849 targetlen = strlen(target);
6850 if (targetlen && target[targetlen - 1] == '/')
6851 target[targetlen - 1] = '\0';
6852 } else {
6853 target_pool = target;
6854 }
6855
6856 if (dump_opt['e']) {
6857 importargs_t args = { 0 };
6858
6859 args.paths = nsearch;
6860 args.path = searchdirs;
6861 args.can_be_active = B_TRUE;
6862
6863 error = zpool_find_config(NULL, target_pool, &cfg, &args,
6864 &libzpool_config_ops);
6865
6866 if (error == 0) {
6867
6868 if (nvlist_add_nvlist(cfg,
6869 ZPOOL_LOAD_POLICY, policy) != 0) {
6870 fatal("can't open '%s': %s",
6871 target, strerror(ENOMEM));
6872 }
6873
6874 if (dump_opt['C'] > 1) {
6875 (void) printf("\nConfiguration for import:\n");
6876 dump_nvlist(cfg, 8);
6877 }
6878
6879 /*
6880 * Disable the activity check to allow examination of
6881 * active pools.
6882 */
6883 error = spa_import(target_pool, cfg, NULL,
6884 flags | ZFS_IMPORT_SKIP_MMP);
6885 }
6886 }
6887
6888 /*
6889 * import_checkpointed_state makes the assumption that the
6890 * target pool that we pass it is already part of the spa
6891 * namespace. Because of that, we must always call it
6892 * after the -e option has been processed, which
6893 * imports the pool to the namespace if it's not in the
6894 * cachefile.
6895 */
6896 char *checkpoint_pool = NULL;
6897 char *checkpoint_target = NULL;
6898 if (dump_opt['k']) {
6899 checkpoint_pool = import_checkpointed_state(target, cfg,
6900 &checkpoint_target);
6901
6902 if (checkpoint_target != NULL)
6903 target = checkpoint_target;
6904 }
6905
6906 if (target_pool != target)
6907 free(target_pool);
6908
6909 if (error == 0) {
6910 if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
6911 ASSERT(checkpoint_pool != NULL);
6912 ASSERT(checkpoint_target == NULL);
6913
6914 error = spa_open(checkpoint_pool, &spa, FTAG);
6915 if (error != 0) {
6916 fatal("Tried to open pool \"%s\" but "
6917 "spa_open() failed with error %d\n",
6918 checkpoint_pool, error);
6919 }
6920
6921 } else if (target_is_spa || dump_opt['R']) {
6922 zdb_set_skip_mmp(target);
6923 error = spa_open_rewind(target, &spa, FTAG, policy,
6924 NULL);
6925 if (error) {
6926 /*
6927 * If we're missing the log device then
6928 * try opening the pool after clearing the
6929 * log state.
6930 */
6931 mutex_enter(&spa_namespace_lock);
6932 if ((spa = spa_lookup(target)) != NULL &&
6933 spa->spa_log_state == SPA_LOG_MISSING) {
6934 spa->spa_log_state = SPA_LOG_CLEAR;
6935 error = 0;
6936 }
6937 mutex_exit(&spa_namespace_lock);
6938
6939 if (!error) {
6940 error = spa_open_rewind(target, &spa,
6941 FTAG, policy, NULL);
6942 }
6943 }
6944 } else if (strpbrk(target, "#") != NULL) {
6945 dsl_pool_t *dp;
6946 error = dsl_pool_hold(target, FTAG, &dp);
6947 if (error != 0) {
6948 fatal("can't dump '%s': %s", target,
6949 strerror(error));
6950 }
6951 error = dump_bookmark(dp, target, B_TRUE, verbose > 1);
6952 dsl_pool_rele(dp, FTAG);
6953 if (error != 0) {
6954 fatal("can't dump '%s': %s", target,
6955 strerror(error));
6956 }
6957 return (error);
6958 } else {
6959 zdb_set_skip_mmp(target);
6960 error = open_objset(target, FTAG, &os);
6961 if (error == 0)
6962 spa = dmu_objset_spa(os);
6963 }
6964 }
6965 nvlist_free(policy);
6966
6967 if (error)
6968 fatal("can't open '%s': %s", target, strerror(error));
6969
6970 /*
6971 * Set the pool failure mode to panic in order to prevent the pool
6972 * from suspending. A suspended I/O will have no way to resume and
6973 * can prevent the zdb(8) command from terminating as expected.
6974 */
6975 if (spa != NULL)
6976 spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
6977
6978 argv++;
6979 argc--;
6980 if (!dump_opt['R']) {
6981 if (argc > 0) {
6982 zopt_objects = argc;
6983 zopt_object = calloc(zopt_objects, sizeof (uint64_t));
6984 for (unsigned i = 0; i < zopt_objects; i++) {
6985 errno = 0;
6986 zopt_object[i] = strtoull(argv[i], NULL, 0);
6987 if (zopt_object[i] == 0 && errno != 0)
6988 fatal("bad number %s: %s",
6989 argv[i], strerror(errno));
6990 }
6991 }
6992 if (os != NULL) {
6993 dump_objset(os);
6994 } else if (zopt_objects > 0 && !dump_opt['m']) {
6995 dump_objset(spa->spa_meta_objset);
6996 } else {
6997 dump_zpool(spa);
6998 }
6999 } else {
7000 flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
7001 flagbits['c'] = ZDB_FLAG_CHECKSUM;
7002 flagbits['d'] = ZDB_FLAG_DECOMPRESS;
7003 flagbits['e'] = ZDB_FLAG_BSWAP;
7004 flagbits['g'] = ZDB_FLAG_GBH;
7005 flagbits['i'] = ZDB_FLAG_INDIRECT;
7006 flagbits['p'] = ZDB_FLAG_PHYS;
7007 flagbits['r'] = ZDB_FLAG_RAW;
7008
7009 for (int i = 0; i < argc; i++)
7010 zdb_read_block(argv[i], spa);
7011 }
7012
7013 if (dump_opt['k']) {
7014 free(checkpoint_pool);
7015 if (!target_is_spa)
7016 free(checkpoint_target);
7017 }
7018
7019 if (os != NULL) {
7020 close_objset(os, FTAG);
7021 } else {
7022 spa_close(spa, FTAG);
7023 }
7024
7025 fuid_table_destroy();
7026
7027 dump_debug_buffer();
7028
7029 kernel_fini();
7030
7031 return (error);
7032 }