]>
git.proxmox.com Git - mirror_zfs.git/blob - tests/zfs-tests/cmd/draid/draid.c
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2018 Intel Corporation.
23 * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
28 #include <zfs_fletcher.h>
29 #include <sys/vdev_draid.h>
30 #include <sys/nvpair.h>
34 * The number of rows to generate for new permutation maps.
36 #define MAP_ROWS_DEFAULT 256
39 * Key values for dRAID maps when stored as nvlists.
41 #define MAP_SEED "seed"
42 #define MAP_CHECKSUM "checksum"
43 #define MAP_WORST_RATIO "worst_ratio"
44 #define MAP_AVG_RATIO "avg_ratio"
45 #define MAP_CHILDREN "children"
46 #define MAP_NPERMS "nperms"
47 #define MAP_PERMS "perms"
52 (void) fprintf(stderr
,
53 "usage: draid command args ...\n"
54 "Available commands are:\n"
56 "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n"
57 "\tdraid verify [-rv] FILE\n"
58 "\tdraid dump [-v] [-m min] [-n max] FILE\n"
59 "\tdraid table FILE\n"
60 "\tdraid merge FILE SRC SRC...\n");
65 read_map(const char *filename
, nvlist_t
**allcfgs
)
67 int block_size
= 131072;
68 int buf_size
= 131072;
73 if (lstat64(filename
, &stat
) != 0)
76 if (stat
.st_size
== 0 ||
77 !(S_ISREG(stat
.st_mode
) || S_ISLNK(stat
.st_mode
))) {
81 gzFile fp
= gzopen(filename
, "rb");
85 char *buf
= malloc(buf_size
);
91 ssize_t rc
, bytes
= 0;
93 rc
= gzread(fp
, buf
+ bytes
, block_size
);
94 if ((rc
< 0) || (rc
== 0 && !gzeof(fp
))) {
97 (void) gzerror(fp
, &error
);
102 if (bytes
+ block_size
>= buf_size
) {
103 tmp_size
= 2 * buf_size
;
104 tmp_buf
= malloc(tmp_size
);
105 if (tmp_buf
== NULL
) {
111 memcpy(tmp_buf
, buf
, bytes
);
121 error
= nvlist_unpack(buf
, bytes
, allcfgs
, 0);
128 * Read a map from the specified filename. A file contains multiple maps
129 * which are indexed by the number of children. The caller is responsible
130 * for freeing the configuration returned.
133 read_map_key(const char *filename
, char *key
, nvlist_t
**cfg
)
135 nvlist_t
*allcfgs
, *foundcfg
= NULL
;
138 error
= read_map(filename
, &allcfgs
);
142 nvlist_lookup_nvlist(allcfgs
, key
, &foundcfg
);
143 if (foundcfg
!= NULL
) {
144 nvlist_dup(foundcfg
, cfg
, KM_SLEEP
);
150 nvlist_free(allcfgs
);
156 * Write all mappings to the map file.
159 write_map(const char *filename
, nvlist_t
*allcfgs
)
164 error
= nvlist_size(allcfgs
, &buflen
, NV_ENCODE_XDR
);
168 char *buf
= malloc(buflen
);
172 error
= nvlist_pack(allcfgs
, &buf
, &buflen
, NV_ENCODE_XDR
, KM_SLEEP
);
179 * Atomically update the file using a temporary file and the
180 * traditional unlink then rename steps. This code provides
181 * no locking, it only guarantees the packed nvlist on disk
182 * is updated atomically and is internally consistent.
184 char *tmpname
= calloc(MAXPATHLEN
, 1);
185 if (tmpname
== NULL
) {
190 snprintf(tmpname
, MAXPATHLEN
- 1, "%s.XXXXXX", filename
);
192 int fd
= mkstemp(tmpname
);
201 gzFile fp
= gzopen(tmpname
, "w9b");
209 ssize_t rc
, bytes
= 0;
210 while (bytes
< buflen
) {
211 size_t size
= MIN(buflen
- bytes
, 131072);
212 rc
= gzwrite(fp
, buf
+ bytes
, size
);
215 (void) gzerror(fp
, &error
);
217 (void) unlink(tmpname
);
220 } else if (rc
== 0) {
230 if (bytes
!= buflen
) {
231 (void) unlink(tmpname
);
237 * Unlink the previous config file and replace it with the updated
238 * version. If we're able to unlink the file then the directory is
239 * writable by us and the subsequent rename should never fail.
241 error
= unlink(filename
);
242 if (error
!= 0 && errno
!= ENOENT
) {
244 (void) unlink(tmpname
);
249 error
= rename(tmpname
, filename
);
252 (void) unlink(tmpname
);
263 * Add the dRAID map to the file and write it out.
266 write_map_key(const char *filename
, char *key
, draid_map_t
*map
,
267 double worst_ratio
, double avg_ratio
)
269 nvlist_t
*nv_cfg
, *allcfgs
;
273 * Add the configuration to an existing or new file. The new
274 * configuration will replace an existing configuration with the
275 * same key if it has a lower ratio and is therefore better.
277 error
= read_map(filename
, &allcfgs
);
278 if (error
== ENOENT
) {
279 allcfgs
= fnvlist_alloc();
280 } else if (error
!= 0) {
284 error
= nvlist_lookup_nvlist(allcfgs
, key
, &nv_cfg
);
286 uint64_t nv_cfg_worst_ratio
= fnvlist_lookup_uint64(nv_cfg
,
288 double nv_worst_ratio
= (double)nv_cfg_worst_ratio
/ 1000.0;
290 if (worst_ratio
< nv_worst_ratio
) {
291 /* Replace old map with the more balanced new map. */
292 fnvlist_remove(allcfgs
, key
);
294 /* The old map is preferable, keep it. */
295 nvlist_free(allcfgs
);
300 nvlist_t
*cfg
= fnvlist_alloc();
301 fnvlist_add_uint64(cfg
, MAP_SEED
, map
->dm_seed
);
302 fnvlist_add_uint64(cfg
, MAP_CHECKSUM
, map
->dm_checksum
);
303 fnvlist_add_uint64(cfg
, MAP_CHILDREN
, map
->dm_children
);
304 fnvlist_add_uint64(cfg
, MAP_NPERMS
, map
->dm_nperms
);
305 fnvlist_add_uint8_array(cfg
, MAP_PERMS
, map
->dm_perms
,
306 map
->dm_children
* map
->dm_nperms
* sizeof (uint8_t));
308 fnvlist_add_uint64(cfg
, MAP_WORST_RATIO
,
309 (uint64_t)(worst_ratio
* 1000.0));
310 fnvlist_add_uint64(cfg
, MAP_AVG_RATIO
,
311 (uint64_t)(avg_ratio
* 1000.0));
313 error
= nvlist_add_nvlist(allcfgs
, key
, cfg
);
315 error
= write_map(filename
, allcfgs
);
318 nvlist_free(allcfgs
);
323 dump_map(draid_map_t
*map
, char *key
, double worst_ratio
, double avg_ratio
,
328 } else if (verbose
== 1) {
329 printf(" \"%s\": seed: 0x%016llx worst_ratio: %2.03f "
330 "avg_ratio: %2.03f\n", key
, (u_longlong_t
)map
->dm_seed
,
331 worst_ratio
, avg_ratio
);
336 " checksum: 0x%016llx\n"
337 " worst_ratio: %2.03f\n"
338 " avg_ratio: %2.03f\n"
341 key
, (u_longlong_t
)map
->dm_seed
,
342 (u_longlong_t
)map
->dm_checksum
, worst_ratio
, avg_ratio
,
343 (u_longlong_t
)map
->dm_children
,
344 (u_longlong_t
)map
->dm_nperms
);
347 printf(" perms = {\n");
348 for (int i
= 0; i
< map
->dm_nperms
; i
++) {
350 for (int j
= 0; j
< map
->dm_children
; j
++) {
351 printf("%3d%s ", map
->dm_perms
[
352 i
* map
->dm_children
+ j
],
353 j
< map
->dm_children
- 1 ?
359 } else if (verbose
== 2) {
360 printf(" draid_perms = <omitted>\n");
366 dump_map_nv(char *key
, nvlist_t
*cfg
, int verbose
)
371 uint64_t worst_ratio
= fnvlist_lookup_uint64(cfg
, MAP_WORST_RATIO
);
372 uint64_t avg_ratio
= fnvlist_lookup_uint64(cfg
, MAP_AVG_RATIO
);
374 map
.dm_seed
= fnvlist_lookup_uint64(cfg
, MAP_SEED
);
375 map
.dm_checksum
= fnvlist_lookup_uint64(cfg
, MAP_CHECKSUM
);
376 map
.dm_children
= fnvlist_lookup_uint64(cfg
, MAP_CHILDREN
);
377 map
.dm_nperms
= fnvlist_lookup_uint64(cfg
, MAP_NPERMS
);
378 nvlist_lookup_uint8_array(cfg
, MAP_PERMS
, &map
.dm_perms
, &c
);
380 dump_map(&map
, key
, (double)worst_ratio
/ 1000.0,
381 avg_ratio
/ 1000.0, verbose
);
385 * Print a summary of the mapping.
388 dump_map_key(const char *filename
, char *key
, int verbose
)
393 error
= read_map_key(filename
, key
, &cfg
);
397 dump_map_nv(key
, cfg
, verbose
);
403 * Allocate a new permutation map for evaluation.
406 alloc_new_map(uint64_t children
, uint64_t nperms
, uint64_t seed
,
412 map
= malloc(sizeof (draid_map_t
));
416 map
->dm_children
= children
;
417 map
->dm_nperms
= nperms
;
419 map
->dm_checksum
= 0;
421 error
= vdev_draid_generate_perms(map
, &map
->dm_perms
);
433 * Allocate the fixed permutation map for N children.
436 alloc_fixed_map(uint64_t children
, draid_map_t
**mapp
)
438 const draid_map_t
*fixed_map
;
442 error
= vdev_draid_lookup_map(children
, &fixed_map
);
446 map
= malloc(sizeof (draid_map_t
));
450 memcpy(map
, fixed_map
, sizeof (draid_map_t
));
451 VERIFY3U(map
->dm_checksum
, !=, 0);
453 error
= vdev_draid_generate_perms(map
, &map
->dm_perms
);
465 * Free a permutation map.
468 free_map(draid_map_t
*map
)
475 * Check if dev is in the provided list of faulted devices.
477 static inline boolean_t
478 is_faulted(int *faulted_devs
, int nfaulted
, int dev
)
480 for (int i
= 0; i
< nfaulted
; i
++)
481 if (faulted_devs
[i
] == dev
)
488 * Evaluate how resilvering I/O will be distributed given a list of faulted
489 * vdevs. As a simplification we assume one IO is sufficient to repair each
490 * damaged device in a group.
493 eval_resilver(draid_map_t
*map
, uint64_t groupwidth
, uint64_t nspares
,
494 int *faulted_devs
, int nfaulted
, int *min_child_ios
, int *max_child_ios
)
496 uint64_t children
= map
->dm_children
;
497 uint64_t ngroups
= 1;
498 uint64_t ndisks
= children
- nspares
;
501 * Calculate the minimum number of groups required to fill a slice.
503 while (ngroups
* (groupwidth
) % (children
- nspares
) != 0)
506 int *ios
= calloc(map
->dm_children
, sizeof (uint64_t));
508 /* Resilver all rows */
509 for (int i
= 0; i
< map
->dm_nperms
; i
++) {
510 uint8_t *row
= &map
->dm_perms
[i
* map
->dm_children
];
512 /* Resilver all groups with faulted drives */
513 for (int j
= 0; j
< ngroups
; j
++) {
514 uint64_t spareidx
= map
->dm_children
- nspares
;
515 boolean_t repair_needed
= B_FALSE
;
517 /* See if any devices in this group are faulted */
518 uint64_t groupstart
= (j
* groupwidth
) % ndisks
;
520 for (int k
= 0; k
< groupwidth
; k
++) {
521 uint64_t groupidx
= (groupstart
+ k
) % ndisks
;
523 repair_needed
= is_faulted(faulted_devs
,
524 nfaulted
, row
[groupidx
]);
529 if (repair_needed
== B_FALSE
)
533 * This group is degraded. Calculate the number of
534 * reads the non-faulted drives require and the number
535 * of writes to the distributed hot spare for this row.
537 for (int k
= 0; k
< groupwidth
; k
++) {
538 uint64_t groupidx
= (groupstart
+ k
) % ndisks
;
540 if (!is_faulted(faulted_devs
, nfaulted
,
542 ios
[row
[groupidx
]]++;
543 } else if (nspares
> 0) {
544 while (is_faulted(faulted_devs
,
545 nfaulted
, row
[spareidx
])) {
549 ASSERT3U(spareidx
, <, map
->dm_children
);
550 ios
[row
[spareidx
]]++;
557 *min_child_ios
= INT_MAX
;
561 * Find the drives with fewest and most required I/O. These values
562 * are used to calculate the imbalance ratio. To avoid returning an
563 * infinite value for permutations which have children that perform
564 * no IO a floor of 1 IO per child is set. This ensures a meaningful
565 * ratio is returned for comparison, and this is not uncommon when
566 * there are a large number of children.
568 for (int i
= 0; i
< map
->dm_children
; i
++) {
570 if (is_faulted(faulted_devs
, nfaulted
, i
)) {
578 if (ios
[i
] < *min_child_ios
)
579 *min_child_ios
= ios
[i
];
581 if (ios
[i
] > *max_child_ios
)
582 *max_child_ios
= ios
[i
];
585 ASSERT3S(*min_child_ios
, !=, INT_MAX
);
586 ASSERT3S(*max_child_ios
, !=, 0);
588 double ratio
= (double)(*max_child_ios
) / (double)(*min_child_ios
);
596 * Evaluate the quality of the permutation mapping by considering possible
597 * device failures. Returns the imbalance ratio for the worst mapping which
598 * is defined to be the largest number of child IOs over the fewest number
599 * of child IOs. A value of 1.0 indicates the mapping is perfectly balanced and
600 * all children perform an equal amount of work during reconstruction.
603 eval_decluster(draid_map_t
*map
, double *worst_ratiop
, double *avg_ratiop
)
605 uint64_t children
= map
->dm_children
;
606 double worst_ratio
= 1.0;
608 int worst_min_ios
= 0, worst_max_ios
= 0;
612 * When there are only 2 children there can be no distributed
613 * spare and no resilver to evaluate. Default to a ratio of 1.0
614 * for this degenerate case.
616 if (children
== VDEV_DRAID_MIN_CHILDREN
) {
623 * Score the mapping as if it had either 1 or 2 distributed spares.
625 for (int nspares
= 1; nspares
<= 2; nspares
++) {
626 uint64_t faults
= nspares
;
629 * Score groupwidths up to 19. This value was chosen as the
630 * largest reasonable width (16d+3p). dRAID pools may still
631 * be created with wider stripes but they are not considered in
632 * this analysis in order to optimize for the most common cases.
634 for (uint64_t groupwidth
= 2;
635 groupwidth
<= MIN(children
- nspares
, 19);
638 int min_ios
, max_ios
;
641 * Score possible device faults. This is limited
642 * to exactly one fault per distributed spare for
643 * the purposes of this simulation.
645 for (int f1
= 0; f1
< children
; f1
++) {
646 faulted_devs
[0] = f1
;
650 ratio
= eval_resilver(map
, groupwidth
,
651 nspares
, faulted_devs
, faults
,
654 if (ratio
> worst_ratio
) {
656 worst_min_ios
= min_ios
;
657 worst_max_ios
= max_ios
;
662 } else if (faults
== 2) {
663 for (int f2
= f1
+ 1; f2
< children
;
665 faulted_devs
[1] = f2
;
667 ratio
= eval_resilver(map
,
669 faulted_devs
, faults
,
672 if (ratio
> worst_ratio
) {
674 worst_min_ios
= min_ios
;
675 worst_max_ios
= max_ios
;
686 *worst_ratiop
= worst_ratio
;
687 *avg_ratiop
= sum
/ n
;
690 * Log the min/max io values for particularly unbalanced maps.
691 * Since the maps are generated entirely randomly these are possible
692 * but exceedingly unlikely. We log it for possible investigation.
694 if (worst_ratio
> 100.0) {
695 dump_map(map
, "DEBUG", worst_ratio
, *avg_ratiop
, 2);
696 printf("worst_min_ios=%d worst_max_ios=%d\n",
697 worst_min_ios
, worst_max_ios
);
702 eval_maps(uint64_t children
, int passes
, uint64_t *map_seed
,
703 draid_map_t
**best_mapp
, double *best_ratiop
, double *avg_ratiop
)
705 draid_map_t
*best_map
= NULL
;
706 double best_worst_ratio
= 1000.0;
707 double best_avg_ratio
= 1000.0;
710 * Perform the requested number of passes evaluating randomly
711 * generated permutation maps. Only the best version is kept.
713 for (int i
= 0; i
< passes
; i
++) {
714 double worst_ratio
, avg_ratio
;
719 * Calculate the next seed and generate a new candidate map.
721 error
= alloc_new_map(children
, MAP_ROWS_DEFAULT
,
722 vdev_draid_rand(map_seed
), &map
);
727 * Consider maps with a lower worst_ratio to be of higher
728 * quality. Some maps may have a lower avg_ratio but they
729 * are discarded since they might include some particularly
730 * imbalanced permutations. The average is tracked in
731 * order to get a sense of the average permutation quality.
733 eval_decluster(map
, &worst_ratio
, &avg_ratio
);
735 if (best_map
== NULL
|| worst_ratio
< best_worst_ratio
) {
737 if (best_map
!= NULL
)
741 best_worst_ratio
= worst_ratio
;
742 best_avg_ratio
= avg_ratio
;
749 * After determining the best map generate a checksum over the full
750 * permutation array. This checksum is verified when opening a dRAID
751 * pool to ensure the generated in memory permutations are correct.
754 fletcher_4_native_varsize(best_map
->dm_perms
,
755 sizeof (uint8_t) * best_map
->dm_children
* best_map
->dm_nperms
,
757 best_map
->dm_checksum
= cksum
.zc_word
[0];
759 *best_mapp
= best_map
;
760 *best_ratiop
= best_worst_ratio
;
761 *avg_ratiop
= best_avg_ratio
;
767 draid_generate(int argc
, char *argv
[])
769 char filename
[MAXPATHLEN
];
771 int c
, fd
, error
, verbose
= 0, passes
= 1, continuous
= 0;
772 int min_children
= VDEV_DRAID_MIN_CHILDREN
;
773 int max_children
= VDEV_DRAID_MAX_CHILDREN
;
776 while ((c
= getopt(argc
, argv
, ":cm:n:p:v")) != -1) {
782 min_children
= (int)strtol(optarg
, NULL
, 0);
783 if (min_children
< VDEV_DRAID_MIN_CHILDREN
) {
784 (void) fprintf(stderr
, "A minimum of 2 "
785 "children are required.\n");
791 max_children
= (int)strtol(optarg
, NULL
, 0);
792 if (max_children
> VDEV_DRAID_MAX_CHILDREN
) {
793 (void) fprintf(stderr
, "A maximum of %d "
794 "children are allowed.\n",
795 VDEV_DRAID_MAX_CHILDREN
);
800 passes
= (int)strtol(optarg
, NULL
, 0);
804 * 0 - Only log when a better map is added to the file.
805 * 1 - Log the current best map for each child count.
806 * Minimal output on a single summary line.
807 * 2 - Log the current best map for each child count.
808 * More verbose includes most map fields.
809 * 3 - Log the current best map for each child count.
810 * Very verbose all fields including the full map.
815 (void) fprintf(stderr
,
816 "missing argument for '%c' option\n", optopt
);
820 (void) fprintf(stderr
, "invalid option '%c'\n",
828 bzero(filename
, MAXPATHLEN
);
829 strncpy(filename
, argv
[optind
], MAXPATHLEN
- 1);
831 (void) fprintf(stderr
, "A FILE must be specified.\n");
837 * Start with a fresh seed from /dev/urandom.
839 fd
= open("/dev/urandom", O_RDONLY
);
841 printf("Unable to open /dev/urandom: %s\n:", strerror(errno
));
844 ssize_t bytes
= sizeof (map_seed
);
845 ssize_t bytes_read
= 0;
847 while (bytes_read
< bytes
) {
848 ssize_t rc
= read(fd
, ((char *)&map_seed
) + bytes_read
,
851 printf("Unable to read /dev/urandom: %s\n:",
862 printf("Writing generated mappings to '%s':\n", filename
);
865 * Generate maps for all requested child counts. The best map for
866 * each child count is written out to the specified file. If the file
867 * already contains a better mapping this map will not be added.
869 for (uint64_t children
= min_children
;
870 children
<= max_children
; children
++) {
873 double worst_ratio
= 1000.0;
874 double avg_ratio
= 1000.0;
876 error
= eval_maps(children
, passes
, &map_seed
, &map
,
877 &worst_ratio
, &avg_ratio
);
879 printf("Error eval_maps(): %s\n", strerror(error
));
883 if (worst_ratio
< 1.0 || avg_ratio
< 1.0) {
884 printf("Error ratio < 1.0: worst_ratio = %2.03f "
885 "avg_ratio = %2.03f\n", worst_ratio
, avg_ratio
);
889 snprintf(key
, 7, "%llu", (u_longlong_t
)children
);
890 error
= write_map_key(filename
, key
, map
, worst_ratio
,
893 /* The new map was added to the file. */
894 dump_map(map
, key
, worst_ratio
, avg_ratio
,
896 } else if (error
== EEXIST
) {
897 /* The existing map was preferable and kept. */
899 dump_map_key(filename
, key
, verbose
);
901 printf("Error write_map_key(): %s\n", strerror(error
));
909 * When the continuous option is set restart at the minimum number of
910 * children instead of exiting. This option is useful as a mechanism
911 * to continuously try and refine the discovered permutations.
915 printf("Restarting by request (-c): %d\n", restarts
);
923 * Verify each map in the file by generating its in-memory permutation array
924 * and confirming its checksum is correct.
927 draid_verify(int argc
, char *argv
[])
929 char filename
[MAXPATHLEN
];
930 int n
= 0, c
, error
, verbose
= 1;
931 int check_ratios
= 0;
933 while ((c
= getopt(argc
, argv
, ":rv")) != -1) {
942 (void) fprintf(stderr
,
943 "missing argument for '%c' option\n", optopt
);
947 (void) fprintf(stderr
, "invalid option '%c'\n",
955 char *abspath
= malloc(MAXPATHLEN
);
959 bzero(filename
, MAXPATHLEN
);
960 if (realpath(argv
[optind
], abspath
) != NULL
)
961 strncpy(filename
, abspath
, MAXPATHLEN
- 1);
963 strncpy(filename
, argv
[optind
], MAXPATHLEN
- 1);
967 (void) fprintf(stderr
, "A FILE must be specified.\n");
971 printf("Verifying permutation maps: '%s'\n", filename
);
974 * Look up the hardcoded permutation map for each valid number of children
975 * and verify a generated map has the correct checksum. Then compare
976 * the generated map values with the nvlist map values read from the
977 * reference file to cross-check the permutation.
979 for (uint64_t children
= VDEV_DRAID_MIN_CHILDREN
;
980 children
<= VDEV_DRAID_MAX_CHILDREN
;
986 snprintf(key
, 8, "%llu", (u_longlong_t
)children
);
988 error
= alloc_fixed_map(children
, &map
);
990 printf("Error alloc_fixed_map() failed: %s\n",
991 error
== ECKSUM
? "Invalid checksum" :
996 uint64_t nv_seed
, nv_checksum
, nv_children
, nv_nperms
;
1001 error
= read_map_key(filename
, key
, &cfg
);
1003 printf("Error read_map_key() failed: %s\n",
1009 nv_seed
= fnvlist_lookup_uint64(cfg
, MAP_SEED
);
1010 nv_checksum
= fnvlist_lookup_uint64(cfg
, MAP_CHECKSUM
);
1011 nv_children
= fnvlist_lookup_uint64(cfg
, MAP_CHILDREN
);
1012 nv_nperms
= fnvlist_lookup_uint64(cfg
, MAP_NPERMS
);
1013 nvlist_lookup_uint8_array(cfg
, MAP_PERMS
, &nv_perms
, &c
);
1016 * Compare draid_map_t and nvlist reference values.
1018 if (map
->dm_seed
!= nv_seed
) {
1019 printf("Error different seeds: 0x%016llx != "
1020 "0x%016llx\n", (u_longlong_t
)map
->dm_seed
,
1021 (u_longlong_t
)nv_seed
);
1025 if (map
->dm_checksum
!= nv_checksum
) {
1026 printf("Error different checksums: 0x%016llx "
1028 (u_longlong_t
)map
->dm_checksum
,
1029 (u_longlong_t
)nv_checksum
);
1033 if (map
->dm_children
!= nv_children
) {
1034 printf("Error different children: %llu "
1035 "!= %llu\n", (u_longlong_t
)map
->dm_children
,
1036 (u_longlong_t
)nv_children
);
1040 if (map
->dm_nperms
!= nv_nperms
) {
1041 printf("Error different nperms: %llu "
1042 "!= %llu\n", (u_longlong_t
)map
->dm_nperms
,
1043 (u_longlong_t
)nv_nperms
);
1047 for (uint64_t i
= 0; i
< nv_children
* nv_nperms
; i
++) {
1048 if (map
->dm_perms
[i
] != nv_perms
[i
]) {
1049 printf("Error different perms[%llu]: "
1050 "%d != %d\n", (u_longlong_t
)i
,
1051 (int)map
->dm_perms
[i
],
1059 * For good measure recalculate the worst and average
1060 * ratios and confirm they match the nvlist values.
1063 uint64_t nv_worst_ratio
, nv_avg_ratio
;
1064 double worst_ratio
, avg_ratio
;
1066 eval_decluster(map
, &worst_ratio
, &avg_ratio
);
1068 nv_worst_ratio
= fnvlist_lookup_uint64(cfg
,
1070 nv_avg_ratio
= fnvlist_lookup_uint64(cfg
,
1073 if (worst_ratio
< 1.0 || avg_ratio
< 1.0) {
1074 printf("Error ratio out of range %2.03f, "
1075 "%2.03f\n", worst_ratio
, avg_ratio
);
1079 if ((uint64_t)(worst_ratio
* 1000.0) !=
1081 printf("Error different worst_ratio %2.03f "
1082 "!= %2.03f\n", (double)nv_worst_ratio
/
1083 1000.0, worst_ratio
);
1087 if ((uint64_t)(avg_ratio
* 1000.0) != nv_avg_ratio
) {
1088 printf("Error different average_ratio %2.03f "
1089 "!= %2.03f\n", (double)nv_avg_ratio
/
1102 printf("- %llu children: good\n",
1103 (u_longlong_t
)children
);
1111 if (n
!= (VDEV_DRAID_MAX_CHILDREN
- 1)) {
1112 printf("Error permutation maps missing: %d / %d checked\n",
1113 n
, VDEV_DRAID_MAX_CHILDREN
- 1);
1117 printf("Successfully verified %d / %d permutation maps\n",
1118 n
, VDEV_DRAID_MAX_CHILDREN
- 1);
1124 * Dump the contents of the specified mapping(s) for inspection.
1127 draid_dump(int argc
, char *argv
[])
1129 char filename
[MAXPATHLEN
];
1130 int c
, error
, verbose
= 1;
1131 int min_children
= VDEV_DRAID_MIN_CHILDREN
;
1132 int max_children
= VDEV_DRAID_MAX_CHILDREN
;
1134 while ((c
= getopt(argc
, argv
, ":vm:n:")) != -1) {
1137 min_children
= (int)strtol(optarg
, NULL
, 0);
1138 if (min_children
< 2) {
1139 (void) fprintf(stderr
, "A minimum of 2 "
1140 "children are required.\n");
1146 max_children
= (int)strtol(optarg
, NULL
, 0);
1147 if (max_children
> VDEV_DRAID_MAX_CHILDREN
) {
1148 (void) fprintf(stderr
, "A maximum of %d "
1149 "children are allowed.\n",
1150 VDEV_DRAID_MAX_CHILDREN
);
1158 (void) fprintf(stderr
,
1159 "missing argument for '%c' option\n", optopt
);
1163 (void) fprintf(stderr
, "invalid option '%c'\n",
1170 if (argc
> optind
) {
1171 bzero(filename
, MAXPATHLEN
);
1172 strncpy(filename
, argv
[optind
], MAXPATHLEN
- 1);
1174 (void) fprintf(stderr
, "A FILE must be specified.\n");
1179 * Dump maps for the requested child counts.
1181 for (uint64_t children
= min_children
;
1182 children
<= max_children
; children
++) {
1183 char key
[8] = { 0 };
1185 snprintf(key
, 7, "%llu", (u_longlong_t
)children
);
1186 error
= dump_map_key(filename
, key
, verbose
);
1188 printf("Error dump_map_key(): %s\n", strerror(error
));
1197 * Print all of the mappings as a C formatted draid_map_t array. This table
1198 * is found in the module/zcommon/zfs_draid.c file and is the definitive
1199 * source for all mappings used by dRAID. It cannot be updated without
1200 * changing the dRAID on disk format.
1203 draid_table(int argc
, char *argv
[])
1205 char filename
[MAXPATHLEN
];
1208 if (argc
> optind
) {
1209 bzero(filename
, MAXPATHLEN
);
1210 strncpy(filename
, argv
[optind
], MAXPATHLEN
- 1);
1212 (void) fprintf(stderr
, "A FILE must be specified.\n");
1216 printf("static const draid_map_t "
1217 "draid_maps[VDEV_DRAID_MAX_MAPS] = {\n");
1219 for (uint64_t children
= VDEV_DRAID_MIN_CHILDREN
;
1220 children
<= VDEV_DRAID_MAX_CHILDREN
;
1222 uint64_t seed
, checksum
, nperms
, avg_ratio
;
1227 snprintf(key
, 8, "%llu", (u_longlong_t
)children
);
1229 error
= read_map_key(filename
, key
, &cfg
);
1231 printf("Error read_map_key() failed: %s\n",
1236 seed
= fnvlist_lookup_uint64(cfg
, MAP_SEED
);
1237 checksum
= fnvlist_lookup_uint64(cfg
, MAP_CHECKSUM
);
1238 children
= fnvlist_lookup_uint64(cfg
, MAP_CHILDREN
);
1239 nperms
= fnvlist_lookup_uint64(cfg
, MAP_NPERMS
);
1240 avg_ratio
= fnvlist_lookup_uint64(cfg
, MAP_AVG_RATIO
);
1242 printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t"
1243 "/* %2.03f */\n", (u_longlong_t
)children
,
1244 (u_longlong_t
)nperms
, (u_longlong_t
)seed
,
1245 (u_longlong_t
)checksum
, (double)avg_ratio
/ 1000.0);
1256 draid_merge_impl(nvlist_t
*allcfgs
, const char *srcfilename
, int *mergedp
)
1259 nvpair_t
*elem
= NULL
;
1260 int error
, merged
= 0;
1262 error
= read_map(srcfilename
, &srccfgs
);
1266 while ((elem
= nvlist_next_nvpair(srccfgs
, elem
)) != NULL
) {
1267 uint64_t nv_worst_ratio
;
1268 uint64_t allcfg_worst_ratio
;
1269 nvlist_t
*cfg
, *allcfg
;
1272 switch (nvpair_type(elem
)) {
1273 case DATA_TYPE_NVLIST
:
1275 (void) nvpair_value_nvlist(elem
, &cfg
);
1276 key
= nvpair_name(elem
);
1278 nv_worst_ratio
= fnvlist_lookup_uint64(cfg
,
1281 error
= nvlist_lookup_nvlist(allcfgs
, key
, &allcfg
);
1283 allcfg_worst_ratio
= fnvlist_lookup_uint64(
1284 allcfg
, MAP_WORST_RATIO
);
1286 if (nv_worst_ratio
< allcfg_worst_ratio
) {
1287 fnvlist_remove(allcfgs
, key
);
1288 error
= nvlist_add_nvlist(allcfgs
,
1292 } else if (error
== ENOENT
) {
1293 error
= nvlist_add_nvlist(allcfgs
, key
, cfg
);
1305 nvlist_free(srccfgs
);
1313 * Merge the best map for each child count found in the listed files into
1314 * a new file. This allows 'draid generate' to be run in parallel and for
1315 * the resulting maps to be combined.
1318 draid_merge(int argc
, char *argv
[])
1320 char filename
[MAXPATHLEN
];
1321 int c
, error
, total_merged
= 0, verbose
= 0;
1324 while ((c
= getopt(argc
, argv
, ":v")) != -1) {
1330 (void) fprintf(stderr
,
1331 "missing argument for '%c' option\n", optopt
);
1335 (void) fprintf(stderr
, "invalid option '%c'\n",
1343 (void) fprintf(stderr
,
1344 "A FILE and multiple SRCs must be specified.\n");
1348 bzero(filename
, MAXPATHLEN
);
1349 strncpy(filename
, argv
[optind
], MAXPATHLEN
- 1);
1352 error
= read_map(filename
, &allcfgs
);
1353 if (error
== ENOENT
) {
1354 allcfgs
= fnvlist_alloc();
1355 } else if (error
!= 0) {
1356 printf("Error read_map(): %s\n", strerror(error
));
1360 while (optind
< argc
) {
1361 char srcfilename
[MAXPATHLEN
];
1364 bzero(srcfilename
, MAXPATHLEN
);
1365 strncpy(srcfilename
, argv
[optind
], MAXPATHLEN
- 1);
1367 error
= draid_merge_impl(allcfgs
, srcfilename
, &merged
);
1369 printf("Error draid_merge_impl(): %s\n",
1371 nvlist_free(allcfgs
);
1375 total_merged
+= merged
;
1376 printf("Merged %d key(s) from '%s' into '%s'\n", merged
,
1377 srcfilename
, filename
);
1382 if (total_merged
> 0)
1383 write_map(filename
, allcfgs
);
1385 printf("Merged a total of %d key(s) into '%s'\n", total_merged
,
1388 nvlist_free(allcfgs
);
1394 main(int argc
, char *argv
[])
1399 char *subcommand
= argv
[1];
1401 if (strcmp(subcommand
, "generate") == 0) {
1402 return (draid_generate(argc
- 1, argv
+ 1));
1403 } else if (strcmp(subcommand
, "verify") == 0) {
1404 return (draid_verify(argc
- 1, argv
+ 1));
1405 } else if (strcmp(subcommand
, "dump") == 0) {
1406 return (draid_dump(argc
- 1, argv
+ 1));
1407 } else if (strcmp(subcommand
, "table") == 0) {
1408 return (draid_table(argc
- 1, argv
+ 1));
1409 } else if (strcmp(subcommand
, "merge") == 0) {
1410 return (draid_merge(argc
- 1, argv
+ 1));