]> git.proxmox.com Git - mirror_zfs.git/blame - tests/zfs-tests/cmd/draid.c
Fix userspace memory leaks found by Clang Static Analzyer
[mirror_zfs.git] / tests / zfs-tests / cmd / draid.c
CommitLineData
b2255edc
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
1d3ba0bf 9 * or https://opensource.org/licenses/CDDL-1.0.
b2255edc
BB
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2018 Intel Corporation.
23 * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
24 */
25
26#include <stdio.h>
27#include <zlib.h>
28#include <zfs_fletcher.h>
29#include <sys/vdev_draid.h>
30#include <sys/nvpair.h>
31#include <sys/stat.h>
32
33/*
34 * The number of rows to generate for new permutation maps.
35 */
36#define MAP_ROWS_DEFAULT 256
37
38/*
39 * Key values for dRAID maps when stored as nvlists.
40 */
41#define MAP_SEED "seed"
42#define MAP_CHECKSUM "checksum"
43#define MAP_WORST_RATIO "worst_ratio"
44#define MAP_AVG_RATIO "avg_ratio"
45#define MAP_CHILDREN "children"
46#define MAP_NPERMS "nperms"
47#define MAP_PERMS "perms"
48
/*
 * Print the list of supported sub-commands to stderr and terminate the
 * process with exit status 1. This function does not return.
 */
static void
draid_usage(void)
{
	static const char usage_text[] =
	    "usage: draid command args ...\n"
	    "Available commands are:\n"
	    "\n"
	    "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n"
	    "\tdraid verify [-rv] FILE\n"
	    "\tdraid dump [-v] [-m min] [-n max] FILE\n"
	    "\tdraid table FILE\n"
	    "\tdraid merge FILE SRC SRC...\n";

	(void) fprintf(stderr, "%s", usage_text);
	exit(1);
}
63
64static int
65read_map(const char *filename, nvlist_t **allcfgs)
66{
67 int block_size = 131072;
68 int buf_size = 131072;
69 int tmp_size, error;
70 char *tmp_buf;
71
72 struct stat64 stat;
73 if (lstat64(filename, &stat) != 0)
74 return (errno);
75
76 if (stat.st_size == 0 ||
77 !(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) {
78 return (EINVAL);
79 }
80
81 gzFile fp = gzopen(filename, "rb");
82 if (fp == Z_NULL)
83 return (errno);
84
85 char *buf = malloc(buf_size);
86 if (buf == NULL) {
87 (void) gzclose(fp);
88 return (ENOMEM);
89 }
90
91 ssize_t rc, bytes = 0;
92 while (!gzeof(fp)) {
93 rc = gzread(fp, buf + bytes, block_size);
94 if ((rc < 0) || (rc == 0 && !gzeof(fp))) {
95 free(buf);
96 (void) gzclose(fp);
97 (void) gzerror(fp, &error);
98 return (error);
99 } else {
100 bytes += rc;
101
102 if (bytes + block_size >= buf_size) {
103 tmp_size = 2 * buf_size;
104 tmp_buf = malloc(tmp_size);
105 if (tmp_buf == NULL) {
106 free(buf);
107 (void) gzclose(fp);
108 return (ENOMEM);
109 }
110
111 memcpy(tmp_buf, buf, bytes);
112 free(buf);
113 buf = tmp_buf;
114 buf_size = tmp_size;
115 }
116 }
117 }
118
119 (void) gzclose(fp);
120
121 error = nvlist_unpack(buf, bytes, allcfgs, 0);
122 free(buf);
123
124 return (error);
125}
126
127/*
128 * Read a map from the specified filename. A file contains multiple maps
129 * which are indexed by the number of children. The caller is responsible
130 * for freeing the configuration returned.
131 */
132static int
a926aab9 133read_map_key(const char *filename, const char *key, nvlist_t **cfg)
b2255edc
BB
134{
135 nvlist_t *allcfgs, *foundcfg = NULL;
136 int error;
137
138 error = read_map(filename, &allcfgs);
139 if (error != 0)
140 return (error);
141
2a493a4c 142 (void) nvlist_lookup_nvlist(allcfgs, key, &foundcfg);
b2255edc
BB
143 if (foundcfg != NULL) {
144 nvlist_dup(foundcfg, cfg, KM_SLEEP);
145 error = 0;
146 } else {
147 error = ENOENT;
148 }
149
150 nvlist_free(allcfgs);
151
152 return (error);
153}
154
155/*
156 * Write all mappings to the map file.
157 */
158static int
159write_map(const char *filename, nvlist_t *allcfgs)
160{
161 size_t buflen = 0;
162 int error;
163
164 error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR);
165 if (error)
166 return (error);
167
168 char *buf = malloc(buflen);
169 if (buf == NULL)
170 return (ENOMEM);
171
172 error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
173 if (error) {
174 free(buf);
175 return (error);
176 }
177
178 /*
179 * Atomically update the file using a temporary file and the
180 * traditional unlink then rename steps. This code provides
181 * no locking, it only guarantees the packed nvlist on disk
182 * is updated atomically and is internally consistent.
183 */
955bf4dc 184 char *tmpname = calloc(1, MAXPATHLEN);
b2255edc
BB
185 if (tmpname == NULL) {
186 free(buf);
187 return (ENOMEM);
188 }
189
190 snprintf(tmpname, MAXPATHLEN - 1, "%s.XXXXXX", filename);
191
192 int fd = mkstemp(tmpname);
193 if (fd < 0) {
194 error = errno;
195 free(buf);
196 free(tmpname);
197 return (error);
198 }
199 (void) close(fd);
200
201 gzFile fp = gzopen(tmpname, "w9b");
202 if (fp == Z_NULL) {
203 error = errno;
204 free(buf);
205 free(tmpname);
206 return (errno);
207 }
208
209 ssize_t rc, bytes = 0;
210 while (bytes < buflen) {
211 size_t size = MIN(buflen - bytes, 131072);
212 rc = gzwrite(fp, buf + bytes, size);
213 if (rc < 0) {
214 free(buf);
215 (void) gzerror(fp, &error);
216 (void) gzclose(fp);
217 (void) unlink(tmpname);
218 free(tmpname);
219 return (error);
220 } else if (rc == 0) {
221 break;
222 } else {
223 bytes += rc;
224 }
225 }
226
227 free(buf);
228 (void) gzclose(fp);
229
230 if (bytes != buflen) {
231 (void) unlink(tmpname);
232 free(tmpname);
233 return (EIO);
234 }
235
236 /*
237 * Unlink the previous config file and replace it with the updated
238 * version. If we're able to unlink the file then directory is
239 * writable by us and the subsequent rename should never fail.
240 */
241 error = unlink(filename);
242 if (error != 0 && errno != ENOENT) {
243 error = errno;
244 (void) unlink(tmpname);
245 free(tmpname);
246 return (error);
247 }
248
249 error = rename(tmpname, filename);
250 if (error != 0) {
251 error = errno;
252 (void) unlink(tmpname);
253 free(tmpname);
254 return (error);
255 }
256
257 free(tmpname);
258
259 return (0);
260}
261
262/*
263 * Add the dRAID map to the file and write it out.
264 */
265static int
266write_map_key(const char *filename, char *key, draid_map_t *map,
267 double worst_ratio, double avg_ratio)
268{
269 nvlist_t *nv_cfg, *allcfgs;
270 int error;
271
272 /*
273 * Add the configuration to an existing or new file. The new
274 * configuration will replace an existing configuration with the
275 * same key if it has a lower ratio and is therefore better.
276 */
277 error = read_map(filename, &allcfgs);
278 if (error == ENOENT) {
279 allcfgs = fnvlist_alloc();
280 } else if (error != 0) {
281 return (error);
282 }
283
284 error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg);
285 if (error == 0) {
286 uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg,
287 MAP_WORST_RATIO);
288 double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0;
289
290 if (worst_ratio < nv_worst_ratio) {
291 /* Replace old map with the more balanced new map. */
292 fnvlist_remove(allcfgs, key);
293 } else {
294 /* The old map is preferable, keep it. */
295 nvlist_free(allcfgs);
296 return (EEXIST);
297 }
298 }
299
300 nvlist_t *cfg = fnvlist_alloc();
301 fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed);
302 fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum);
303 fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children);
304 fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms);
305 fnvlist_add_uint8_array(cfg, MAP_PERMS, map->dm_perms,
306 map->dm_children * map->dm_nperms * sizeof (uint8_t));
307
308 fnvlist_add_uint64(cfg, MAP_WORST_RATIO,
309 (uint64_t)(worst_ratio * 1000.0));
310 fnvlist_add_uint64(cfg, MAP_AVG_RATIO,
311 (uint64_t)(avg_ratio * 1000.0));
312
313 error = nvlist_add_nvlist(allcfgs, key, cfg);
314 if (error == 0)
315 error = write_map(filename, allcfgs);
316
317 nvlist_free(cfg);
318 nvlist_free(allcfgs);
319 return (error);
320}
321
322static void
a926aab9
AZ
323dump_map(draid_map_t *map, const char *key, double worst_ratio,
324 double avg_ratio, int verbose)
b2255edc
BB
325{
326 if (verbose == 0) {
327 return;
328 } else if (verbose == 1) {
329 printf(" \"%s\": seed: 0x%016llx worst_ratio: %2.03f "
330 "avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed,
331 worst_ratio, avg_ratio);
332 return;
333 } else {
334 printf(" \"%s\":\n"
335 " seed: 0x%016llx\n"
336 " checksum: 0x%016llx\n"
337 " worst_ratio: %2.03f\n"
338 " avg_ratio: %2.03f\n"
339 " children: %llu\n"
340 " nperms: %llu\n",
341 key, (u_longlong_t)map->dm_seed,
342 (u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio,
343 (u_longlong_t)map->dm_children,
344 (u_longlong_t)map->dm_nperms);
345
346 if (verbose > 2) {
347 printf(" perms = {\n");
348 for (int i = 0; i < map->dm_nperms; i++) {
349 printf(" { ");
350 for (int j = 0; j < map->dm_children; j++) {
351 printf("%3d%s ", map->dm_perms[
352 i * map->dm_children + j],
353 j < map->dm_children - 1 ?
354 "," : "");
355 }
356 printf(" },\n");
357 }
358 printf(" }\n");
359 } else if (verbose == 2) {
360 printf(" draid_perms = <omitted>\n");
361 }
362 }
363}
364
365static void
a926aab9 366dump_map_nv(const char *key, nvlist_t *cfg, int verbose)
b2255edc
BB
367{
368 draid_map_t map;
369 uint_t c;
370
371 uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO);
372 uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
373
374 map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
375 map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
376 map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
377 map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
2a493a4c 378 map.dm_perms = fnvlist_lookup_uint8_array(cfg, MAP_PERMS, &c);
b2255edc
BB
379
380 dump_map(&map, key, (double)worst_ratio / 1000.0,
381 avg_ratio / 1000.0, verbose);
382}
383
384/*
385 * Print a summary of the mapping.
386 */
387static int
a926aab9 388dump_map_key(const char *filename, const char *key, int verbose)
b2255edc
BB
389{
390 nvlist_t *cfg;
391 int error;
392
393 error = read_map_key(filename, key, &cfg);
394 if (error != 0)
395 return (error);
396
397 dump_map_nv(key, cfg, verbose);
398
399 return (0);
400}
401
402/*
403 * Allocate a new permutation map for evaluation.
404 */
405static int
406alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed,
407 draid_map_t **mapp)
408{
409 draid_map_t *map;
410 int error;
411
412 map = malloc(sizeof (draid_map_t));
413 if (map == NULL)
414 return (ENOMEM);
415
416 map->dm_children = children;
417 map->dm_nperms = nperms;
418 map->dm_seed = seed;
419 map->dm_checksum = 0;
420
421 error = vdev_draid_generate_perms(map, &map->dm_perms);
422 if (error) {
423 free(map);
424 return (error);
425 }
426
427 *mapp = map;
428
429 return (0);
430}
431
432/*
433 * Allocate the fixed permutation map for N children.
434 */
435static int
436alloc_fixed_map(uint64_t children, draid_map_t **mapp)
437{
438 const draid_map_t *fixed_map;
439 draid_map_t *map;
440 int error;
441
442 error = vdev_draid_lookup_map(children, &fixed_map);
443 if (error)
444 return (error);
445
446 map = malloc(sizeof (draid_map_t));
447 if (map == NULL)
448 return (ENOMEM);
449
450 memcpy(map, fixed_map, sizeof (draid_map_t));
451 VERIFY3U(map->dm_checksum, !=, 0);
452
453 error = vdev_draid_generate_perms(map, &map->dm_perms);
454 if (error) {
455 free(map);
456 return (error);
457 }
458
459 *mapp = map;
460
461 return (0);
462}
463
464/*
465 * Free a permutation map.
466 */
467static void
468free_map(draid_map_t *map)
469{
470 free(map->dm_perms);
471 free(map);
472}
473
474/*
475 * Check if dev is in the provided list of faulted devices.
476 */
477static inline boolean_t
478is_faulted(int *faulted_devs, int nfaulted, int dev)
479{
480 for (int i = 0; i < nfaulted; i++)
481 if (faulted_devs[i] == dev)
482 return (B_TRUE);
483
484 return (B_FALSE);
485}
486
487/*
488 * Evaluate how resilvering I/O will be distributed given a list of faulted
489 * vdevs. As a simplification we assume one IO is sufficient to repair each
490 * damaged device in a group.
491 */
492static double
493eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares,
494 int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios)
495{
496 uint64_t children = map->dm_children;
497 uint64_t ngroups = 1;
498 uint64_t ndisks = children - nspares;
499
500 /*
501 * Calculate the minimum number of groups required to fill a slice.
502 */
503 while (ngroups * (groupwidth) % (children - nspares) != 0)
504 ngroups++;
505
506 int *ios = calloc(map->dm_children, sizeof (uint64_t));
507
508 /* Resilver all rows */
509 for (int i = 0; i < map->dm_nperms; i++) {
510 uint8_t *row = &map->dm_perms[i * map->dm_children];
511
512 /* Resilver all groups with faulted drives */
513 for (int j = 0; j < ngroups; j++) {
514 uint64_t spareidx = map->dm_children - nspares;
515 boolean_t repair_needed = B_FALSE;
516
517 /* See if any devices in this group are faulted */
518 uint64_t groupstart = (j * groupwidth) % ndisks;
519
520 for (int k = 0; k < groupwidth; k++) {
521 uint64_t groupidx = (groupstart + k) % ndisks;
522
523 repair_needed = is_faulted(faulted_devs,
524 nfaulted, row[groupidx]);
525 if (repair_needed)
526 break;
527 }
528
529 if (repair_needed == B_FALSE)
530 continue;
531
532 /*
533 * This group is degraded. Calculate the number of
534 * reads the non-faulted drives require and the number
535 * of writes to the distributed hot spare for this row.
536 */
537 for (int k = 0; k < groupwidth; k++) {
538 uint64_t groupidx = (groupstart + k) % ndisks;
539
540 if (!is_faulted(faulted_devs, nfaulted,
541 row[groupidx])) {
542 ios[row[groupidx]]++;
543 } else if (nspares > 0) {
544 while (is_faulted(faulted_devs,
545 nfaulted, row[spareidx])) {
546 spareidx++;
547 }
548
549 ASSERT3U(spareidx, <, map->dm_children);
550 ios[row[spareidx]]++;
551 spareidx++;
552 }
553 }
554 }
555 }
556
557 *min_child_ios = INT_MAX;
558 *max_child_ios = 0;
559
560 /*
561 * Find the drives with fewest and most required I/O. These values
562 * are used to calculate the imbalance ratio. To avoid returning an
563 * infinite value for permutations which have children that perform
564 * no IO a floor of 1 IO per child is set. This ensures a meaningful
565 * ratio is returned for comparison and it is not an uncommon when
566 * there are a large number of children.
567 */
568 for (int i = 0; i < map->dm_children; i++) {
569
570 if (is_faulted(faulted_devs, nfaulted, i)) {
571 ASSERT0(ios[i]);
572 continue;
573 }
574
575 if (ios[i] == 0)
576 ios[i] = 1;
577
578 if (ios[i] < *min_child_ios)
579 *min_child_ios = ios[i];
580
581 if (ios[i] > *max_child_ios)
582 *max_child_ios = ios[i];
583 }
584
585 ASSERT3S(*min_child_ios, !=, INT_MAX);
586 ASSERT3S(*max_child_ios, !=, 0);
587
588 double ratio = (double)(*max_child_ios) / (double)(*min_child_ios);
589
590 free(ios);
591
592 return (ratio);
593}
594
595/*
596 * Evaluate the quality of the permutation mapping by considering possible
597 * device failures. Returns the imbalance ratio for the worst mapping which
598 * is defined to be the largest number of child IOs over the fewest number
599 * child IOs. A value of 1.0 indicates the mapping is perfectly balance and
600 * all children perform an equal amount of work during reconstruction.
601 */
602static void
603eval_decluster(draid_map_t *map, double *worst_ratiop, double *avg_ratiop)
604{
605 uint64_t children = map->dm_children;
606 double worst_ratio = 1.0;
607 double sum = 0;
608 int worst_min_ios = 0, worst_max_ios = 0;
609 int n = 0;
610
611 /*
612 * When there are only 2 children there can be no distributed
613 * spare and no resilver to evaluate. Default to a ratio of 1.0
614 * for this degenerate case.
615 */
616 if (children == VDEV_DRAID_MIN_CHILDREN) {
617 *worst_ratiop = 1.0;
618 *avg_ratiop = 1.0;
619 return;
620 }
621
622 /*
623 * Score the mapping as if it had either 1 or 2 distributed spares.
624 */
625 for (int nspares = 1; nspares <= 2; nspares++) {
626 uint64_t faults = nspares;
627
628 /*
bf169e9f 629 * Score groupwidths up to 19. This value was chosen as the
b2255edc
BB
630 * largest reasonable width (16d+3p). dRAID pools may be still
631 * be created with wider stripes but they are not considered in
632 * this analysis in order to optimize for the most common cases.
633 */
634 for (uint64_t groupwidth = 2;
635 groupwidth <= MIN(children - nspares, 19);
636 groupwidth++) {
637 int faulted_devs[2];
638 int min_ios, max_ios;
639
640 /*
641 * Score possible devices faults. This is limited
642 * to exactly one fault per distributed spare for
643 * the purposes of this similation.
644 */
645 for (int f1 = 0; f1 < children; f1++) {
646 faulted_devs[0] = f1;
647 double ratio;
648
649 if (faults == 1) {
650 ratio = eval_resilver(map, groupwidth,
651 nspares, faulted_devs, faults,
652 &min_ios, &max_ios);
653
654 if (ratio > worst_ratio) {
655 worst_ratio = ratio;
656 worst_min_ios = min_ios;
657 worst_max_ios = max_ios;
658 }
659
660 sum += ratio;
661 n++;
662 } else if (faults == 2) {
663 for (int f2 = f1 + 1; f2 < children;
664 f2++) {
665 faulted_devs[1] = f2;
666
667 ratio = eval_resilver(map,
668 groupwidth, nspares,
669 faulted_devs, faults,
670 &min_ios, &max_ios);
671
672 if (ratio > worst_ratio) {
673 worst_ratio = ratio;
674 worst_min_ios = min_ios;
675 worst_max_ios = max_ios;
676 }
677
678 sum += ratio;
679 n++;
680 }
681 }
682 }
683 }
684 }
685
686 *worst_ratiop = worst_ratio;
687 *avg_ratiop = sum / n;
688
689 /*
690 * Log the min/max io values for particularly unbalanced maps.
691 * Since the maps are generated entirely randomly these are possible
692 * be exceedingly unlikely. We log it for possible investigation.
693 */
694 if (worst_ratio > 100.0) {
695 dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2);
696 printf("worst_min_ios=%d worst_max_ios=%d\n",
697 worst_min_ios, worst_max_ios);
698 }
699}
700
701static int
702eval_maps(uint64_t children, int passes, uint64_t *map_seed,
703 draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop)
704{
705 draid_map_t *best_map = NULL;
706 double best_worst_ratio = 1000.0;
707 double best_avg_ratio = 1000.0;
708
709 /*
710 * Perform the requested number of passes evaluating randomly
711 * generated permutation maps. Only the best version is kept.
712 */
713 for (int i = 0; i < passes; i++) {
714 double worst_ratio, avg_ratio;
715 draid_map_t *map;
716 int error;
717
718 /*
719 * Calculate the next seed and generate a new candidate map.
720 */
721 error = alloc_new_map(children, MAP_ROWS_DEFAULT,
722 vdev_draid_rand(map_seed), &map);
f7bda2de
RY
723 if (error) {
724 if (best_map != NULL)
725 free_map(best_map);
b2255edc 726 return (error);
f7bda2de 727 }
b2255edc
BB
728
729 /*
730 * Consider maps with a lower worst_ratio to be of higher
731 * quality. Some maps may have a lower avg_ratio but they
732 * are discarded since they might include some particularly
bf169e9f 733 * imbalanced permutations. The average is tracked to in
b2255edc
BB
734 * order to get a sense of the average permutation quality.
735 */
736 eval_decluster(map, &worst_ratio, &avg_ratio);
737
738 if (best_map == NULL || worst_ratio < best_worst_ratio) {
739
740 if (best_map != NULL)
741 free_map(best_map);
742
743 best_map = map;
744 best_worst_ratio = worst_ratio;
745 best_avg_ratio = avg_ratio;
746 } else {
747 free_map(map);
748 }
749 }
750
751 /*
752 * After determining the best map generate a checksum over the full
753 * permutation array. This checksum is verified when opening a dRAID
754 * pool to ensure the generated in memory permutations are correct.
755 */
756 zio_cksum_t cksum;
757 fletcher_4_native_varsize(best_map->dm_perms,
758 sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms,
759 &cksum);
760 best_map->dm_checksum = cksum.zc_word[0];
761
762 *best_mapp = best_map;
763 *best_ratiop = best_worst_ratio;
764 *avg_ratiop = best_avg_ratio;
765
766 return (0);
767}
768
769static int
770draid_generate(int argc, char *argv[])
771{
861166b0 772 char filename[MAXPATHLEN] = {0};
b2255edc
BB
773 uint64_t map_seed;
774 int c, fd, error, verbose = 0, passes = 1, continuous = 0;
775 int min_children = VDEV_DRAID_MIN_CHILDREN;
776 int max_children = VDEV_DRAID_MAX_CHILDREN;
777 int restarts = 0;
778
779 while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) {
780 switch (c) {
781 case 'c':
782 continuous++;
783 break;
784 case 'm':
785 min_children = (int)strtol(optarg, NULL, 0);
786 if (min_children < VDEV_DRAID_MIN_CHILDREN) {
787 (void) fprintf(stderr, "A minimum of 2 "
788 "children are required.\n");
789 return (1);
790 }
791
792 break;
793 case 'n':
794 max_children = (int)strtol(optarg, NULL, 0);
795 if (max_children > VDEV_DRAID_MAX_CHILDREN) {
796 (void) fprintf(stderr, "A maximum of %d "
797 "children are allowed.\n",
798 VDEV_DRAID_MAX_CHILDREN);
799 return (1);
800 }
801 break;
802 case 'p':
803 passes = (int)strtol(optarg, NULL, 0);
804 break;
805 case 'v':
806 /*
807 * 0 - Only log when a better map is added to the file.
808 * 1 - Log the current best map for each child count.
809 * Minimal output on a single summary line.
810 * 2 - Log the current best map for each child count.
811 * More verbose includes most map fields.
812 * 3 - Log the current best map for each child count.
813 * Very verbose all fields including the full map.
814 */
815 verbose++;
816 break;
817 case ':':
818 (void) fprintf(stderr,
819 "missing argument for '%c' option\n", optopt);
820 draid_usage();
821 break;
822 case '?':
823 (void) fprintf(stderr, "invalid option '%c'\n",
824 optopt);
825 draid_usage();
826 break;
827 }
828 }
829
861166b0 830 if (argc > optind)
b2255edc 831 strncpy(filename, argv[optind], MAXPATHLEN - 1);
861166b0 832 else {
b2255edc
BB
833 (void) fprintf(stderr, "A FILE must be specified.\n");
834 return (1);
835 }
836
837restart:
838 /*
839 * Start with a fresh seed from /dev/urandom.
840 */
841 fd = open("/dev/urandom", O_RDONLY);
842 if (fd < 0) {
843 printf("Unable to open /dev/urandom: %s\n:", strerror(errno));
844 return (1);
845 } else {
846 ssize_t bytes = sizeof (map_seed);
847 ssize_t bytes_read = 0;
848
849 while (bytes_read < bytes) {
850 ssize_t rc = read(fd, ((char *)&map_seed) + bytes_read,
851 bytes - bytes_read);
852 if (rc < 0) {
853 printf("Unable to read /dev/urandom: %s\n:",
854 strerror(errno));
ebe1d036 855 close(fd);
b2255edc
BB
856 return (1);
857 }
858 bytes_read += rc;
859 }
860
861 (void) close(fd);
862 }
863
864 if (restarts == 0)
865 printf("Writing generated mappings to '%s':\n", filename);
866
867 /*
868 * Generate maps for all requested child counts. The best map for
869 * each child count is written out to the specified file. If the file
870 * already contains a better mapping this map will not be added.
871 */
872 for (uint64_t children = min_children;
873 children <= max_children; children++) {
874 char key[8] = { 0 };
875 draid_map_t *map;
876 double worst_ratio = 1000.0;
877 double avg_ratio = 1000.0;
878
879 error = eval_maps(children, passes, &map_seed, &map,
880 &worst_ratio, &avg_ratio);
881 if (error) {
882 printf("Error eval_maps(): %s\n", strerror(error));
883 return (1);
884 }
885
886 if (worst_ratio < 1.0 || avg_ratio < 1.0) {
887 printf("Error ratio < 1.0: worst_ratio = %2.03f "
888 "avg_ratio = %2.03f\n", worst_ratio, avg_ratio);
889 return (1);
890 }
891
892 snprintf(key, 7, "%llu", (u_longlong_t)children);
893 error = write_map_key(filename, key, map, worst_ratio,
894 avg_ratio);
895 if (error == 0) {
896 /* The new map was added to the file. */
897 dump_map(map, key, worst_ratio, avg_ratio,
898 MAX(verbose, 1));
899 } else if (error == EEXIST) {
900 /* The existing map was preferable and kept. */
901 if (verbose > 0)
902 dump_map_key(filename, key, verbose);
903 } else {
904 printf("Error write_map_key(): %s\n", strerror(error));
905 return (1);
906 }
907
908 free_map(map);
909 }
910
911 /*
912 * When the continuous option is set restart at the minimum number of
913 * children instead of exiting. This option is useful as a mechanism
914 * to continuous try and refine the discovered permutations.
915 */
916 if (continuous) {
917 restarts++;
918 printf("Restarting by request (-c): %d\n", restarts);
919 goto restart;
920 }
921
922 return (0);
923}
924
925/*
926 * Verify each map in the file by generating its in-memory permutation array
927 * and comfirming its checksum is correct.
928 */
929static int
930draid_verify(int argc, char *argv[])
931{
861166b0 932 char filename[MAXPATHLEN] = {0};
b2255edc
BB
933 int n = 0, c, error, verbose = 1;
934 int check_ratios = 0;
935
936 while ((c = getopt(argc, argv, ":rv")) != -1) {
937 switch (c) {
938 case 'r':
939 check_ratios++;
940 break;
941 case 'v':
942 verbose++;
943 break;
944 case ':':
945 (void) fprintf(stderr,
946 "missing argument for '%c' option\n", optopt);
947 draid_usage();
948 break;
949 case '?':
950 (void) fprintf(stderr, "invalid option '%c'\n",
951 optopt);
952 draid_usage();
953 break;
954 }
955 }
956
957 if (argc > optind) {
958 char *abspath = malloc(MAXPATHLEN);
959 if (abspath == NULL)
960 return (ENOMEM);
961
b2255edc
BB
962 if (realpath(argv[optind], abspath) != NULL)
963 strncpy(filename, abspath, MAXPATHLEN - 1);
964 else
965 strncpy(filename, argv[optind], MAXPATHLEN - 1);
966
967 free(abspath);
968 } else {
969 (void) fprintf(stderr, "A FILE must be specified.\n");
970 return (1);
971 }
972
973 printf("Verifying permutation maps: '%s'\n", filename);
974
975 /*
976 * Lookup hardcoded permutation map for each valid number of children
977 * and verify a generated map has the correct checksum. Then compare
978 * the generated map values with the nvlist map values read from the
979 * reference file to cross-check the permutation.
980 */
981 for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
982 children <= VDEV_DRAID_MAX_CHILDREN;
983 children++) {
984 draid_map_t *map;
861166b0 985 char key[8] = {0};
b2255edc 986
b2255edc
BB
987 snprintf(key, 8, "%llu", (u_longlong_t)children);
988
989 error = alloc_fixed_map(children, &map);
990 if (error) {
991 printf("Error alloc_fixed_map() failed: %s\n",
992 error == ECKSUM ? "Invalid checksum" :
993 strerror(error));
994 return (1);
995 }
996
997 uint64_t nv_seed, nv_checksum, nv_children, nv_nperms;
998 uint8_t *nv_perms;
999 nvlist_t *cfg;
1000 uint_t c;
1001
1002 error = read_map_key(filename, key, &cfg);
1003 if (error != 0) {
1004 printf("Error read_map_key() failed: %s\n",
1005 strerror(error));
1006 free_map(map);
1007 return (1);
1008 }
1009
1010 nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
1011 nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
1012 nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
1013 nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
1014 nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms, &c);
1015
1016 /*
1017 * Compare draid_map_t and nvlist reference values.
1018 */
1019 if (map->dm_seed != nv_seed) {
1020 printf("Error different seeds: 0x%016llx != "
1021 "0x%016llx\n", (u_longlong_t)map->dm_seed,
1022 (u_longlong_t)nv_seed);
1023 error = EINVAL;
1024 }
1025
1026 if (map->dm_checksum != nv_checksum) {
1027 printf("Error different checksums: 0x%016llx "
1028 "!= 0x%016llx\n",
1029 (u_longlong_t)map->dm_checksum,
1030 (u_longlong_t)nv_checksum);
1031 error = EINVAL;
1032 }
1033
1034 if (map->dm_children != nv_children) {
1035 printf("Error different children: %llu "
1036 "!= %llu\n", (u_longlong_t)map->dm_children,
1037 (u_longlong_t)nv_children);
1038 error = EINVAL;
1039 }
1040
1041 if (map->dm_nperms != nv_nperms) {
1042 printf("Error different nperms: %llu "
1043 "!= %llu\n", (u_longlong_t)map->dm_nperms,
1044 (u_longlong_t)nv_nperms);
1045 error = EINVAL;
1046 }
1047
1048 for (uint64_t i = 0; i < nv_children * nv_nperms; i++) {
1049 if (map->dm_perms[i] != nv_perms[i]) {
1050 printf("Error different perms[%llu]: "
1051 "%d != %d\n", (u_longlong_t)i,
1052 (int)map->dm_perms[i],
1053 (int)nv_perms[i]);
1054 error = EINVAL;
1055 break;
1056 }
1057 }
1058
1059 /*
1060 * For good measure recalculate the worst and average
1061 * ratios and confirm they match the nvlist values.
1062 */
1063 if (check_ratios) {
1064 uint64_t nv_worst_ratio, nv_avg_ratio;
1065 double worst_ratio, avg_ratio;
1066
1067 eval_decluster(map, &worst_ratio, &avg_ratio);
1068
1069 nv_worst_ratio = fnvlist_lookup_uint64(cfg,
1070 MAP_WORST_RATIO);
1071 nv_avg_ratio = fnvlist_lookup_uint64(cfg,
1072 MAP_AVG_RATIO);
1073
1074 if (worst_ratio < 1.0 || avg_ratio < 1.0) {
1075 printf("Error ratio out of range %2.03f, "
1076 "%2.03f\n", worst_ratio, avg_ratio);
1077 error = EINVAL;
1078 }
1079
1080 if ((uint64_t)(worst_ratio * 1000.0) !=
1081 nv_worst_ratio) {
1082 printf("Error different worst_ratio %2.03f "
1083 "!= %2.03f\n", (double)nv_worst_ratio /
1084 1000.0, worst_ratio);
1085 error = EINVAL;
1086 }
1087
1088 if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) {
1089 printf("Error different average_ratio %2.03f "
1090 "!= %2.03f\n", (double)nv_avg_ratio /
1091 1000.0, avg_ratio);
1092 error = EINVAL;
1093 }
1094 }
1095
1096 if (error) {
1097 free_map(map);
1098 nvlist_free(cfg);
1099 return (1);
1100 }
1101
1102 if (verbose > 0) {
1103 printf("- %llu children: good\n",
1104 (u_longlong_t)children);
1105 }
1106 n++;
1107
1108 free_map(map);
1109 nvlist_free(cfg);
1110 }
1111
1112 if (n != (VDEV_DRAID_MAX_CHILDREN - 1)) {
1113 printf("Error permutation maps missing: %d / %d checked\n",
1114 n, VDEV_DRAID_MAX_CHILDREN - 1);
1115 return (1);
1116 }
1117
1118 printf("Successfully verified %d / %d permutation maps\n",
1119 n, VDEV_DRAID_MAX_CHILDREN - 1);
1120
1121 return (0);
1122}
1123
1124/*
1125 * Dump the contents of the specified mapping(s) for inspection.
1126 */
1127static int
1128draid_dump(int argc, char *argv[])
1129{
861166b0 1130 char filename[MAXPATHLEN] = {0};
b2255edc
BB
1131 int c, error, verbose = 1;
1132 int min_children = VDEV_DRAID_MIN_CHILDREN;
1133 int max_children = VDEV_DRAID_MAX_CHILDREN;
1134
1135 while ((c = getopt(argc, argv, ":vm:n:")) != -1) {
1136 switch (c) {
1137 case 'm':
1138 min_children = (int)strtol(optarg, NULL, 0);
1139 if (min_children < 2) {
1140 (void) fprintf(stderr, "A minimum of 2 "
1141 "children are required.\n");
1142 return (1);
1143 }
1144
1145 break;
1146 case 'n':
1147 max_children = (int)strtol(optarg, NULL, 0);
1148 if (max_children > VDEV_DRAID_MAX_CHILDREN) {
1149 (void) fprintf(stderr, "A maximum of %d "
1150 "children are allowed.\n",
1151 VDEV_DRAID_MAX_CHILDREN);
1152 return (1);
1153 }
1154 break;
1155 case 'v':
1156 verbose++;
1157 break;
1158 case ':':
1159 (void) fprintf(stderr,
1160 "missing argument for '%c' option\n", optopt);
1161 draid_usage();
1162 break;
1163 case '?':
1164 (void) fprintf(stderr, "invalid option '%c'\n",
1165 optopt);
1166 draid_usage();
1167 break;
1168 }
1169 }
1170
861166b0 1171 if (argc > optind)
b2255edc 1172 strncpy(filename, argv[optind], MAXPATHLEN - 1);
861166b0 1173 else {
b2255edc
BB
1174 (void) fprintf(stderr, "A FILE must be specified.\n");
1175 return (1);
1176 }
1177
1178 /*
1179 * Dump maps for the requested child counts.
1180 */
1181 for (uint64_t children = min_children;
1182 children <= max_children; children++) {
1183 char key[8] = { 0 };
1184
1185 snprintf(key, 7, "%llu", (u_longlong_t)children);
1186 error = dump_map_key(filename, key, verbose);
1187 if (error) {
1188 printf("Error dump_map_key(): %s\n", strerror(error));
1189 return (1);
1190 }
1191 }
1192
1193 return (0);
1194}
1195
1196/*
bf169e9f
AG
1197 * Print all of the mappings as a C formatted draid_map_t array. This table
1198 * is found in the module/zcommon/zfs_draid.c file and is the definitive
b2255edc
BB
1199 * source for all mapping used by dRAID. It cannot be updated without
1200 * changing the dRAID on disk format.
1201 */
1202static int
1203draid_table(int argc, char *argv[])
1204{
861166b0 1205 char filename[MAXPATHLEN] = {0};
b2255edc
BB
1206 int error;
1207
861166b0 1208 if (argc > optind)
b2255edc 1209 strncpy(filename, argv[optind], MAXPATHLEN - 1);
861166b0 1210 else {
b2255edc
BB
1211 (void) fprintf(stderr, "A FILE must be specified.\n");
1212 return (1);
1213 }
1214
1215 printf("static const draid_map_t "
1216 "draid_maps[VDEV_DRAID_MAX_MAPS] = {\n");
1217
1218 for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
1219 children <= VDEV_DRAID_MAX_CHILDREN;
1220 children++) {
1221 uint64_t seed, checksum, nperms, avg_ratio;
1222 nvlist_t *cfg;
861166b0 1223 char key[8] = {0};
b2255edc 1224
b2255edc
BB
1225 snprintf(key, 8, "%llu", (u_longlong_t)children);
1226
1227 error = read_map_key(filename, key, &cfg);
1228 if (error != 0) {
1229 printf("Error read_map_key() failed: %s\n",
1230 strerror(error));
1231 return (1);
1232 }
1233
1234 seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
1235 checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
1236 children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
1237 nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
1238 avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
1239
1240 printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t"
1241 "/* %2.03f */\n", (u_longlong_t)children,
1242 (u_longlong_t)nperms, (u_longlong_t)seed,
1243 (u_longlong_t)checksum, (double)avg_ratio / 1000.0);
1244
1245 nvlist_free(cfg);
1246 }
1247
1248 printf("};\n");
1249
1250 return (0);
1251}
1252
1253static int
1254draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp)
1255{
1256 nvlist_t *srccfgs;
1257 nvpair_t *elem = NULL;
1258 int error, merged = 0;
1259
1260 error = read_map(srcfilename, &srccfgs);
1261 if (error != 0)
1262 return (error);
1263
1264 while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) {
1265 uint64_t nv_worst_ratio;
1266 uint64_t allcfg_worst_ratio;
1267 nvlist_t *cfg, *allcfg;
1268 char *key;
1269
1270 switch (nvpair_type(elem)) {
1271 case DATA_TYPE_NVLIST:
1272
1273 (void) nvpair_value_nvlist(elem, &cfg);
1274 key = nvpair_name(elem);
1275
1276 nv_worst_ratio = fnvlist_lookup_uint64(cfg,
1277 MAP_WORST_RATIO);
1278
1279 error = nvlist_lookup_nvlist(allcfgs, key, &allcfg);
1280 if (error == 0) {
1281 allcfg_worst_ratio = fnvlist_lookup_uint64(
1282 allcfg, MAP_WORST_RATIO);
1283
1284 if (nv_worst_ratio < allcfg_worst_ratio) {
1285 fnvlist_remove(allcfgs, key);
1286 error = nvlist_add_nvlist(allcfgs,
1287 key, cfg);
1288 merged++;
1289 }
1290 } else if (error == ENOENT) {
1291 error = nvlist_add_nvlist(allcfgs, key, cfg);
1292 merged++;
1293 } else {
1294 return (error);
1295 }
1296
1297 break;
1298 default:
1299 continue;
1300 }
1301 }
1302
1303 nvlist_free(srccfgs);
1304
1305 *mergedp = merged;
1306
1307 return (0);
1308}
1309
1310/*
1311 * Merge the best map for each child count found in the listed files into
1312 * a new file. This allows 'draid generate' to be run in parallel and for
1313 * the results maps to be combined.
1314 */
1315static int
1316draid_merge(int argc, char *argv[])
1317{
861166b0 1318 char filename[MAXPATHLEN] = {0};
63cb3413 1319 int c, error, total_merged = 0;
b2255edc
BB
1320 nvlist_t *allcfgs;
1321
63cb3413 1322 while ((c = getopt(argc, argv, ":")) != -1) {
b2255edc 1323 switch (c) {
b2255edc
BB
1324 case ':':
1325 (void) fprintf(stderr,
1326 "missing argument for '%c' option\n", optopt);
1327 draid_usage();
1328 break;
1329 case '?':
1330 (void) fprintf(stderr, "invalid option '%c'\n",
1331 optopt);
1332 draid_usage();
1333 break;
1334 }
1335 }
1336
1337 if (argc < 4) {
1338 (void) fprintf(stderr,
1339 "A FILE and multiple SRCs must be specified.\n");
1340 return (1);
1341 }
1342
b2255edc
BB
1343 strncpy(filename, argv[optind], MAXPATHLEN - 1);
1344 optind++;
1345
1346 error = read_map(filename, &allcfgs);
1347 if (error == ENOENT) {
1348 allcfgs = fnvlist_alloc();
1349 } else if (error != 0) {
1350 printf("Error read_map(): %s\n", strerror(error));
1351 return (error);
1352 }
1353
1354 while (optind < argc) {
861166b0 1355 char srcfilename[MAXPATHLEN] = {0};
b2255edc
BB
1356 int merged = 0;
1357
b2255edc
BB
1358 strncpy(srcfilename, argv[optind], MAXPATHLEN - 1);
1359
1360 error = draid_merge_impl(allcfgs, srcfilename, &merged);
1361 if (error) {
1362 printf("Error draid_merge_impl(): %s\n",
1363 strerror(error));
1364 nvlist_free(allcfgs);
1365 return (1);
1366 }
1367
1368 total_merged += merged;
1369 printf("Merged %d key(s) from '%s' into '%s'\n", merged,
1370 srcfilename, filename);
1371
1372 optind++;
1373 }
1374
1375 if (total_merged > 0)
1376 write_map(filename, allcfgs);
1377
1378 printf("Merged a total of %d key(s) into '%s'\n", total_merged,
1379 filename);
1380
1381 nvlist_free(allcfgs);
1382
1383 return (0);
1384}
1385
/*
 * Dispatch to the handler for the subcommand named in argv[1].  Each handler
 * receives the argument vector shifted by one, so that the subcommand name
 * takes the place of the program name.  Returns the handler's result, or 1
 * when the subcommand is missing or unknown.
 */
int
main(int argc, char *argv[])
{
	if (argc < 2) {
		draid_usage();
		/* argv[1] must not be read if draid_usage() returns. */
		return (1);
	}

	char *subcommand = argv[1];

	if (strcmp(subcommand, "generate") == 0) {
		return (draid_generate(argc - 1, argv + 1));
	} else if (strcmp(subcommand, "verify") == 0) {
		return (draid_verify(argc - 1, argv + 1));
	} else if (strcmp(subcommand, "dump") == 0) {
		return (draid_dump(argc - 1, argv + 1));
	} else if (strcmp(subcommand, "table") == 0) {
		return (draid_table(argc - 1, argv + 1));
	} else if (strcmp(subcommand, "merge") == 0) {
		return (draid_merge(argc - 1, argv + 1));
	}

	draid_usage();

	/*
	 * Falling off the end of main() yields exit status 0, which would
	 * report an unknown subcommand as success.
	 */
	return (1);
}