]> git.proxmox.com Git - mirror_zfs.git/blob - tests/zfs-tests/cmd/draid/draid.c
Distributed Spare (dRAID) Feature
[mirror_zfs.git] / tests / zfs-tests / cmd / draid / draid.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2018 Intel Corporation.
23 * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
24 */
25
26 #include <stdio.h>
27 #include <zlib.h>
28 #include <zfs_fletcher.h>
29 #include <sys/vdev_draid.h>
30 #include <sys/nvpair.h>
31 #include <sys/stat.h>
32
/*
 * The number of rows to generate for new permutation maps.  This is the
 * nperms value passed to alloc_new_map() when candidate maps are evaluated.
 */
#define MAP_ROWS_DEFAULT 256

/*
 * Key values for dRAID maps when stored as nvlists.  The seed, checksum,
 * children and nperms entries are uint64 values and the permutations are a
 * uint8 array.  The worst and average ratios are stored as uint64 values
 * scaled by 1000 (i.e. a ratio of 1.234 is stored as 1234).
 */
#define MAP_SEED "seed"
#define MAP_CHECKSUM "checksum"
#define MAP_WORST_RATIO "worst_ratio"
#define MAP_AVG_RATIO "avg_ratio"
#define MAP_CHILDREN "children"
#define MAP_NPERMS "nperms"
#define MAP_PERMS "perms"
48
/*
 * Print the command summary to stderr and terminate the process with an
 * exit status of 1.  This function does not return.
 */
static void
draid_usage(void)
{
	static const char usage_text[] =
	    "usage: draid command args ...\n"
	    "Available commands are:\n"
	    "\n"
	    "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n"
	    "\tdraid verify [-rv] FILE\n"
	    "\tdraid dump [-v] [-m min] [-n max] FILE\n"
	    "\tdraid table FILE\n"
	    "\tdraid merge FILE SRC SRC...\n";

	(void) fputs(usage_text, stderr);
	exit(1);
}
63
64 static int
65 read_map(const char *filename, nvlist_t **allcfgs)
66 {
67 int block_size = 131072;
68 int buf_size = 131072;
69 int tmp_size, error;
70 char *tmp_buf;
71
72 struct stat64 stat;
73 if (lstat64(filename, &stat) != 0)
74 return (errno);
75
76 if (stat.st_size == 0 ||
77 !(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) {
78 return (EINVAL);
79 }
80
81 gzFile fp = gzopen(filename, "rb");
82 if (fp == Z_NULL)
83 return (errno);
84
85 char *buf = malloc(buf_size);
86 if (buf == NULL) {
87 (void) gzclose(fp);
88 return (ENOMEM);
89 }
90
91 ssize_t rc, bytes = 0;
92 while (!gzeof(fp)) {
93 rc = gzread(fp, buf + bytes, block_size);
94 if ((rc < 0) || (rc == 0 && !gzeof(fp))) {
95 free(buf);
96 (void) gzclose(fp);
97 (void) gzerror(fp, &error);
98 return (error);
99 } else {
100 bytes += rc;
101
102 if (bytes + block_size >= buf_size) {
103 tmp_size = 2 * buf_size;
104 tmp_buf = malloc(tmp_size);
105 if (tmp_buf == NULL) {
106 free(buf);
107 (void) gzclose(fp);
108 return (ENOMEM);
109 }
110
111 memcpy(tmp_buf, buf, bytes);
112 free(buf);
113 buf = tmp_buf;
114 buf_size = tmp_size;
115 }
116 }
117 }
118
119 (void) gzclose(fp);
120
121 error = nvlist_unpack(buf, bytes, allcfgs, 0);
122 free(buf);
123
124 return (error);
125 }
126
127 /*
128 * Read a map from the specified filename. A file contains multiple maps
129 * which are indexed by the number of children. The caller is responsible
130 * for freeing the configuration returned.
131 */
132 static int
133 read_map_key(const char *filename, char *key, nvlist_t **cfg)
134 {
135 nvlist_t *allcfgs, *foundcfg = NULL;
136 int error;
137
138 error = read_map(filename, &allcfgs);
139 if (error != 0)
140 return (error);
141
142 nvlist_lookup_nvlist(allcfgs, key, &foundcfg);
143 if (foundcfg != NULL) {
144 nvlist_dup(foundcfg, cfg, KM_SLEEP);
145 error = 0;
146 } else {
147 error = ENOENT;
148 }
149
150 nvlist_free(allcfgs);
151
152 return (error);
153 }
154
155 /*
156 * Write all mappings to the map file.
157 */
158 static int
159 write_map(const char *filename, nvlist_t *allcfgs)
160 {
161 size_t buflen = 0;
162 int error;
163
164 error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR);
165 if (error)
166 return (error);
167
168 char *buf = malloc(buflen);
169 if (buf == NULL)
170 return (ENOMEM);
171
172 error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
173 if (error) {
174 free(buf);
175 return (error);
176 }
177
178 /*
179 * Atomically update the file using a temporary file and the
180 * traditional unlink then rename steps. This code provides
181 * no locking, it only guarantees the packed nvlist on disk
182 * is updated atomically and is internally consistent.
183 */
184 char *tmpname = calloc(MAXPATHLEN, 1);
185 if (tmpname == NULL) {
186 free(buf);
187 return (ENOMEM);
188 }
189
190 snprintf(tmpname, MAXPATHLEN - 1, "%s.XXXXXX", filename);
191
192 int fd = mkstemp(tmpname);
193 if (fd < 0) {
194 error = errno;
195 free(buf);
196 free(tmpname);
197 return (error);
198 }
199 (void) close(fd);
200
201 gzFile fp = gzopen(tmpname, "w9b");
202 if (fp == Z_NULL) {
203 error = errno;
204 free(buf);
205 free(tmpname);
206 return (errno);
207 }
208
209 ssize_t rc, bytes = 0;
210 while (bytes < buflen) {
211 size_t size = MIN(buflen - bytes, 131072);
212 rc = gzwrite(fp, buf + bytes, size);
213 if (rc < 0) {
214 free(buf);
215 (void) gzerror(fp, &error);
216 (void) gzclose(fp);
217 (void) unlink(tmpname);
218 free(tmpname);
219 return (error);
220 } else if (rc == 0) {
221 break;
222 } else {
223 bytes += rc;
224 }
225 }
226
227 free(buf);
228 (void) gzclose(fp);
229
230 if (bytes != buflen) {
231 (void) unlink(tmpname);
232 free(tmpname);
233 return (EIO);
234 }
235
236 /*
237 * Unlink the previous config file and replace it with the updated
238 * version. If we're able to unlink the file then directory is
239 * writable by us and the subsequent rename should never fail.
240 */
241 error = unlink(filename);
242 if (error != 0 && errno != ENOENT) {
243 error = errno;
244 (void) unlink(tmpname);
245 free(tmpname);
246 return (error);
247 }
248
249 error = rename(tmpname, filename);
250 if (error != 0) {
251 error = errno;
252 (void) unlink(tmpname);
253 free(tmpname);
254 return (error);
255 }
256
257 free(tmpname);
258
259 return (0);
260 }
261
262 /*
263 * Add the dRAID map to the file and write it out.
264 */
265 static int
266 write_map_key(const char *filename, char *key, draid_map_t *map,
267 double worst_ratio, double avg_ratio)
268 {
269 nvlist_t *nv_cfg, *allcfgs;
270 int error;
271
272 /*
273 * Add the configuration to an existing or new file. The new
274 * configuration will replace an existing configuration with the
275 * same key if it has a lower ratio and is therefore better.
276 */
277 error = read_map(filename, &allcfgs);
278 if (error == ENOENT) {
279 allcfgs = fnvlist_alloc();
280 } else if (error != 0) {
281 return (error);
282 }
283
284 error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg);
285 if (error == 0) {
286 uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg,
287 MAP_WORST_RATIO);
288 double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0;
289
290 if (worst_ratio < nv_worst_ratio) {
291 /* Replace old map with the more balanced new map. */
292 fnvlist_remove(allcfgs, key);
293 } else {
294 /* The old map is preferable, keep it. */
295 nvlist_free(allcfgs);
296 return (EEXIST);
297 }
298 }
299
300 nvlist_t *cfg = fnvlist_alloc();
301 fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed);
302 fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum);
303 fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children);
304 fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms);
305 fnvlist_add_uint8_array(cfg, MAP_PERMS, map->dm_perms,
306 map->dm_children * map->dm_nperms * sizeof (uint8_t));
307
308 fnvlist_add_uint64(cfg, MAP_WORST_RATIO,
309 (uint64_t)(worst_ratio * 1000.0));
310 fnvlist_add_uint64(cfg, MAP_AVG_RATIO,
311 (uint64_t)(avg_ratio * 1000.0));
312
313 error = nvlist_add_nvlist(allcfgs, key, cfg);
314 if (error == 0)
315 error = write_map(filename, allcfgs);
316
317 nvlist_free(cfg);
318 nvlist_free(allcfgs);
319 return (error);
320 }
321
322 static void
323 dump_map(draid_map_t *map, char *key, double worst_ratio, double avg_ratio,
324 int verbose)
325 {
326 if (verbose == 0) {
327 return;
328 } else if (verbose == 1) {
329 printf(" \"%s\": seed: 0x%016llx worst_ratio: %2.03f "
330 "avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed,
331 worst_ratio, avg_ratio);
332 return;
333 } else {
334 printf(" \"%s\":\n"
335 " seed: 0x%016llx\n"
336 " checksum: 0x%016llx\n"
337 " worst_ratio: %2.03f\n"
338 " avg_ratio: %2.03f\n"
339 " children: %llu\n"
340 " nperms: %llu\n",
341 key, (u_longlong_t)map->dm_seed,
342 (u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio,
343 (u_longlong_t)map->dm_children,
344 (u_longlong_t)map->dm_nperms);
345
346 if (verbose > 2) {
347 printf(" perms = {\n");
348 for (int i = 0; i < map->dm_nperms; i++) {
349 printf(" { ");
350 for (int j = 0; j < map->dm_children; j++) {
351 printf("%3d%s ", map->dm_perms[
352 i * map->dm_children + j],
353 j < map->dm_children - 1 ?
354 "," : "");
355 }
356 printf(" },\n");
357 }
358 printf(" }\n");
359 } else if (verbose == 2) {
360 printf(" draid_perms = <omitted>\n");
361 }
362 }
363 }
364
365 static void
366 dump_map_nv(char *key, nvlist_t *cfg, int verbose)
367 {
368 draid_map_t map;
369 uint_t c;
370
371 uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO);
372 uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
373
374 map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
375 map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
376 map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
377 map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
378 nvlist_lookup_uint8_array(cfg, MAP_PERMS, &map.dm_perms, &c);
379
380 dump_map(&map, key, (double)worst_ratio / 1000.0,
381 avg_ratio / 1000.0, verbose);
382 }
383
384 /*
385 * Print a summary of the mapping.
386 */
387 static int
388 dump_map_key(const char *filename, char *key, int verbose)
389 {
390 nvlist_t *cfg;
391 int error;
392
393 error = read_map_key(filename, key, &cfg);
394 if (error != 0)
395 return (error);
396
397 dump_map_nv(key, cfg, verbose);
398
399 return (0);
400 }
401
402 /*
403 * Allocate a new permutation map for evaluation.
404 */
405 static int
406 alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed,
407 draid_map_t **mapp)
408 {
409 draid_map_t *map;
410 int error;
411
412 map = malloc(sizeof (draid_map_t));
413 if (map == NULL)
414 return (ENOMEM);
415
416 map->dm_children = children;
417 map->dm_nperms = nperms;
418 map->dm_seed = seed;
419 map->dm_checksum = 0;
420
421 error = vdev_draid_generate_perms(map, &map->dm_perms);
422 if (error) {
423 free(map);
424 return (error);
425 }
426
427 *mapp = map;
428
429 return (0);
430 }
431
432 /*
433 * Allocate the fixed permutation map for N children.
434 */
435 static int
436 alloc_fixed_map(uint64_t children, draid_map_t **mapp)
437 {
438 const draid_map_t *fixed_map;
439 draid_map_t *map;
440 int error;
441
442 error = vdev_draid_lookup_map(children, &fixed_map);
443 if (error)
444 return (error);
445
446 map = malloc(sizeof (draid_map_t));
447 if (map == NULL)
448 return (ENOMEM);
449
450 memcpy(map, fixed_map, sizeof (draid_map_t));
451 VERIFY3U(map->dm_checksum, !=, 0);
452
453 error = vdev_draid_generate_perms(map, &map->dm_perms);
454 if (error) {
455 free(map);
456 return (error);
457 }
458
459 *mapp = map;
460
461 return (0);
462 }
463
464 /*
465 * Free a permutation map.
466 */
467 static void
468 free_map(draid_map_t *map)
469 {
470 free(map->dm_perms);
471 free(map);
472 }
473
474 /*
475 * Check if dev is in the provided list of faulted devices.
476 */
477 static inline boolean_t
478 is_faulted(int *faulted_devs, int nfaulted, int dev)
479 {
480 for (int i = 0; i < nfaulted; i++)
481 if (faulted_devs[i] == dev)
482 return (B_TRUE);
483
484 return (B_FALSE);
485 }
486
487 /*
488 * Evaluate how resilvering I/O will be distributed given a list of faulted
489 * vdevs. As a simplification we assume one IO is sufficient to repair each
490 * damaged device in a group.
491 */
492 static double
493 eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares,
494 int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios)
495 {
496 uint64_t children = map->dm_children;
497 uint64_t ngroups = 1;
498 uint64_t ndisks = children - nspares;
499
500 /*
501 * Calculate the minimum number of groups required to fill a slice.
502 */
503 while (ngroups * (groupwidth) % (children - nspares) != 0)
504 ngroups++;
505
506 int *ios = calloc(map->dm_children, sizeof (uint64_t));
507
508 /* Resilver all rows */
509 for (int i = 0; i < map->dm_nperms; i++) {
510 uint8_t *row = &map->dm_perms[i * map->dm_children];
511
512 /* Resilver all groups with faulted drives */
513 for (int j = 0; j < ngroups; j++) {
514 uint64_t spareidx = map->dm_children - nspares;
515 boolean_t repair_needed = B_FALSE;
516
517 /* See if any devices in this group are faulted */
518 uint64_t groupstart = (j * groupwidth) % ndisks;
519
520 for (int k = 0; k < groupwidth; k++) {
521 uint64_t groupidx = (groupstart + k) % ndisks;
522
523 repair_needed = is_faulted(faulted_devs,
524 nfaulted, row[groupidx]);
525 if (repair_needed)
526 break;
527 }
528
529 if (repair_needed == B_FALSE)
530 continue;
531
532 /*
533 * This group is degraded. Calculate the number of
534 * reads the non-faulted drives require and the number
535 * of writes to the distributed hot spare for this row.
536 */
537 for (int k = 0; k < groupwidth; k++) {
538 uint64_t groupidx = (groupstart + k) % ndisks;
539
540 if (!is_faulted(faulted_devs, nfaulted,
541 row[groupidx])) {
542 ios[row[groupidx]]++;
543 } else if (nspares > 0) {
544 while (is_faulted(faulted_devs,
545 nfaulted, row[spareidx])) {
546 spareidx++;
547 }
548
549 ASSERT3U(spareidx, <, map->dm_children);
550 ios[row[spareidx]]++;
551 spareidx++;
552 }
553 }
554 }
555 }
556
557 *min_child_ios = INT_MAX;
558 *max_child_ios = 0;
559
560 /*
561 * Find the drives with fewest and most required I/O. These values
562 * are used to calculate the imbalance ratio. To avoid returning an
563 * infinite value for permutations which have children that perform
564 * no IO a floor of 1 IO per child is set. This ensures a meaningful
565 * ratio is returned for comparison and it is not an uncommon when
566 * there are a large number of children.
567 */
568 for (int i = 0; i < map->dm_children; i++) {
569
570 if (is_faulted(faulted_devs, nfaulted, i)) {
571 ASSERT0(ios[i]);
572 continue;
573 }
574
575 if (ios[i] == 0)
576 ios[i] = 1;
577
578 if (ios[i] < *min_child_ios)
579 *min_child_ios = ios[i];
580
581 if (ios[i] > *max_child_ios)
582 *max_child_ios = ios[i];
583 }
584
585 ASSERT3S(*min_child_ios, !=, INT_MAX);
586 ASSERT3S(*max_child_ios, !=, 0);
587
588 double ratio = (double)(*max_child_ios) / (double)(*min_child_ios);
589
590 free(ios);
591
592 return (ratio);
593 }
594
595 /*
596 * Evaluate the quality of the permutation mapping by considering possible
597 * device failures. Returns the imbalance ratio for the worst mapping which
598 * is defined to be the largest number of child IOs over the fewest number
599 * child IOs. A value of 1.0 indicates the mapping is perfectly balance and
600 * all children perform an equal amount of work during reconstruction.
601 */
602 static void
603 eval_decluster(draid_map_t *map, double *worst_ratiop, double *avg_ratiop)
604 {
605 uint64_t children = map->dm_children;
606 double worst_ratio = 1.0;
607 double sum = 0;
608 int worst_min_ios = 0, worst_max_ios = 0;
609 int n = 0;
610
611 /*
612 * When there are only 2 children there can be no distributed
613 * spare and no resilver to evaluate. Default to a ratio of 1.0
614 * for this degenerate case.
615 */
616 if (children == VDEV_DRAID_MIN_CHILDREN) {
617 *worst_ratiop = 1.0;
618 *avg_ratiop = 1.0;
619 return;
620 }
621
622 /*
623 * Score the mapping as if it had either 1 or 2 distributed spares.
624 */
625 for (int nspares = 1; nspares <= 2; nspares++) {
626 uint64_t faults = nspares;
627
628 /*
629 * Score groupwidths up to 19. This value was choosen as the
630 * largest reasonable width (16d+3p). dRAID pools may be still
631 * be created with wider stripes but they are not considered in
632 * this analysis in order to optimize for the most common cases.
633 */
634 for (uint64_t groupwidth = 2;
635 groupwidth <= MIN(children - nspares, 19);
636 groupwidth++) {
637 int faulted_devs[2];
638 int min_ios, max_ios;
639
640 /*
641 * Score possible devices faults. This is limited
642 * to exactly one fault per distributed spare for
643 * the purposes of this similation.
644 */
645 for (int f1 = 0; f1 < children; f1++) {
646 faulted_devs[0] = f1;
647 double ratio;
648
649 if (faults == 1) {
650 ratio = eval_resilver(map, groupwidth,
651 nspares, faulted_devs, faults,
652 &min_ios, &max_ios);
653
654 if (ratio > worst_ratio) {
655 worst_ratio = ratio;
656 worst_min_ios = min_ios;
657 worst_max_ios = max_ios;
658 }
659
660 sum += ratio;
661 n++;
662 } else if (faults == 2) {
663 for (int f2 = f1 + 1; f2 < children;
664 f2++) {
665 faulted_devs[1] = f2;
666
667 ratio = eval_resilver(map,
668 groupwidth, nspares,
669 faulted_devs, faults,
670 &min_ios, &max_ios);
671
672 if (ratio > worst_ratio) {
673 worst_ratio = ratio;
674 worst_min_ios = min_ios;
675 worst_max_ios = max_ios;
676 }
677
678 sum += ratio;
679 n++;
680 }
681 }
682 }
683 }
684 }
685
686 *worst_ratiop = worst_ratio;
687 *avg_ratiop = sum / n;
688
689 /*
690 * Log the min/max io values for particularly unbalanced maps.
691 * Since the maps are generated entirely randomly these are possible
692 * be exceedingly unlikely. We log it for possible investigation.
693 */
694 if (worst_ratio > 100.0) {
695 dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2);
696 printf("worst_min_ios=%d worst_max_ios=%d\n",
697 worst_min_ios, worst_max_ios);
698 }
699 }
700
701 static int
702 eval_maps(uint64_t children, int passes, uint64_t *map_seed,
703 draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop)
704 {
705 draid_map_t *best_map = NULL;
706 double best_worst_ratio = 1000.0;
707 double best_avg_ratio = 1000.0;
708
709 /*
710 * Perform the requested number of passes evaluating randomly
711 * generated permutation maps. Only the best version is kept.
712 */
713 for (int i = 0; i < passes; i++) {
714 double worst_ratio, avg_ratio;
715 draid_map_t *map;
716 int error;
717
718 /*
719 * Calculate the next seed and generate a new candidate map.
720 */
721 error = alloc_new_map(children, MAP_ROWS_DEFAULT,
722 vdev_draid_rand(map_seed), &map);
723 if (error)
724 return (error);
725
726 /*
727 * Consider maps with a lower worst_ratio to be of higher
728 * quality. Some maps may have a lower avg_ratio but they
729 * are discarded since they might include some particularly
730 * imbalanced permuations. The average is tracked to in
731 * order to get a sense of the average permutation quality.
732 */
733 eval_decluster(map, &worst_ratio, &avg_ratio);
734
735 if (best_map == NULL || worst_ratio < best_worst_ratio) {
736
737 if (best_map != NULL)
738 free_map(best_map);
739
740 best_map = map;
741 best_worst_ratio = worst_ratio;
742 best_avg_ratio = avg_ratio;
743 } else {
744 free_map(map);
745 }
746 }
747
748 /*
749 * After determining the best map generate a checksum over the full
750 * permutation array. This checksum is verified when opening a dRAID
751 * pool to ensure the generated in memory permutations are correct.
752 */
753 zio_cksum_t cksum;
754 fletcher_4_native_varsize(best_map->dm_perms,
755 sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms,
756 &cksum);
757 best_map->dm_checksum = cksum.zc_word[0];
758
759 *best_mapp = best_map;
760 *best_ratiop = best_worst_ratio;
761 *avg_ratiop = best_avg_ratio;
762
763 return (0);
764 }
765
766 static int
767 draid_generate(int argc, char *argv[])
768 {
769 char filename[MAXPATHLEN];
770 uint64_t map_seed;
771 int c, fd, error, verbose = 0, passes = 1, continuous = 0;
772 int min_children = VDEV_DRAID_MIN_CHILDREN;
773 int max_children = VDEV_DRAID_MAX_CHILDREN;
774 int restarts = 0;
775
776 while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) {
777 switch (c) {
778 case 'c':
779 continuous++;
780 break;
781 case 'm':
782 min_children = (int)strtol(optarg, NULL, 0);
783 if (min_children < VDEV_DRAID_MIN_CHILDREN) {
784 (void) fprintf(stderr, "A minimum of 2 "
785 "children are required.\n");
786 return (1);
787 }
788
789 break;
790 case 'n':
791 max_children = (int)strtol(optarg, NULL, 0);
792 if (max_children > VDEV_DRAID_MAX_CHILDREN) {
793 (void) fprintf(stderr, "A maximum of %d "
794 "children are allowed.\n",
795 VDEV_DRAID_MAX_CHILDREN);
796 return (1);
797 }
798 break;
799 case 'p':
800 passes = (int)strtol(optarg, NULL, 0);
801 break;
802 case 'v':
803 /*
804 * 0 - Only log when a better map is added to the file.
805 * 1 - Log the current best map for each child count.
806 * Minimal output on a single summary line.
807 * 2 - Log the current best map for each child count.
808 * More verbose includes most map fields.
809 * 3 - Log the current best map for each child count.
810 * Very verbose all fields including the full map.
811 */
812 verbose++;
813 break;
814 case ':':
815 (void) fprintf(stderr,
816 "missing argument for '%c' option\n", optopt);
817 draid_usage();
818 break;
819 case '?':
820 (void) fprintf(stderr, "invalid option '%c'\n",
821 optopt);
822 draid_usage();
823 break;
824 }
825 }
826
827 if (argc > optind) {
828 bzero(filename, MAXPATHLEN);
829 strncpy(filename, argv[optind], MAXPATHLEN - 1);
830 } else {
831 (void) fprintf(stderr, "A FILE must be specified.\n");
832 return (1);
833 }
834
835 restart:
836 /*
837 * Start with a fresh seed from /dev/urandom.
838 */
839 fd = open("/dev/urandom", O_RDONLY);
840 if (fd < 0) {
841 printf("Unable to open /dev/urandom: %s\n:", strerror(errno));
842 return (1);
843 } else {
844 ssize_t bytes = sizeof (map_seed);
845 ssize_t bytes_read = 0;
846
847 while (bytes_read < bytes) {
848 ssize_t rc = read(fd, ((char *)&map_seed) + bytes_read,
849 bytes - bytes_read);
850 if (rc < 0) {
851 printf("Unable to read /dev/urandom: %s\n:",
852 strerror(errno));
853 return (1);
854 }
855 bytes_read += rc;
856 }
857
858 (void) close(fd);
859 }
860
861 if (restarts == 0)
862 printf("Writing generated mappings to '%s':\n", filename);
863
864 /*
865 * Generate maps for all requested child counts. The best map for
866 * each child count is written out to the specified file. If the file
867 * already contains a better mapping this map will not be added.
868 */
869 for (uint64_t children = min_children;
870 children <= max_children; children++) {
871 char key[8] = { 0 };
872 draid_map_t *map;
873 double worst_ratio = 1000.0;
874 double avg_ratio = 1000.0;
875
876 error = eval_maps(children, passes, &map_seed, &map,
877 &worst_ratio, &avg_ratio);
878 if (error) {
879 printf("Error eval_maps(): %s\n", strerror(error));
880 return (1);
881 }
882
883 if (worst_ratio < 1.0 || avg_ratio < 1.0) {
884 printf("Error ratio < 1.0: worst_ratio = %2.03f "
885 "avg_ratio = %2.03f\n", worst_ratio, avg_ratio);
886 return (1);
887 }
888
889 snprintf(key, 7, "%llu", (u_longlong_t)children);
890 error = write_map_key(filename, key, map, worst_ratio,
891 avg_ratio);
892 if (error == 0) {
893 /* The new map was added to the file. */
894 dump_map(map, key, worst_ratio, avg_ratio,
895 MAX(verbose, 1));
896 } else if (error == EEXIST) {
897 /* The existing map was preferable and kept. */
898 if (verbose > 0)
899 dump_map_key(filename, key, verbose);
900 } else {
901 printf("Error write_map_key(): %s\n", strerror(error));
902 return (1);
903 }
904
905 free_map(map);
906 }
907
908 /*
909 * When the continuous option is set restart at the minimum number of
910 * children instead of exiting. This option is useful as a mechanism
911 * to continuous try and refine the discovered permutations.
912 */
913 if (continuous) {
914 restarts++;
915 printf("Restarting by request (-c): %d\n", restarts);
916 goto restart;
917 }
918
919 return (0);
920 }
921
922 /*
923 * Verify each map in the file by generating its in-memory permutation array
924 * and comfirming its checksum is correct.
925 */
926 static int
927 draid_verify(int argc, char *argv[])
928 {
929 char filename[MAXPATHLEN];
930 int n = 0, c, error, verbose = 1;
931 int check_ratios = 0;
932
933 while ((c = getopt(argc, argv, ":rv")) != -1) {
934 switch (c) {
935 case 'r':
936 check_ratios++;
937 break;
938 case 'v':
939 verbose++;
940 break;
941 case ':':
942 (void) fprintf(stderr,
943 "missing argument for '%c' option\n", optopt);
944 draid_usage();
945 break;
946 case '?':
947 (void) fprintf(stderr, "invalid option '%c'\n",
948 optopt);
949 draid_usage();
950 break;
951 }
952 }
953
954 if (argc > optind) {
955 char *abspath = malloc(MAXPATHLEN);
956 if (abspath == NULL)
957 return (ENOMEM);
958
959 bzero(filename, MAXPATHLEN);
960 if (realpath(argv[optind], abspath) != NULL)
961 strncpy(filename, abspath, MAXPATHLEN - 1);
962 else
963 strncpy(filename, argv[optind], MAXPATHLEN - 1);
964
965 free(abspath);
966 } else {
967 (void) fprintf(stderr, "A FILE must be specified.\n");
968 return (1);
969 }
970
971 printf("Verifying permutation maps: '%s'\n", filename);
972
973 /*
974 * Lookup hardcoded permutation map for each valid number of children
975 * and verify a generated map has the correct checksum. Then compare
976 * the generated map values with the nvlist map values read from the
977 * reference file to cross-check the permutation.
978 */
979 for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
980 children <= VDEV_DRAID_MAX_CHILDREN;
981 children++) {
982 draid_map_t *map;
983 char key[8];
984
985 bzero(key, 8);
986 snprintf(key, 8, "%llu", (u_longlong_t)children);
987
988 error = alloc_fixed_map(children, &map);
989 if (error) {
990 printf("Error alloc_fixed_map() failed: %s\n",
991 error == ECKSUM ? "Invalid checksum" :
992 strerror(error));
993 return (1);
994 }
995
996 uint64_t nv_seed, nv_checksum, nv_children, nv_nperms;
997 uint8_t *nv_perms;
998 nvlist_t *cfg;
999 uint_t c;
1000
1001 error = read_map_key(filename, key, &cfg);
1002 if (error != 0) {
1003 printf("Error read_map_key() failed: %s\n",
1004 strerror(error));
1005 free_map(map);
1006 return (1);
1007 }
1008
1009 nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
1010 nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
1011 nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
1012 nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
1013 nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms, &c);
1014
1015 /*
1016 * Compare draid_map_t and nvlist reference values.
1017 */
1018 if (map->dm_seed != nv_seed) {
1019 printf("Error different seeds: 0x%016llx != "
1020 "0x%016llx\n", (u_longlong_t)map->dm_seed,
1021 (u_longlong_t)nv_seed);
1022 error = EINVAL;
1023 }
1024
1025 if (map->dm_checksum != nv_checksum) {
1026 printf("Error different checksums: 0x%016llx "
1027 "!= 0x%016llx\n",
1028 (u_longlong_t)map->dm_checksum,
1029 (u_longlong_t)nv_checksum);
1030 error = EINVAL;
1031 }
1032
1033 if (map->dm_children != nv_children) {
1034 printf("Error different children: %llu "
1035 "!= %llu\n", (u_longlong_t)map->dm_children,
1036 (u_longlong_t)nv_children);
1037 error = EINVAL;
1038 }
1039
1040 if (map->dm_nperms != nv_nperms) {
1041 printf("Error different nperms: %llu "
1042 "!= %llu\n", (u_longlong_t)map->dm_nperms,
1043 (u_longlong_t)nv_nperms);
1044 error = EINVAL;
1045 }
1046
1047 for (uint64_t i = 0; i < nv_children * nv_nperms; i++) {
1048 if (map->dm_perms[i] != nv_perms[i]) {
1049 printf("Error different perms[%llu]: "
1050 "%d != %d\n", (u_longlong_t)i,
1051 (int)map->dm_perms[i],
1052 (int)nv_perms[i]);
1053 error = EINVAL;
1054 break;
1055 }
1056 }
1057
1058 /*
1059 * For good measure recalculate the worst and average
1060 * ratios and confirm they match the nvlist values.
1061 */
1062 if (check_ratios) {
1063 uint64_t nv_worst_ratio, nv_avg_ratio;
1064 double worst_ratio, avg_ratio;
1065
1066 eval_decluster(map, &worst_ratio, &avg_ratio);
1067
1068 nv_worst_ratio = fnvlist_lookup_uint64(cfg,
1069 MAP_WORST_RATIO);
1070 nv_avg_ratio = fnvlist_lookup_uint64(cfg,
1071 MAP_AVG_RATIO);
1072
1073 if (worst_ratio < 1.0 || avg_ratio < 1.0) {
1074 printf("Error ratio out of range %2.03f, "
1075 "%2.03f\n", worst_ratio, avg_ratio);
1076 error = EINVAL;
1077 }
1078
1079 if ((uint64_t)(worst_ratio * 1000.0) !=
1080 nv_worst_ratio) {
1081 printf("Error different worst_ratio %2.03f "
1082 "!= %2.03f\n", (double)nv_worst_ratio /
1083 1000.0, worst_ratio);
1084 error = EINVAL;
1085 }
1086
1087 if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) {
1088 printf("Error different average_ratio %2.03f "
1089 "!= %2.03f\n", (double)nv_avg_ratio /
1090 1000.0, avg_ratio);
1091 error = EINVAL;
1092 }
1093 }
1094
1095 if (error) {
1096 free_map(map);
1097 nvlist_free(cfg);
1098 return (1);
1099 }
1100
1101 if (verbose > 0) {
1102 printf("- %llu children: good\n",
1103 (u_longlong_t)children);
1104 }
1105 n++;
1106
1107 free_map(map);
1108 nvlist_free(cfg);
1109 }
1110
1111 if (n != (VDEV_DRAID_MAX_CHILDREN - 1)) {
1112 printf("Error permutation maps missing: %d / %d checked\n",
1113 n, VDEV_DRAID_MAX_CHILDREN - 1);
1114 return (1);
1115 }
1116
1117 printf("Successfully verified %d / %d permutation maps\n",
1118 n, VDEV_DRAID_MAX_CHILDREN - 1);
1119
1120 return (0);
1121 }
1122
1123 /*
1124 * Dump the contents of the specified mapping(s) for inspection.
1125 */
1126 static int
1127 draid_dump(int argc, char *argv[])
1128 {
1129 char filename[MAXPATHLEN];
1130 int c, error, verbose = 1;
1131 int min_children = VDEV_DRAID_MIN_CHILDREN;
1132 int max_children = VDEV_DRAID_MAX_CHILDREN;
1133
1134 while ((c = getopt(argc, argv, ":vm:n:")) != -1) {
1135 switch (c) {
1136 case 'm':
1137 min_children = (int)strtol(optarg, NULL, 0);
1138 if (min_children < 2) {
1139 (void) fprintf(stderr, "A minimum of 2 "
1140 "children are required.\n");
1141 return (1);
1142 }
1143
1144 break;
1145 case 'n':
1146 max_children = (int)strtol(optarg, NULL, 0);
1147 if (max_children > VDEV_DRAID_MAX_CHILDREN) {
1148 (void) fprintf(stderr, "A maximum of %d "
1149 "children are allowed.\n",
1150 VDEV_DRAID_MAX_CHILDREN);
1151 return (1);
1152 }
1153 break;
1154 case 'v':
1155 verbose++;
1156 break;
1157 case ':':
1158 (void) fprintf(stderr,
1159 "missing argument for '%c' option\n", optopt);
1160 draid_usage();
1161 break;
1162 case '?':
1163 (void) fprintf(stderr, "invalid option '%c'\n",
1164 optopt);
1165 draid_usage();
1166 break;
1167 }
1168 }
1169
1170 if (argc > optind) {
1171 bzero(filename, MAXPATHLEN);
1172 strncpy(filename, argv[optind], MAXPATHLEN - 1);
1173 } else {
1174 (void) fprintf(stderr, "A FILE must be specified.\n");
1175 return (1);
1176 }
1177
1178 /*
1179 * Dump maps for the requested child counts.
1180 */
1181 for (uint64_t children = min_children;
1182 children <= max_children; children++) {
1183 char key[8] = { 0 };
1184
1185 snprintf(key, 7, "%llu", (u_longlong_t)children);
1186 error = dump_map_key(filename, key, verbose);
1187 if (error) {
1188 printf("Error dump_map_key(): %s\n", strerror(error));
1189 return (1);
1190 }
1191 }
1192
1193 return (0);
1194 }
1195
1196 /*
1197 * Print all of the mappings as a C formated draid_map_t array. This table
1198 * is found in the module/zcommon/zfs_draid.c file and is the definative
1199 * source for all mapping used by dRAID. It cannot be updated without
1200 * changing the dRAID on disk format.
1201 */
1202 static int
1203 draid_table(int argc, char *argv[])
1204 {
1205 char filename[MAXPATHLEN];
1206 int error;
1207
1208 if (argc > optind) {
1209 bzero(filename, MAXPATHLEN);
1210 strncpy(filename, argv[optind], MAXPATHLEN - 1);
1211 } else {
1212 (void) fprintf(stderr, "A FILE must be specified.\n");
1213 return (1);
1214 }
1215
1216 printf("static const draid_map_t "
1217 "draid_maps[VDEV_DRAID_MAX_MAPS] = {\n");
1218
1219 for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
1220 children <= VDEV_DRAID_MAX_CHILDREN;
1221 children++) {
1222 uint64_t seed, checksum, nperms, avg_ratio;
1223 nvlist_t *cfg;
1224 char key[8];
1225
1226 bzero(key, 8);
1227 snprintf(key, 8, "%llu", (u_longlong_t)children);
1228
1229 error = read_map_key(filename, key, &cfg);
1230 if (error != 0) {
1231 printf("Error read_map_key() failed: %s\n",
1232 strerror(error));
1233 return (1);
1234 }
1235
1236 seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
1237 checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
1238 children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
1239 nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
1240 avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
1241
1242 printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t"
1243 "/* %2.03f */\n", (u_longlong_t)children,
1244 (u_longlong_t)nperms, (u_longlong_t)seed,
1245 (u_longlong_t)checksum, (double)avg_ratio / 1000.0);
1246
1247 nvlist_free(cfg);
1248 }
1249
1250 printf("};\n");
1251
1252 return (0);
1253 }
1254
1255 static int
1256 draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp)
1257 {
1258 nvlist_t *srccfgs;
1259 nvpair_t *elem = NULL;
1260 int error, merged = 0;
1261
1262 error = read_map(srcfilename, &srccfgs);
1263 if (error != 0)
1264 return (error);
1265
1266 while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) {
1267 uint64_t nv_worst_ratio;
1268 uint64_t allcfg_worst_ratio;
1269 nvlist_t *cfg, *allcfg;
1270 char *key;
1271
1272 switch (nvpair_type(elem)) {
1273 case DATA_TYPE_NVLIST:
1274
1275 (void) nvpair_value_nvlist(elem, &cfg);
1276 key = nvpair_name(elem);
1277
1278 nv_worst_ratio = fnvlist_lookup_uint64(cfg,
1279 MAP_WORST_RATIO);
1280
1281 error = nvlist_lookup_nvlist(allcfgs, key, &allcfg);
1282 if (error == 0) {
1283 allcfg_worst_ratio = fnvlist_lookup_uint64(
1284 allcfg, MAP_WORST_RATIO);
1285
1286 if (nv_worst_ratio < allcfg_worst_ratio) {
1287 fnvlist_remove(allcfgs, key);
1288 error = nvlist_add_nvlist(allcfgs,
1289 key, cfg);
1290 merged++;
1291 }
1292 } else if (error == ENOENT) {
1293 error = nvlist_add_nvlist(allcfgs, key, cfg);
1294 merged++;
1295 } else {
1296 return (error);
1297 }
1298
1299 break;
1300 default:
1301 continue;
1302 }
1303 }
1304
1305 nvlist_free(srccfgs);
1306
1307 *mergedp = merged;
1308
1309 return (0);
1310 }
1311
1312 /*
1313 * Merge the best map for each child count found in the listed files into
1314 * a new file. This allows 'draid generate' to be run in parallel and for
1315 * the results maps to be combined.
1316 */
1317 static int
1318 draid_merge(int argc, char *argv[])
1319 {
1320 char filename[MAXPATHLEN];
1321 int c, error, total_merged = 0, verbose = 0;
1322 nvlist_t *allcfgs;
1323
1324 while ((c = getopt(argc, argv, ":v")) != -1) {
1325 switch (c) {
1326 case 'v':
1327 verbose++;
1328 break;
1329 case ':':
1330 (void) fprintf(stderr,
1331 "missing argument for '%c' option\n", optopt);
1332 draid_usage();
1333 break;
1334 case '?':
1335 (void) fprintf(stderr, "invalid option '%c'\n",
1336 optopt);
1337 draid_usage();
1338 break;
1339 }
1340 }
1341
1342 if (argc < 4) {
1343 (void) fprintf(stderr,
1344 "A FILE and multiple SRCs must be specified.\n");
1345 return (1);
1346 }
1347
1348 bzero(filename, MAXPATHLEN);
1349 strncpy(filename, argv[optind], MAXPATHLEN - 1);
1350 optind++;
1351
1352 error = read_map(filename, &allcfgs);
1353 if (error == ENOENT) {
1354 allcfgs = fnvlist_alloc();
1355 } else if (error != 0) {
1356 printf("Error read_map(): %s\n", strerror(error));
1357 return (error);
1358 }
1359
1360 while (optind < argc) {
1361 char srcfilename[MAXPATHLEN];
1362 int merged = 0;
1363
1364 bzero(srcfilename, MAXPATHLEN);
1365 strncpy(srcfilename, argv[optind], MAXPATHLEN - 1);
1366
1367 error = draid_merge_impl(allcfgs, srcfilename, &merged);
1368 if (error) {
1369 printf("Error draid_merge_impl(): %s\n",
1370 strerror(error));
1371 nvlist_free(allcfgs);
1372 return (1);
1373 }
1374
1375 total_merged += merged;
1376 printf("Merged %d key(s) from '%s' into '%s'\n", merged,
1377 srcfilename, filename);
1378
1379 optind++;
1380 }
1381
1382 if (total_merged > 0)
1383 write_map(filename, allcfgs);
1384
1385 printf("Merged a total of %d key(s) into '%s'\n", total_merged,
1386 filename);
1387
1388 nvlist_free(allcfgs);
1389
1390 return (0);
1391 }
1392
/*
 * Entry point: dispatch to the handler of the subcommand named by argv[1].
 *
 * Each handler receives the argument vector shifted by one, so that the
 * subcommand name takes the place of the program name.  The handler's
 * return value becomes the exit status.  A missing or unknown subcommand
 * prints the usage text and yields a status of 1.
 */
int
main(int argc, char *argv[])
{
	/*
	 * Return explicitly after draid_usage() so that argv[1] is never
	 * dereferenced when it is absent, whether or not draid_usage()
	 * itself returns.
	 */
	if (argc < 2) {
		draid_usage();
		return (1);
	}

	char *subcommand = argv[1];

	if (strcmp(subcommand, "generate") == 0)
		return (draid_generate(argc - 1, argv + 1));

	if (strcmp(subcommand, "verify") == 0)
		return (draid_verify(argc - 1, argv + 1));

	if (strcmp(subcommand, "dump") == 0)
		return (draid_dump(argc - 1, argv + 1));

	if (strcmp(subcommand, "table") == 0)
		return (draid_table(argc - 1, argv + 1));

	if (strcmp(subcommand, "merge") == 0)
		return (draid_merge(argc - 1, argv + 1));

	/* Unknown subcommand: never fall off the end with a status of 0. */
	draid_usage();
	return (1);
}