]> git.proxmox.com Git - mirror_zfs.git/blob - tests/zfs-tests/cmd/draid/draid.c
Remove bcopy(), bzero(), bcmp()
[mirror_zfs.git] / tests / zfs-tests / cmd / draid / draid.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2018 Intel Corporation.
23 * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
24 */
25
26 #include <stdio.h>
27 #include <zlib.h>
28 #include <zfs_fletcher.h>
29 #include <sys/vdev_draid.h>
30 #include <sys/nvpair.h>
31 #include <sys/stat.h>
32
33 /*
34 * The number of rows to generate for new permutation maps.
35 */
36 #define MAP_ROWS_DEFAULT 256
37
38 /*
39 * Key values for dRAID maps when stored as nvlists.
40 */
41 #define MAP_SEED "seed"
42 #define MAP_CHECKSUM "checksum"
43 #define MAP_WORST_RATIO "worst_ratio"
44 #define MAP_AVG_RATIO "avg_ratio"
45 #define MAP_CHILDREN "children"
46 #define MAP_NPERMS "nperms"
47 #define MAP_PERMS "perms"
48
/*
 * Print the list of supported sub-commands to stderr and terminate the
 * process with exit status 1.  This function does not return.
 */
static void
draid_usage(void)
{
	static const char usage_text[] =
	    "usage: draid command args ...\n"
	    "Available commands are:\n"
	    "\n"
	    "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n"
	    "\tdraid verify [-rv] FILE\n"
	    "\tdraid dump [-v] [-m min] [-n max] FILE\n"
	    "\tdraid table FILE\n"
	    "\tdraid merge FILE SRC SRC...\n";

	(void) fputs(usage_text, stderr);
	exit(1);
}
63
64 static int
65 read_map(const char *filename, nvlist_t **allcfgs)
66 {
67 int block_size = 131072;
68 int buf_size = 131072;
69 int tmp_size, error;
70 char *tmp_buf;
71
72 struct stat64 stat;
73 if (lstat64(filename, &stat) != 0)
74 return (errno);
75
76 if (stat.st_size == 0 ||
77 !(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) {
78 return (EINVAL);
79 }
80
81 gzFile fp = gzopen(filename, "rb");
82 if (fp == Z_NULL)
83 return (errno);
84
85 char *buf = malloc(buf_size);
86 if (buf == NULL) {
87 (void) gzclose(fp);
88 return (ENOMEM);
89 }
90
91 ssize_t rc, bytes = 0;
92 while (!gzeof(fp)) {
93 rc = gzread(fp, buf + bytes, block_size);
94 if ((rc < 0) || (rc == 0 && !gzeof(fp))) {
95 free(buf);
96 (void) gzclose(fp);
97 (void) gzerror(fp, &error);
98 return (error);
99 } else {
100 bytes += rc;
101
102 if (bytes + block_size >= buf_size) {
103 tmp_size = 2 * buf_size;
104 tmp_buf = malloc(tmp_size);
105 if (tmp_buf == NULL) {
106 free(buf);
107 (void) gzclose(fp);
108 return (ENOMEM);
109 }
110
111 memcpy(tmp_buf, buf, bytes);
112 free(buf);
113 buf = tmp_buf;
114 buf_size = tmp_size;
115 }
116 }
117 }
118
119 (void) gzclose(fp);
120
121 error = nvlist_unpack(buf, bytes, allcfgs, 0);
122 free(buf);
123
124 return (error);
125 }
126
127 /*
128 * Read a map from the specified filename. A file contains multiple maps
129 * which are indexed by the number of children. The caller is responsible
130 * for freeing the configuration returned.
131 */
132 static int
133 read_map_key(const char *filename, char *key, nvlist_t **cfg)
134 {
135 nvlist_t *allcfgs, *foundcfg = NULL;
136 int error;
137
138 error = read_map(filename, &allcfgs);
139 if (error != 0)
140 return (error);
141
142 nvlist_lookup_nvlist(allcfgs, key, &foundcfg);
143 if (foundcfg != NULL) {
144 nvlist_dup(foundcfg, cfg, KM_SLEEP);
145 error = 0;
146 } else {
147 error = ENOENT;
148 }
149
150 nvlist_free(allcfgs);
151
152 return (error);
153 }
154
155 /*
156 * Write all mappings to the map file.
157 */
158 static int
159 write_map(const char *filename, nvlist_t *allcfgs)
160 {
161 size_t buflen = 0;
162 int error;
163
164 error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR);
165 if (error)
166 return (error);
167
168 char *buf = malloc(buflen);
169 if (buf == NULL)
170 return (ENOMEM);
171
172 error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
173 if (error) {
174 free(buf);
175 return (error);
176 }
177
178 /*
179 * Atomically update the file using a temporary file and the
180 * traditional unlink then rename steps. This code provides
181 * no locking, it only guarantees the packed nvlist on disk
182 * is updated atomically and is internally consistent.
183 */
184 char *tmpname = calloc(1, MAXPATHLEN);
185 if (tmpname == NULL) {
186 free(buf);
187 return (ENOMEM);
188 }
189
190 snprintf(tmpname, MAXPATHLEN - 1, "%s.XXXXXX", filename);
191
192 int fd = mkstemp(tmpname);
193 if (fd < 0) {
194 error = errno;
195 free(buf);
196 free(tmpname);
197 return (error);
198 }
199 (void) close(fd);
200
201 gzFile fp = gzopen(tmpname, "w9b");
202 if (fp == Z_NULL) {
203 error = errno;
204 free(buf);
205 free(tmpname);
206 return (errno);
207 }
208
209 ssize_t rc, bytes = 0;
210 while (bytes < buflen) {
211 size_t size = MIN(buflen - bytes, 131072);
212 rc = gzwrite(fp, buf + bytes, size);
213 if (rc < 0) {
214 free(buf);
215 (void) gzerror(fp, &error);
216 (void) gzclose(fp);
217 (void) unlink(tmpname);
218 free(tmpname);
219 return (error);
220 } else if (rc == 0) {
221 break;
222 } else {
223 bytes += rc;
224 }
225 }
226
227 free(buf);
228 (void) gzclose(fp);
229
230 if (bytes != buflen) {
231 (void) unlink(tmpname);
232 free(tmpname);
233 return (EIO);
234 }
235
236 /*
237 * Unlink the previous config file and replace it with the updated
238 * version. If we're able to unlink the file then directory is
239 * writable by us and the subsequent rename should never fail.
240 */
241 error = unlink(filename);
242 if (error != 0 && errno != ENOENT) {
243 error = errno;
244 (void) unlink(tmpname);
245 free(tmpname);
246 return (error);
247 }
248
249 error = rename(tmpname, filename);
250 if (error != 0) {
251 error = errno;
252 (void) unlink(tmpname);
253 free(tmpname);
254 return (error);
255 }
256
257 free(tmpname);
258
259 return (0);
260 }
261
262 /*
263 * Add the dRAID map to the file and write it out.
264 */
265 static int
266 write_map_key(const char *filename, char *key, draid_map_t *map,
267 double worst_ratio, double avg_ratio)
268 {
269 nvlist_t *nv_cfg, *allcfgs;
270 int error;
271
272 /*
273 * Add the configuration to an existing or new file. The new
274 * configuration will replace an existing configuration with the
275 * same key if it has a lower ratio and is therefore better.
276 */
277 error = read_map(filename, &allcfgs);
278 if (error == ENOENT) {
279 allcfgs = fnvlist_alloc();
280 } else if (error != 0) {
281 return (error);
282 }
283
284 error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg);
285 if (error == 0) {
286 uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg,
287 MAP_WORST_RATIO);
288 double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0;
289
290 if (worst_ratio < nv_worst_ratio) {
291 /* Replace old map with the more balanced new map. */
292 fnvlist_remove(allcfgs, key);
293 } else {
294 /* The old map is preferable, keep it. */
295 nvlist_free(allcfgs);
296 return (EEXIST);
297 }
298 }
299
300 nvlist_t *cfg = fnvlist_alloc();
301 fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed);
302 fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum);
303 fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children);
304 fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms);
305 fnvlist_add_uint8_array(cfg, MAP_PERMS, map->dm_perms,
306 map->dm_children * map->dm_nperms * sizeof (uint8_t));
307
308 fnvlist_add_uint64(cfg, MAP_WORST_RATIO,
309 (uint64_t)(worst_ratio * 1000.0));
310 fnvlist_add_uint64(cfg, MAP_AVG_RATIO,
311 (uint64_t)(avg_ratio * 1000.0));
312
313 error = nvlist_add_nvlist(allcfgs, key, cfg);
314 if (error == 0)
315 error = write_map(filename, allcfgs);
316
317 nvlist_free(cfg);
318 nvlist_free(allcfgs);
319 return (error);
320 }
321
322 static void
323 dump_map(draid_map_t *map, char *key, double worst_ratio, double avg_ratio,
324 int verbose)
325 {
326 if (verbose == 0) {
327 return;
328 } else if (verbose == 1) {
329 printf(" \"%s\": seed: 0x%016llx worst_ratio: %2.03f "
330 "avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed,
331 worst_ratio, avg_ratio);
332 return;
333 } else {
334 printf(" \"%s\":\n"
335 " seed: 0x%016llx\n"
336 " checksum: 0x%016llx\n"
337 " worst_ratio: %2.03f\n"
338 " avg_ratio: %2.03f\n"
339 " children: %llu\n"
340 " nperms: %llu\n",
341 key, (u_longlong_t)map->dm_seed,
342 (u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio,
343 (u_longlong_t)map->dm_children,
344 (u_longlong_t)map->dm_nperms);
345
346 if (verbose > 2) {
347 printf(" perms = {\n");
348 for (int i = 0; i < map->dm_nperms; i++) {
349 printf(" { ");
350 for (int j = 0; j < map->dm_children; j++) {
351 printf("%3d%s ", map->dm_perms[
352 i * map->dm_children + j],
353 j < map->dm_children - 1 ?
354 "," : "");
355 }
356 printf(" },\n");
357 }
358 printf(" }\n");
359 } else if (verbose == 2) {
360 printf(" draid_perms = <omitted>\n");
361 }
362 }
363 }
364
365 static void
366 dump_map_nv(char *key, nvlist_t *cfg, int verbose)
367 {
368 draid_map_t map;
369 uint_t c;
370
371 uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO);
372 uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
373
374 map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
375 map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
376 map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
377 map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
378 nvlist_lookup_uint8_array(cfg, MAP_PERMS, &map.dm_perms, &c);
379
380 dump_map(&map, key, (double)worst_ratio / 1000.0,
381 avg_ratio / 1000.0, verbose);
382 }
383
384 /*
385 * Print a summary of the mapping.
386 */
387 static int
388 dump_map_key(const char *filename, char *key, int verbose)
389 {
390 nvlist_t *cfg;
391 int error;
392
393 error = read_map_key(filename, key, &cfg);
394 if (error != 0)
395 return (error);
396
397 dump_map_nv(key, cfg, verbose);
398
399 return (0);
400 }
401
402 /*
403 * Allocate a new permutation map for evaluation.
404 */
405 static int
406 alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed,
407 draid_map_t **mapp)
408 {
409 draid_map_t *map;
410 int error;
411
412 map = malloc(sizeof (draid_map_t));
413 if (map == NULL)
414 return (ENOMEM);
415
416 map->dm_children = children;
417 map->dm_nperms = nperms;
418 map->dm_seed = seed;
419 map->dm_checksum = 0;
420
421 error = vdev_draid_generate_perms(map, &map->dm_perms);
422 if (error) {
423 free(map);
424 return (error);
425 }
426
427 *mapp = map;
428
429 return (0);
430 }
431
432 /*
433 * Allocate the fixed permutation map for N children.
434 */
435 static int
436 alloc_fixed_map(uint64_t children, draid_map_t **mapp)
437 {
438 const draid_map_t *fixed_map;
439 draid_map_t *map;
440 int error;
441
442 error = vdev_draid_lookup_map(children, &fixed_map);
443 if (error)
444 return (error);
445
446 map = malloc(sizeof (draid_map_t));
447 if (map == NULL)
448 return (ENOMEM);
449
450 memcpy(map, fixed_map, sizeof (draid_map_t));
451 VERIFY3U(map->dm_checksum, !=, 0);
452
453 error = vdev_draid_generate_perms(map, &map->dm_perms);
454 if (error) {
455 free(map);
456 return (error);
457 }
458
459 *mapp = map;
460
461 return (0);
462 }
463
464 /*
465 * Free a permutation map.
466 */
467 static void
468 free_map(draid_map_t *map)
469 {
470 free(map->dm_perms);
471 free(map);
472 }
473
474 /*
475 * Check if dev is in the provided list of faulted devices.
476 */
477 static inline boolean_t
478 is_faulted(int *faulted_devs, int nfaulted, int dev)
479 {
480 for (int i = 0; i < nfaulted; i++)
481 if (faulted_devs[i] == dev)
482 return (B_TRUE);
483
484 return (B_FALSE);
485 }
486
487 /*
488 * Evaluate how resilvering I/O will be distributed given a list of faulted
489 * vdevs. As a simplification we assume one IO is sufficient to repair each
490 * damaged device in a group.
491 */
492 static double
493 eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares,
494 int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios)
495 {
496 uint64_t children = map->dm_children;
497 uint64_t ngroups = 1;
498 uint64_t ndisks = children - nspares;
499
500 /*
501 * Calculate the minimum number of groups required to fill a slice.
502 */
503 while (ngroups * (groupwidth) % (children - nspares) != 0)
504 ngroups++;
505
506 int *ios = calloc(map->dm_children, sizeof (uint64_t));
507
508 /* Resilver all rows */
509 for (int i = 0; i < map->dm_nperms; i++) {
510 uint8_t *row = &map->dm_perms[i * map->dm_children];
511
512 /* Resilver all groups with faulted drives */
513 for (int j = 0; j < ngroups; j++) {
514 uint64_t spareidx = map->dm_children - nspares;
515 boolean_t repair_needed = B_FALSE;
516
517 /* See if any devices in this group are faulted */
518 uint64_t groupstart = (j * groupwidth) % ndisks;
519
520 for (int k = 0; k < groupwidth; k++) {
521 uint64_t groupidx = (groupstart + k) % ndisks;
522
523 repair_needed = is_faulted(faulted_devs,
524 nfaulted, row[groupidx]);
525 if (repair_needed)
526 break;
527 }
528
529 if (repair_needed == B_FALSE)
530 continue;
531
532 /*
533 * This group is degraded. Calculate the number of
534 * reads the non-faulted drives require and the number
535 * of writes to the distributed hot spare for this row.
536 */
537 for (int k = 0; k < groupwidth; k++) {
538 uint64_t groupidx = (groupstart + k) % ndisks;
539
540 if (!is_faulted(faulted_devs, nfaulted,
541 row[groupidx])) {
542 ios[row[groupidx]]++;
543 } else if (nspares > 0) {
544 while (is_faulted(faulted_devs,
545 nfaulted, row[spareidx])) {
546 spareidx++;
547 }
548
549 ASSERT3U(spareidx, <, map->dm_children);
550 ios[row[spareidx]]++;
551 spareidx++;
552 }
553 }
554 }
555 }
556
557 *min_child_ios = INT_MAX;
558 *max_child_ios = 0;
559
560 /*
561 * Find the drives with fewest and most required I/O. These values
562 * are used to calculate the imbalance ratio. To avoid returning an
563 * infinite value for permutations which have children that perform
564 * no IO a floor of 1 IO per child is set. This ensures a meaningful
565 * ratio is returned for comparison and it is not an uncommon when
566 * there are a large number of children.
567 */
568 for (int i = 0; i < map->dm_children; i++) {
569
570 if (is_faulted(faulted_devs, nfaulted, i)) {
571 ASSERT0(ios[i]);
572 continue;
573 }
574
575 if (ios[i] == 0)
576 ios[i] = 1;
577
578 if (ios[i] < *min_child_ios)
579 *min_child_ios = ios[i];
580
581 if (ios[i] > *max_child_ios)
582 *max_child_ios = ios[i];
583 }
584
585 ASSERT3S(*min_child_ios, !=, INT_MAX);
586 ASSERT3S(*max_child_ios, !=, 0);
587
588 double ratio = (double)(*max_child_ios) / (double)(*min_child_ios);
589
590 free(ios);
591
592 return (ratio);
593 }
594
595 /*
596 * Evaluate the quality of the permutation mapping by considering possible
597 * device failures. Returns the imbalance ratio for the worst mapping which
598 * is defined to be the largest number of child IOs over the fewest number
599 * child IOs. A value of 1.0 indicates the mapping is perfectly balance and
600 * all children perform an equal amount of work during reconstruction.
601 */
602 static void
603 eval_decluster(draid_map_t *map, double *worst_ratiop, double *avg_ratiop)
604 {
605 uint64_t children = map->dm_children;
606 double worst_ratio = 1.0;
607 double sum = 0;
608 int worst_min_ios = 0, worst_max_ios = 0;
609 int n = 0;
610
611 /*
612 * When there are only 2 children there can be no distributed
613 * spare and no resilver to evaluate. Default to a ratio of 1.0
614 * for this degenerate case.
615 */
616 if (children == VDEV_DRAID_MIN_CHILDREN) {
617 *worst_ratiop = 1.0;
618 *avg_ratiop = 1.0;
619 return;
620 }
621
622 /*
623 * Score the mapping as if it had either 1 or 2 distributed spares.
624 */
625 for (int nspares = 1; nspares <= 2; nspares++) {
626 uint64_t faults = nspares;
627
628 /*
629 * Score groupwidths up to 19. This value was chosen as the
630 * largest reasonable width (16d+3p). dRAID pools may be still
631 * be created with wider stripes but they are not considered in
632 * this analysis in order to optimize for the most common cases.
633 */
634 for (uint64_t groupwidth = 2;
635 groupwidth <= MIN(children - nspares, 19);
636 groupwidth++) {
637 int faulted_devs[2];
638 int min_ios, max_ios;
639
640 /*
641 * Score possible devices faults. This is limited
642 * to exactly one fault per distributed spare for
643 * the purposes of this similation.
644 */
645 for (int f1 = 0; f1 < children; f1++) {
646 faulted_devs[0] = f1;
647 double ratio;
648
649 if (faults == 1) {
650 ratio = eval_resilver(map, groupwidth,
651 nspares, faulted_devs, faults,
652 &min_ios, &max_ios);
653
654 if (ratio > worst_ratio) {
655 worst_ratio = ratio;
656 worst_min_ios = min_ios;
657 worst_max_ios = max_ios;
658 }
659
660 sum += ratio;
661 n++;
662 } else if (faults == 2) {
663 for (int f2 = f1 + 1; f2 < children;
664 f2++) {
665 faulted_devs[1] = f2;
666
667 ratio = eval_resilver(map,
668 groupwidth, nspares,
669 faulted_devs, faults,
670 &min_ios, &max_ios);
671
672 if (ratio > worst_ratio) {
673 worst_ratio = ratio;
674 worst_min_ios = min_ios;
675 worst_max_ios = max_ios;
676 }
677
678 sum += ratio;
679 n++;
680 }
681 }
682 }
683 }
684 }
685
686 *worst_ratiop = worst_ratio;
687 *avg_ratiop = sum / n;
688
689 /*
690 * Log the min/max io values for particularly unbalanced maps.
691 * Since the maps are generated entirely randomly these are possible
692 * be exceedingly unlikely. We log it for possible investigation.
693 */
694 if (worst_ratio > 100.0) {
695 dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2);
696 printf("worst_min_ios=%d worst_max_ios=%d\n",
697 worst_min_ios, worst_max_ios);
698 }
699 }
700
701 static int
702 eval_maps(uint64_t children, int passes, uint64_t *map_seed,
703 draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop)
704 {
705 draid_map_t *best_map = NULL;
706 double best_worst_ratio = 1000.0;
707 double best_avg_ratio = 1000.0;
708
709 /*
710 * Perform the requested number of passes evaluating randomly
711 * generated permutation maps. Only the best version is kept.
712 */
713 for (int i = 0; i < passes; i++) {
714 double worst_ratio, avg_ratio;
715 draid_map_t *map;
716 int error;
717
718 /*
719 * Calculate the next seed and generate a new candidate map.
720 */
721 error = alloc_new_map(children, MAP_ROWS_DEFAULT,
722 vdev_draid_rand(map_seed), &map);
723 if (error)
724 return (error);
725
726 /*
727 * Consider maps with a lower worst_ratio to be of higher
728 * quality. Some maps may have a lower avg_ratio but they
729 * are discarded since they might include some particularly
730 * imbalanced permutations. The average is tracked to in
731 * order to get a sense of the average permutation quality.
732 */
733 eval_decluster(map, &worst_ratio, &avg_ratio);
734
735 if (best_map == NULL || worst_ratio < best_worst_ratio) {
736
737 if (best_map != NULL)
738 free_map(best_map);
739
740 best_map = map;
741 best_worst_ratio = worst_ratio;
742 best_avg_ratio = avg_ratio;
743 } else {
744 free_map(map);
745 }
746 }
747
748 /*
749 * After determining the best map generate a checksum over the full
750 * permutation array. This checksum is verified when opening a dRAID
751 * pool to ensure the generated in memory permutations are correct.
752 */
753 zio_cksum_t cksum;
754 fletcher_4_native_varsize(best_map->dm_perms,
755 sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms,
756 &cksum);
757 best_map->dm_checksum = cksum.zc_word[0];
758
759 *best_mapp = best_map;
760 *best_ratiop = best_worst_ratio;
761 *avg_ratiop = best_avg_ratio;
762
763 return (0);
764 }
765
766 static int
767 draid_generate(int argc, char *argv[])
768 {
769 char filename[MAXPATHLEN] = {0};
770 uint64_t map_seed;
771 int c, fd, error, verbose = 0, passes = 1, continuous = 0;
772 int min_children = VDEV_DRAID_MIN_CHILDREN;
773 int max_children = VDEV_DRAID_MAX_CHILDREN;
774 int restarts = 0;
775
776 while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) {
777 switch (c) {
778 case 'c':
779 continuous++;
780 break;
781 case 'm':
782 min_children = (int)strtol(optarg, NULL, 0);
783 if (min_children < VDEV_DRAID_MIN_CHILDREN) {
784 (void) fprintf(stderr, "A minimum of 2 "
785 "children are required.\n");
786 return (1);
787 }
788
789 break;
790 case 'n':
791 max_children = (int)strtol(optarg, NULL, 0);
792 if (max_children > VDEV_DRAID_MAX_CHILDREN) {
793 (void) fprintf(stderr, "A maximum of %d "
794 "children are allowed.\n",
795 VDEV_DRAID_MAX_CHILDREN);
796 return (1);
797 }
798 break;
799 case 'p':
800 passes = (int)strtol(optarg, NULL, 0);
801 break;
802 case 'v':
803 /*
804 * 0 - Only log when a better map is added to the file.
805 * 1 - Log the current best map for each child count.
806 * Minimal output on a single summary line.
807 * 2 - Log the current best map for each child count.
808 * More verbose includes most map fields.
809 * 3 - Log the current best map for each child count.
810 * Very verbose all fields including the full map.
811 */
812 verbose++;
813 break;
814 case ':':
815 (void) fprintf(stderr,
816 "missing argument for '%c' option\n", optopt);
817 draid_usage();
818 break;
819 case '?':
820 (void) fprintf(stderr, "invalid option '%c'\n",
821 optopt);
822 draid_usage();
823 break;
824 }
825 }
826
827 if (argc > optind)
828 strncpy(filename, argv[optind], MAXPATHLEN - 1);
829 else {
830 (void) fprintf(stderr, "A FILE must be specified.\n");
831 return (1);
832 }
833
834 restart:
835 /*
836 * Start with a fresh seed from /dev/urandom.
837 */
838 fd = open("/dev/urandom", O_RDONLY);
839 if (fd < 0) {
840 printf("Unable to open /dev/urandom: %s\n:", strerror(errno));
841 return (1);
842 } else {
843 ssize_t bytes = sizeof (map_seed);
844 ssize_t bytes_read = 0;
845
846 while (bytes_read < bytes) {
847 ssize_t rc = read(fd, ((char *)&map_seed) + bytes_read,
848 bytes - bytes_read);
849 if (rc < 0) {
850 printf("Unable to read /dev/urandom: %s\n:",
851 strerror(errno));
852 return (1);
853 }
854 bytes_read += rc;
855 }
856
857 (void) close(fd);
858 }
859
860 if (restarts == 0)
861 printf("Writing generated mappings to '%s':\n", filename);
862
863 /*
864 * Generate maps for all requested child counts. The best map for
865 * each child count is written out to the specified file. If the file
866 * already contains a better mapping this map will not be added.
867 */
868 for (uint64_t children = min_children;
869 children <= max_children; children++) {
870 char key[8] = { 0 };
871 draid_map_t *map;
872 double worst_ratio = 1000.0;
873 double avg_ratio = 1000.0;
874
875 error = eval_maps(children, passes, &map_seed, &map,
876 &worst_ratio, &avg_ratio);
877 if (error) {
878 printf("Error eval_maps(): %s\n", strerror(error));
879 return (1);
880 }
881
882 if (worst_ratio < 1.0 || avg_ratio < 1.0) {
883 printf("Error ratio < 1.0: worst_ratio = %2.03f "
884 "avg_ratio = %2.03f\n", worst_ratio, avg_ratio);
885 return (1);
886 }
887
888 snprintf(key, 7, "%llu", (u_longlong_t)children);
889 error = write_map_key(filename, key, map, worst_ratio,
890 avg_ratio);
891 if (error == 0) {
892 /* The new map was added to the file. */
893 dump_map(map, key, worst_ratio, avg_ratio,
894 MAX(verbose, 1));
895 } else if (error == EEXIST) {
896 /* The existing map was preferable and kept. */
897 if (verbose > 0)
898 dump_map_key(filename, key, verbose);
899 } else {
900 printf("Error write_map_key(): %s\n", strerror(error));
901 return (1);
902 }
903
904 free_map(map);
905 }
906
907 /*
908 * When the continuous option is set restart at the minimum number of
909 * children instead of exiting. This option is useful as a mechanism
910 * to continuous try and refine the discovered permutations.
911 */
912 if (continuous) {
913 restarts++;
914 printf("Restarting by request (-c): %d\n", restarts);
915 goto restart;
916 }
917
918 return (0);
919 }
920
921 /*
922 * Verify each map in the file by generating its in-memory permutation array
923 * and comfirming its checksum is correct.
924 */
925 static int
926 draid_verify(int argc, char *argv[])
927 {
928 char filename[MAXPATHLEN] = {0};
929 int n = 0, c, error, verbose = 1;
930 int check_ratios = 0;
931
932 while ((c = getopt(argc, argv, ":rv")) != -1) {
933 switch (c) {
934 case 'r':
935 check_ratios++;
936 break;
937 case 'v':
938 verbose++;
939 break;
940 case ':':
941 (void) fprintf(stderr,
942 "missing argument for '%c' option\n", optopt);
943 draid_usage();
944 break;
945 case '?':
946 (void) fprintf(stderr, "invalid option '%c'\n",
947 optopt);
948 draid_usage();
949 break;
950 }
951 }
952
953 if (argc > optind) {
954 char *abspath = malloc(MAXPATHLEN);
955 if (abspath == NULL)
956 return (ENOMEM);
957
958 if (realpath(argv[optind], abspath) != NULL)
959 strncpy(filename, abspath, MAXPATHLEN - 1);
960 else
961 strncpy(filename, argv[optind], MAXPATHLEN - 1);
962
963 free(abspath);
964 } else {
965 (void) fprintf(stderr, "A FILE must be specified.\n");
966 return (1);
967 }
968
969 printf("Verifying permutation maps: '%s'\n", filename);
970
971 /*
972 * Lookup hardcoded permutation map for each valid number of children
973 * and verify a generated map has the correct checksum. Then compare
974 * the generated map values with the nvlist map values read from the
975 * reference file to cross-check the permutation.
976 */
977 for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
978 children <= VDEV_DRAID_MAX_CHILDREN;
979 children++) {
980 draid_map_t *map;
981 char key[8] = {0};
982
983 snprintf(key, 8, "%llu", (u_longlong_t)children);
984
985 error = alloc_fixed_map(children, &map);
986 if (error) {
987 printf("Error alloc_fixed_map() failed: %s\n",
988 error == ECKSUM ? "Invalid checksum" :
989 strerror(error));
990 return (1);
991 }
992
993 uint64_t nv_seed, nv_checksum, nv_children, nv_nperms;
994 uint8_t *nv_perms;
995 nvlist_t *cfg;
996 uint_t c;
997
998 error = read_map_key(filename, key, &cfg);
999 if (error != 0) {
1000 printf("Error read_map_key() failed: %s\n",
1001 strerror(error));
1002 free_map(map);
1003 return (1);
1004 }
1005
1006 nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
1007 nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
1008 nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
1009 nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
1010 nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms, &c);
1011
1012 /*
1013 * Compare draid_map_t and nvlist reference values.
1014 */
1015 if (map->dm_seed != nv_seed) {
1016 printf("Error different seeds: 0x%016llx != "
1017 "0x%016llx\n", (u_longlong_t)map->dm_seed,
1018 (u_longlong_t)nv_seed);
1019 error = EINVAL;
1020 }
1021
1022 if (map->dm_checksum != nv_checksum) {
1023 printf("Error different checksums: 0x%016llx "
1024 "!= 0x%016llx\n",
1025 (u_longlong_t)map->dm_checksum,
1026 (u_longlong_t)nv_checksum);
1027 error = EINVAL;
1028 }
1029
1030 if (map->dm_children != nv_children) {
1031 printf("Error different children: %llu "
1032 "!= %llu\n", (u_longlong_t)map->dm_children,
1033 (u_longlong_t)nv_children);
1034 error = EINVAL;
1035 }
1036
1037 if (map->dm_nperms != nv_nperms) {
1038 printf("Error different nperms: %llu "
1039 "!= %llu\n", (u_longlong_t)map->dm_nperms,
1040 (u_longlong_t)nv_nperms);
1041 error = EINVAL;
1042 }
1043
1044 for (uint64_t i = 0; i < nv_children * nv_nperms; i++) {
1045 if (map->dm_perms[i] != nv_perms[i]) {
1046 printf("Error different perms[%llu]: "
1047 "%d != %d\n", (u_longlong_t)i,
1048 (int)map->dm_perms[i],
1049 (int)nv_perms[i]);
1050 error = EINVAL;
1051 break;
1052 }
1053 }
1054
1055 /*
1056 * For good measure recalculate the worst and average
1057 * ratios and confirm they match the nvlist values.
1058 */
1059 if (check_ratios) {
1060 uint64_t nv_worst_ratio, nv_avg_ratio;
1061 double worst_ratio, avg_ratio;
1062
1063 eval_decluster(map, &worst_ratio, &avg_ratio);
1064
1065 nv_worst_ratio = fnvlist_lookup_uint64(cfg,
1066 MAP_WORST_RATIO);
1067 nv_avg_ratio = fnvlist_lookup_uint64(cfg,
1068 MAP_AVG_RATIO);
1069
1070 if (worst_ratio < 1.0 || avg_ratio < 1.0) {
1071 printf("Error ratio out of range %2.03f, "
1072 "%2.03f\n", worst_ratio, avg_ratio);
1073 error = EINVAL;
1074 }
1075
1076 if ((uint64_t)(worst_ratio * 1000.0) !=
1077 nv_worst_ratio) {
1078 printf("Error different worst_ratio %2.03f "
1079 "!= %2.03f\n", (double)nv_worst_ratio /
1080 1000.0, worst_ratio);
1081 error = EINVAL;
1082 }
1083
1084 if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) {
1085 printf("Error different average_ratio %2.03f "
1086 "!= %2.03f\n", (double)nv_avg_ratio /
1087 1000.0, avg_ratio);
1088 error = EINVAL;
1089 }
1090 }
1091
1092 if (error) {
1093 free_map(map);
1094 nvlist_free(cfg);
1095 return (1);
1096 }
1097
1098 if (verbose > 0) {
1099 printf("- %llu children: good\n",
1100 (u_longlong_t)children);
1101 }
1102 n++;
1103
1104 free_map(map);
1105 nvlist_free(cfg);
1106 }
1107
1108 if (n != (VDEV_DRAID_MAX_CHILDREN - 1)) {
1109 printf("Error permutation maps missing: %d / %d checked\n",
1110 n, VDEV_DRAID_MAX_CHILDREN - 1);
1111 return (1);
1112 }
1113
1114 printf("Successfully verified %d / %d permutation maps\n",
1115 n, VDEV_DRAID_MAX_CHILDREN - 1);
1116
1117 return (0);
1118 }
1119
1120 /*
1121 * Dump the contents of the specified mapping(s) for inspection.
1122 */
1123 static int
1124 draid_dump(int argc, char *argv[])
1125 {
1126 char filename[MAXPATHLEN] = {0};
1127 int c, error, verbose = 1;
1128 int min_children = VDEV_DRAID_MIN_CHILDREN;
1129 int max_children = VDEV_DRAID_MAX_CHILDREN;
1130
1131 while ((c = getopt(argc, argv, ":vm:n:")) != -1) {
1132 switch (c) {
1133 case 'm':
1134 min_children = (int)strtol(optarg, NULL, 0);
1135 if (min_children < 2) {
1136 (void) fprintf(stderr, "A minimum of 2 "
1137 "children are required.\n");
1138 return (1);
1139 }
1140
1141 break;
1142 case 'n':
1143 max_children = (int)strtol(optarg, NULL, 0);
1144 if (max_children > VDEV_DRAID_MAX_CHILDREN) {
1145 (void) fprintf(stderr, "A maximum of %d "
1146 "children are allowed.\n",
1147 VDEV_DRAID_MAX_CHILDREN);
1148 return (1);
1149 }
1150 break;
1151 case 'v':
1152 verbose++;
1153 break;
1154 case ':':
1155 (void) fprintf(stderr,
1156 "missing argument for '%c' option\n", optopt);
1157 draid_usage();
1158 break;
1159 case '?':
1160 (void) fprintf(stderr, "invalid option '%c'\n",
1161 optopt);
1162 draid_usage();
1163 break;
1164 }
1165 }
1166
1167 if (argc > optind)
1168 strncpy(filename, argv[optind], MAXPATHLEN - 1);
1169 else {
1170 (void) fprintf(stderr, "A FILE must be specified.\n");
1171 return (1);
1172 }
1173
1174 /*
1175 * Dump maps for the requested child counts.
1176 */
1177 for (uint64_t children = min_children;
1178 children <= max_children; children++) {
1179 char key[8] = { 0 };
1180
1181 snprintf(key, 7, "%llu", (u_longlong_t)children);
1182 error = dump_map_key(filename, key, verbose);
1183 if (error) {
1184 printf("Error dump_map_key(): %s\n", strerror(error));
1185 return (1);
1186 }
1187 }
1188
1189 return (0);
1190 }
1191
1192 /*
1193 * Print all of the mappings as a C formatted draid_map_t array. This table
1194 * is found in the module/zcommon/zfs_draid.c file and is the definitive
1195 * source for all mapping used by dRAID. It cannot be updated without
1196 * changing the dRAID on disk format.
1197 */
1198 static int
1199 draid_table(int argc, char *argv[])
1200 {
1201 char filename[MAXPATHLEN] = {0};
1202 int error;
1203
1204 if (argc > optind)
1205 strncpy(filename, argv[optind], MAXPATHLEN - 1);
1206 else {
1207 (void) fprintf(stderr, "A FILE must be specified.\n");
1208 return (1);
1209 }
1210
1211 printf("static const draid_map_t "
1212 "draid_maps[VDEV_DRAID_MAX_MAPS] = {\n");
1213
1214 for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
1215 children <= VDEV_DRAID_MAX_CHILDREN;
1216 children++) {
1217 uint64_t seed, checksum, nperms, avg_ratio;
1218 nvlist_t *cfg;
1219 char key[8] = {0};
1220
1221 snprintf(key, 8, "%llu", (u_longlong_t)children);
1222
1223 error = read_map_key(filename, key, &cfg);
1224 if (error != 0) {
1225 printf("Error read_map_key() failed: %s\n",
1226 strerror(error));
1227 return (1);
1228 }
1229
1230 seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
1231 checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
1232 children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
1233 nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
1234 avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
1235
1236 printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t"
1237 "/* %2.03f */\n", (u_longlong_t)children,
1238 (u_longlong_t)nperms, (u_longlong_t)seed,
1239 (u_longlong_t)checksum, (double)avg_ratio / 1000.0);
1240
1241 nvlist_free(cfg);
1242 }
1243
1244 printf("};\n");
1245
1246 return (0);
1247 }
1248
1249 static int
1250 draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp)
1251 {
1252 nvlist_t *srccfgs;
1253 nvpair_t *elem = NULL;
1254 int error, merged = 0;
1255
1256 error = read_map(srcfilename, &srccfgs);
1257 if (error != 0)
1258 return (error);
1259
1260 while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) {
1261 uint64_t nv_worst_ratio;
1262 uint64_t allcfg_worst_ratio;
1263 nvlist_t *cfg, *allcfg;
1264 char *key;
1265
1266 switch (nvpair_type(elem)) {
1267 case DATA_TYPE_NVLIST:
1268
1269 (void) nvpair_value_nvlist(elem, &cfg);
1270 key = nvpair_name(elem);
1271
1272 nv_worst_ratio = fnvlist_lookup_uint64(cfg,
1273 MAP_WORST_RATIO);
1274
1275 error = nvlist_lookup_nvlist(allcfgs, key, &allcfg);
1276 if (error == 0) {
1277 allcfg_worst_ratio = fnvlist_lookup_uint64(
1278 allcfg, MAP_WORST_RATIO);
1279
1280 if (nv_worst_ratio < allcfg_worst_ratio) {
1281 fnvlist_remove(allcfgs, key);
1282 error = nvlist_add_nvlist(allcfgs,
1283 key, cfg);
1284 merged++;
1285 }
1286 } else if (error == ENOENT) {
1287 error = nvlist_add_nvlist(allcfgs, key, cfg);
1288 merged++;
1289 } else {
1290 return (error);
1291 }
1292
1293 break;
1294 default:
1295 continue;
1296 }
1297 }
1298
1299 nvlist_free(srccfgs);
1300
1301 *mergedp = merged;
1302
1303 return (0);
1304 }
1305
1306 /*
1307 * Merge the best map for each child count found in the listed files into
1308 * a new file. This allows 'draid generate' to be run in parallel and for
1309 * the results maps to be combined.
1310 */
1311 static int
1312 draid_merge(int argc, char *argv[])
1313 {
1314 char filename[MAXPATHLEN] = {0};
1315 int c, error, total_merged = 0, verbose = 0;
1316 nvlist_t *allcfgs;
1317
1318 while ((c = getopt(argc, argv, ":v")) != -1) {
1319 switch (c) {
1320 case 'v':
1321 verbose++;
1322 break;
1323 case ':':
1324 (void) fprintf(stderr,
1325 "missing argument for '%c' option\n", optopt);
1326 draid_usage();
1327 break;
1328 case '?':
1329 (void) fprintf(stderr, "invalid option '%c'\n",
1330 optopt);
1331 draid_usage();
1332 break;
1333 }
1334 }
1335
1336 if (argc < 4) {
1337 (void) fprintf(stderr,
1338 "A FILE and multiple SRCs must be specified.\n");
1339 return (1);
1340 }
1341
1342 strncpy(filename, argv[optind], MAXPATHLEN - 1);
1343 optind++;
1344
1345 error = read_map(filename, &allcfgs);
1346 if (error == ENOENT) {
1347 allcfgs = fnvlist_alloc();
1348 } else if (error != 0) {
1349 printf("Error read_map(): %s\n", strerror(error));
1350 return (error);
1351 }
1352
1353 while (optind < argc) {
1354 char srcfilename[MAXPATHLEN] = {0};
1355 int merged = 0;
1356
1357 strncpy(srcfilename, argv[optind], MAXPATHLEN - 1);
1358
1359 error = draid_merge_impl(allcfgs, srcfilename, &merged);
1360 if (error) {
1361 printf("Error draid_merge_impl(): %s\n",
1362 strerror(error));
1363 nvlist_free(allcfgs);
1364 return (1);
1365 }
1366
1367 total_merged += merged;
1368 printf("Merged %d key(s) from '%s' into '%s'\n", merged,
1369 srcfilename, filename);
1370
1371 optind++;
1372 }
1373
1374 if (total_merged > 0)
1375 write_map(filename, allcfgs);
1376
1377 printf("Merged a total of %d key(s) into '%s'\n", total_merged,
1378 filename);
1379
1380 nvlist_free(allcfgs);
1381
1382 return (0);
1383 }
1384
/*
 * Entry point: dispatch to the handler for the subcommand named by argv[1].
 *
 * Each handler receives argc - 1 and argv + 1, so that its own argv[0] is
 * the subcommand name.  The handler's return value becomes the exit status.
 * A missing or unknown subcommand prints the usage message and fails.
 */
int
main(int argc, char *argv[])
{
	/*
	 * The explicit returns after draid_usage() keep argv[1] from being
	 * dereferenced when it is absent and keep main() from falling off
	 * its end, regardless of whether draid_usage() itself returns.
	 */
	if (argc < 2) {
		draid_usage();
		return (1);
	}

	const char *subcommand = argv[1];

	if (strcmp(subcommand, "generate") == 0)
		return (draid_generate(argc - 1, argv + 1));

	if (strcmp(subcommand, "verify") == 0)
		return (draid_verify(argc - 1, argv + 1));

	if (strcmp(subcommand, "dump") == 0)
		return (draid_dump(argc - 1, argv + 1));

	if (strcmp(subcommand, "table") == 0)
		return (draid_table(argc - 1, argv + 1));

	if (strcmp(subcommand, "merge") == 0)
		return (draid_merge(argc - 1, argv + 1));

	draid_usage();
	return (1);
}