]>
Commit | Line | Data |
---|---|---|
b2255edc BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
b2255edc BB |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright (c) 2018 Intel Corporation. | |
23 | * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. | |
24 | */ | |
25 | ||
26 | #include <stdio.h> | |
27 | #include <zlib.h> | |
28 | #include <zfs_fletcher.h> | |
29 | #include <sys/vdev_draid.h> | |
30 | #include <sys/nvpair.h> | |
31 | #include <sys/stat.h> | |
32 | ||
33 | /* | |
34 | * The number of rows to generate for new permutation maps. | |
35 | */ | |
36 | #define MAP_ROWS_DEFAULT 256 | |
37 | ||
38 | /* | |
39 | * Key values for dRAID maps when stored as nvlists. | |
40 | */ | |
41 | #define MAP_SEED "seed" | |
42 | #define MAP_CHECKSUM "checksum" | |
43 | #define MAP_WORST_RATIO "worst_ratio" | |
44 | #define MAP_AVG_RATIO "avg_ratio" | |
45 | #define MAP_CHILDREN "children" | |
46 | #define MAP_NPERMS "nperms" | |
47 | #define MAP_PERMS "perms" | |
48 | ||
49 | static void | |
50 | draid_usage(void) | |
51 | { | |
52 | (void) fprintf(stderr, | |
53 | "usage: draid command args ...\n" | |
54 | "Available commands are:\n" | |
55 | "\n" | |
56 | "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n" | |
57 | "\tdraid verify [-rv] FILE\n" | |
58 | "\tdraid dump [-v] [-m min] [-n max] FILE\n" | |
59 | "\tdraid table FILE\n" | |
60 | "\tdraid merge FILE SRC SRC...\n"); | |
61 | exit(1); | |
62 | } | |
63 | ||
64 | static int | |
65 | read_map(const char *filename, nvlist_t **allcfgs) | |
66 | { | |
67 | int block_size = 131072; | |
68 | int buf_size = 131072; | |
69 | int tmp_size, error; | |
70 | char *tmp_buf; | |
71 | ||
72 | struct stat64 stat; | |
73 | if (lstat64(filename, &stat) != 0) | |
74 | return (errno); | |
75 | ||
76 | if (stat.st_size == 0 || | |
77 | !(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) { | |
78 | return (EINVAL); | |
79 | } | |
80 | ||
81 | gzFile fp = gzopen(filename, "rb"); | |
82 | if (fp == Z_NULL) | |
83 | return (errno); | |
84 | ||
85 | char *buf = malloc(buf_size); | |
86 | if (buf == NULL) { | |
87 | (void) gzclose(fp); | |
88 | return (ENOMEM); | |
89 | } | |
90 | ||
91 | ssize_t rc, bytes = 0; | |
92 | while (!gzeof(fp)) { | |
93 | rc = gzread(fp, buf + bytes, block_size); | |
94 | if ((rc < 0) || (rc == 0 && !gzeof(fp))) { | |
95 | free(buf); | |
b2255edc | 96 | (void) gzerror(fp, &error); |
570ca444 | 97 | (void) gzclose(fp); |
b2255edc BB |
98 | return (error); |
99 | } else { | |
100 | bytes += rc; | |
101 | ||
102 | if (bytes + block_size >= buf_size) { | |
103 | tmp_size = 2 * buf_size; | |
104 | tmp_buf = malloc(tmp_size); | |
105 | if (tmp_buf == NULL) { | |
106 | free(buf); | |
107 | (void) gzclose(fp); | |
108 | return (ENOMEM); | |
109 | } | |
110 | ||
111 | memcpy(tmp_buf, buf, bytes); | |
112 | free(buf); | |
113 | buf = tmp_buf; | |
114 | buf_size = tmp_size; | |
115 | } | |
116 | } | |
117 | } | |
118 | ||
119 | (void) gzclose(fp); | |
120 | ||
121 | error = nvlist_unpack(buf, bytes, allcfgs, 0); | |
122 | free(buf); | |
123 | ||
124 | return (error); | |
125 | } | |
126 | ||
127 | /* | |
128 | * Read a map from the specified filename. A file contains multiple maps | |
129 | * which are indexed by the number of children. The caller is responsible | |
130 | * for freeing the configuration returned. | |
131 | */ | |
132 | static int | |
a926aab9 | 133 | read_map_key(const char *filename, const char *key, nvlist_t **cfg) |
b2255edc BB |
134 | { |
135 | nvlist_t *allcfgs, *foundcfg = NULL; | |
136 | int error; | |
137 | ||
138 | error = read_map(filename, &allcfgs); | |
139 | if (error != 0) | |
140 | return (error); | |
141 | ||
2a493a4c | 142 | (void) nvlist_lookup_nvlist(allcfgs, key, &foundcfg); |
b2255edc BB |
143 | if (foundcfg != NULL) { |
144 | nvlist_dup(foundcfg, cfg, KM_SLEEP); | |
145 | error = 0; | |
146 | } else { | |
147 | error = ENOENT; | |
148 | } | |
149 | ||
150 | nvlist_free(allcfgs); | |
151 | ||
152 | return (error); | |
153 | } | |
154 | ||
155 | /* | |
156 | * Write all mappings to the map file. | |
157 | */ | |
158 | static int | |
159 | write_map(const char *filename, nvlist_t *allcfgs) | |
160 | { | |
161 | size_t buflen = 0; | |
162 | int error; | |
163 | ||
164 | error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR); | |
165 | if (error) | |
166 | return (error); | |
167 | ||
168 | char *buf = malloc(buflen); | |
169 | if (buf == NULL) | |
170 | return (ENOMEM); | |
171 | ||
172 | error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); | |
173 | if (error) { | |
174 | free(buf); | |
175 | return (error); | |
176 | } | |
177 | ||
178 | /* | |
179 | * Atomically update the file using a temporary file and the | |
180 | * traditional unlink then rename steps. This code provides | |
181 | * no locking, it only guarantees the packed nvlist on disk | |
182 | * is updated atomically and is internally consistent. | |
183 | */ | |
955bf4dc | 184 | char *tmpname = calloc(1, MAXPATHLEN); |
b2255edc BB |
185 | if (tmpname == NULL) { |
186 | free(buf); | |
187 | return (ENOMEM); | |
188 | } | |
189 | ||
190 | snprintf(tmpname, MAXPATHLEN - 1, "%s.XXXXXX", filename); | |
191 | ||
192 | int fd = mkstemp(tmpname); | |
193 | if (fd < 0) { | |
194 | error = errno; | |
195 | free(buf); | |
196 | free(tmpname); | |
197 | return (error); | |
198 | } | |
199 | (void) close(fd); | |
200 | ||
201 | gzFile fp = gzopen(tmpname, "w9b"); | |
202 | if (fp == Z_NULL) { | |
203 | error = errno; | |
204 | free(buf); | |
205 | free(tmpname); | |
206 | return (errno); | |
207 | } | |
208 | ||
209 | ssize_t rc, bytes = 0; | |
210 | while (bytes < buflen) { | |
211 | size_t size = MIN(buflen - bytes, 131072); | |
212 | rc = gzwrite(fp, buf + bytes, size); | |
213 | if (rc < 0) { | |
214 | free(buf); | |
215 | (void) gzerror(fp, &error); | |
216 | (void) gzclose(fp); | |
217 | (void) unlink(tmpname); | |
218 | free(tmpname); | |
219 | return (error); | |
220 | } else if (rc == 0) { | |
221 | break; | |
222 | } else { | |
223 | bytes += rc; | |
224 | } | |
225 | } | |
226 | ||
227 | free(buf); | |
228 | (void) gzclose(fp); | |
229 | ||
230 | if (bytes != buflen) { | |
231 | (void) unlink(tmpname); | |
232 | free(tmpname); | |
233 | return (EIO); | |
234 | } | |
235 | ||
236 | /* | |
237 | * Unlink the previous config file and replace it with the updated | |
238 | * version. If we're able to unlink the file then directory is | |
239 | * writable by us and the subsequent rename should never fail. | |
240 | */ | |
241 | error = unlink(filename); | |
242 | if (error != 0 && errno != ENOENT) { | |
243 | error = errno; | |
244 | (void) unlink(tmpname); | |
245 | free(tmpname); | |
246 | return (error); | |
247 | } | |
248 | ||
249 | error = rename(tmpname, filename); | |
250 | if (error != 0) { | |
251 | error = errno; | |
252 | (void) unlink(tmpname); | |
253 | free(tmpname); | |
254 | return (error); | |
255 | } | |
256 | ||
257 | free(tmpname); | |
258 | ||
259 | return (0); | |
260 | } | |
261 | ||
262 | /* | |
263 | * Add the dRAID map to the file and write it out. | |
264 | */ | |
265 | static int | |
266 | write_map_key(const char *filename, char *key, draid_map_t *map, | |
267 | double worst_ratio, double avg_ratio) | |
268 | { | |
269 | nvlist_t *nv_cfg, *allcfgs; | |
270 | int error; | |
271 | ||
272 | /* | |
273 | * Add the configuration to an existing or new file. The new | |
274 | * configuration will replace an existing configuration with the | |
275 | * same key if it has a lower ratio and is therefore better. | |
276 | */ | |
277 | error = read_map(filename, &allcfgs); | |
278 | if (error == ENOENT) { | |
279 | allcfgs = fnvlist_alloc(); | |
280 | } else if (error != 0) { | |
281 | return (error); | |
282 | } | |
283 | ||
284 | error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg); | |
285 | if (error == 0) { | |
286 | uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg, | |
287 | MAP_WORST_RATIO); | |
288 | double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0; | |
289 | ||
290 | if (worst_ratio < nv_worst_ratio) { | |
291 | /* Replace old map with the more balanced new map. */ | |
292 | fnvlist_remove(allcfgs, key); | |
293 | } else { | |
294 | /* The old map is preferable, keep it. */ | |
295 | nvlist_free(allcfgs); | |
296 | return (EEXIST); | |
297 | } | |
298 | } | |
299 | ||
300 | nvlist_t *cfg = fnvlist_alloc(); | |
301 | fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed); | |
302 | fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum); | |
303 | fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children); | |
304 | fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms); | |
305 | fnvlist_add_uint8_array(cfg, MAP_PERMS, map->dm_perms, | |
306 | map->dm_children * map->dm_nperms * sizeof (uint8_t)); | |
307 | ||
308 | fnvlist_add_uint64(cfg, MAP_WORST_RATIO, | |
309 | (uint64_t)(worst_ratio * 1000.0)); | |
310 | fnvlist_add_uint64(cfg, MAP_AVG_RATIO, | |
311 | (uint64_t)(avg_ratio * 1000.0)); | |
312 | ||
313 | error = nvlist_add_nvlist(allcfgs, key, cfg); | |
314 | if (error == 0) | |
315 | error = write_map(filename, allcfgs); | |
316 | ||
317 | nvlist_free(cfg); | |
318 | nvlist_free(allcfgs); | |
319 | return (error); | |
320 | } | |
321 | ||
322 | static void | |
a926aab9 AZ |
323 | dump_map(draid_map_t *map, const char *key, double worst_ratio, |
324 | double avg_ratio, int verbose) | |
b2255edc BB |
325 | { |
326 | if (verbose == 0) { | |
327 | return; | |
328 | } else if (verbose == 1) { | |
329 | printf(" \"%s\": seed: 0x%016llx worst_ratio: %2.03f " | |
330 | "avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed, | |
331 | worst_ratio, avg_ratio); | |
332 | return; | |
333 | } else { | |
334 | printf(" \"%s\":\n" | |
335 | " seed: 0x%016llx\n" | |
336 | " checksum: 0x%016llx\n" | |
337 | " worst_ratio: %2.03f\n" | |
338 | " avg_ratio: %2.03f\n" | |
339 | " children: %llu\n" | |
340 | " nperms: %llu\n", | |
341 | key, (u_longlong_t)map->dm_seed, | |
342 | (u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio, | |
343 | (u_longlong_t)map->dm_children, | |
344 | (u_longlong_t)map->dm_nperms); | |
345 | ||
346 | if (verbose > 2) { | |
347 | printf(" perms = {\n"); | |
348 | for (int i = 0; i < map->dm_nperms; i++) { | |
349 | printf(" { "); | |
350 | for (int j = 0; j < map->dm_children; j++) { | |
351 | printf("%3d%s ", map->dm_perms[ | |
352 | i * map->dm_children + j], | |
353 | j < map->dm_children - 1 ? | |
354 | "," : ""); | |
355 | } | |
356 | printf(" },\n"); | |
357 | } | |
358 | printf(" }\n"); | |
359 | } else if (verbose == 2) { | |
360 | printf(" draid_perms = <omitted>\n"); | |
361 | } | |
362 | } | |
363 | } | |
364 | ||
365 | static void | |
a926aab9 | 366 | dump_map_nv(const char *key, nvlist_t *cfg, int verbose) |
b2255edc BB |
367 | { |
368 | draid_map_t map; | |
369 | uint_t c; | |
370 | ||
371 | uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO); | |
372 | uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); | |
373 | ||
374 | map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED); | |
375 | map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); | |
376 | map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); | |
377 | map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); | |
2a493a4c | 378 | map.dm_perms = fnvlist_lookup_uint8_array(cfg, MAP_PERMS, &c); |
b2255edc BB |
379 | |
380 | dump_map(&map, key, (double)worst_ratio / 1000.0, | |
381 | avg_ratio / 1000.0, verbose); | |
382 | } | |
383 | ||
384 | /* | |
385 | * Print a summary of the mapping. | |
386 | */ | |
387 | static int | |
a926aab9 | 388 | dump_map_key(const char *filename, const char *key, int verbose) |
b2255edc BB |
389 | { |
390 | nvlist_t *cfg; | |
391 | int error; | |
392 | ||
393 | error = read_map_key(filename, key, &cfg); | |
394 | if (error != 0) | |
395 | return (error); | |
396 | ||
397 | dump_map_nv(key, cfg, verbose); | |
398 | ||
399 | return (0); | |
400 | } | |
401 | ||
402 | /* | |
403 | * Allocate a new permutation map for evaluation. | |
404 | */ | |
405 | static int | |
406 | alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed, | |
407 | draid_map_t **mapp) | |
408 | { | |
409 | draid_map_t *map; | |
410 | int error; | |
411 | ||
412 | map = malloc(sizeof (draid_map_t)); | |
413 | if (map == NULL) | |
414 | return (ENOMEM); | |
415 | ||
416 | map->dm_children = children; | |
417 | map->dm_nperms = nperms; | |
418 | map->dm_seed = seed; | |
419 | map->dm_checksum = 0; | |
420 | ||
421 | error = vdev_draid_generate_perms(map, &map->dm_perms); | |
422 | if (error) { | |
423 | free(map); | |
424 | return (error); | |
425 | } | |
426 | ||
427 | *mapp = map; | |
428 | ||
429 | return (0); | |
430 | } | |
431 | ||
432 | /* | |
433 | * Allocate the fixed permutation map for N children. | |
434 | */ | |
435 | static int | |
436 | alloc_fixed_map(uint64_t children, draid_map_t **mapp) | |
437 | { | |
438 | const draid_map_t *fixed_map; | |
439 | draid_map_t *map; | |
440 | int error; | |
441 | ||
442 | error = vdev_draid_lookup_map(children, &fixed_map); | |
443 | if (error) | |
444 | return (error); | |
445 | ||
446 | map = malloc(sizeof (draid_map_t)); | |
447 | if (map == NULL) | |
448 | return (ENOMEM); | |
449 | ||
450 | memcpy(map, fixed_map, sizeof (draid_map_t)); | |
451 | VERIFY3U(map->dm_checksum, !=, 0); | |
452 | ||
453 | error = vdev_draid_generate_perms(map, &map->dm_perms); | |
454 | if (error) { | |
455 | free(map); | |
456 | return (error); | |
457 | } | |
458 | ||
459 | *mapp = map; | |
460 | ||
461 | return (0); | |
462 | } | |
463 | ||
464 | /* | |
465 | * Free a permutation map. | |
466 | */ | |
467 | static void | |
468 | free_map(draid_map_t *map) | |
469 | { | |
470 | free(map->dm_perms); | |
471 | free(map); | |
472 | } | |
473 | ||
474 | /* | |
475 | * Check if dev is in the provided list of faulted devices. | |
476 | */ | |
477 | static inline boolean_t | |
478 | is_faulted(int *faulted_devs, int nfaulted, int dev) | |
479 | { | |
480 | for (int i = 0; i < nfaulted; i++) | |
481 | if (faulted_devs[i] == dev) | |
482 | return (B_TRUE); | |
483 | ||
484 | return (B_FALSE); | |
485 | } | |
486 | ||
487 | /* | |
488 | * Evaluate how resilvering I/O will be distributed given a list of faulted | |
489 | * vdevs. As a simplification we assume one IO is sufficient to repair each | |
490 | * damaged device in a group. | |
491 | */ | |
492 | static double | |
493 | eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares, | |
494 | int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios) | |
495 | { | |
496 | uint64_t children = map->dm_children; | |
497 | uint64_t ngroups = 1; | |
498 | uint64_t ndisks = children - nspares; | |
499 | ||
500 | /* | |
501 | * Calculate the minimum number of groups required to fill a slice. | |
502 | */ | |
503 | while (ngroups * (groupwidth) % (children - nspares) != 0) | |
504 | ngroups++; | |
505 | ||
506 | int *ios = calloc(map->dm_children, sizeof (uint64_t)); | |
507 | ||
570ca444 RY |
508 | ASSERT3P(ios, !=, NULL); |
509 | ||
b2255edc BB |
510 | /* Resilver all rows */ |
511 | for (int i = 0; i < map->dm_nperms; i++) { | |
512 | uint8_t *row = &map->dm_perms[i * map->dm_children]; | |
513 | ||
514 | /* Resilver all groups with faulted drives */ | |
515 | for (int j = 0; j < ngroups; j++) { | |
516 | uint64_t spareidx = map->dm_children - nspares; | |
517 | boolean_t repair_needed = B_FALSE; | |
518 | ||
519 | /* See if any devices in this group are faulted */ | |
520 | uint64_t groupstart = (j * groupwidth) % ndisks; | |
521 | ||
522 | for (int k = 0; k < groupwidth; k++) { | |
523 | uint64_t groupidx = (groupstart + k) % ndisks; | |
524 | ||
525 | repair_needed = is_faulted(faulted_devs, | |
526 | nfaulted, row[groupidx]); | |
527 | if (repair_needed) | |
528 | break; | |
529 | } | |
530 | ||
531 | if (repair_needed == B_FALSE) | |
532 | continue; | |
533 | ||
534 | /* | |
535 | * This group is degraded. Calculate the number of | |
536 | * reads the non-faulted drives require and the number | |
537 | * of writes to the distributed hot spare for this row. | |
538 | */ | |
539 | for (int k = 0; k < groupwidth; k++) { | |
540 | uint64_t groupidx = (groupstart + k) % ndisks; | |
541 | ||
542 | if (!is_faulted(faulted_devs, nfaulted, | |
543 | row[groupidx])) { | |
544 | ios[row[groupidx]]++; | |
545 | } else if (nspares > 0) { | |
546 | while (is_faulted(faulted_devs, | |
547 | nfaulted, row[spareidx])) { | |
548 | spareidx++; | |
549 | } | |
550 | ||
551 | ASSERT3U(spareidx, <, map->dm_children); | |
552 | ios[row[spareidx]]++; | |
553 | spareidx++; | |
554 | } | |
555 | } | |
556 | } | |
557 | } | |
558 | ||
559 | *min_child_ios = INT_MAX; | |
560 | *max_child_ios = 0; | |
561 | ||
562 | /* | |
563 | * Find the drives with fewest and most required I/O. These values | |
564 | * are used to calculate the imbalance ratio. To avoid returning an | |
565 | * infinite value for permutations which have children that perform | |
566 | * no IO a floor of 1 IO per child is set. This ensures a meaningful | |
567 | * ratio is returned for comparison and it is not an uncommon when | |
568 | * there are a large number of children. | |
569 | */ | |
570 | for (int i = 0; i < map->dm_children; i++) { | |
571 | ||
572 | if (is_faulted(faulted_devs, nfaulted, i)) { | |
573 | ASSERT0(ios[i]); | |
574 | continue; | |
575 | } | |
576 | ||
577 | if (ios[i] == 0) | |
578 | ios[i] = 1; | |
579 | ||
580 | if (ios[i] < *min_child_ios) | |
581 | *min_child_ios = ios[i]; | |
582 | ||
583 | if (ios[i] > *max_child_ios) | |
584 | *max_child_ios = ios[i]; | |
585 | } | |
586 | ||
587 | ASSERT3S(*min_child_ios, !=, INT_MAX); | |
588 | ASSERT3S(*max_child_ios, !=, 0); | |
589 | ||
590 | double ratio = (double)(*max_child_ios) / (double)(*min_child_ios); | |
591 | ||
592 | free(ios); | |
593 | ||
594 | return (ratio); | |
595 | } | |
596 | ||
597 | /* | |
598 | * Evaluate the quality of the permutation mapping by considering possible | |
599 | * device failures. Returns the imbalance ratio for the worst mapping which | |
600 | * is defined to be the largest number of child IOs over the fewest number | |
601 | * child IOs. A value of 1.0 indicates the mapping is perfectly balance and | |
602 | * all children perform an equal amount of work during reconstruction. | |
603 | */ | |
604 | static void | |
605 | eval_decluster(draid_map_t *map, double *worst_ratiop, double *avg_ratiop) | |
606 | { | |
607 | uint64_t children = map->dm_children; | |
608 | double worst_ratio = 1.0; | |
609 | double sum = 0; | |
610 | int worst_min_ios = 0, worst_max_ios = 0; | |
611 | int n = 0; | |
612 | ||
613 | /* | |
614 | * When there are only 2 children there can be no distributed | |
615 | * spare and no resilver to evaluate. Default to a ratio of 1.0 | |
616 | * for this degenerate case. | |
617 | */ | |
618 | if (children == VDEV_DRAID_MIN_CHILDREN) { | |
619 | *worst_ratiop = 1.0; | |
620 | *avg_ratiop = 1.0; | |
621 | return; | |
622 | } | |
623 | ||
624 | /* | |
625 | * Score the mapping as if it had either 1 or 2 distributed spares. | |
626 | */ | |
627 | for (int nspares = 1; nspares <= 2; nspares++) { | |
628 | uint64_t faults = nspares; | |
629 | ||
630 | /* | |
bf169e9f | 631 | * Score groupwidths up to 19. This value was chosen as the |
b2255edc BB |
632 | * largest reasonable width (16d+3p). dRAID pools may be still |
633 | * be created with wider stripes but they are not considered in | |
634 | * this analysis in order to optimize for the most common cases. | |
635 | */ | |
636 | for (uint64_t groupwidth = 2; | |
637 | groupwidth <= MIN(children - nspares, 19); | |
638 | groupwidth++) { | |
639 | int faulted_devs[2]; | |
640 | int min_ios, max_ios; | |
641 | ||
642 | /* | |
643 | * Score possible devices faults. This is limited | |
644 | * to exactly one fault per distributed spare for | |
645 | * the purposes of this similation. | |
646 | */ | |
647 | for (int f1 = 0; f1 < children; f1++) { | |
648 | faulted_devs[0] = f1; | |
649 | double ratio; | |
650 | ||
651 | if (faults == 1) { | |
652 | ratio = eval_resilver(map, groupwidth, | |
653 | nspares, faulted_devs, faults, | |
654 | &min_ios, &max_ios); | |
655 | ||
656 | if (ratio > worst_ratio) { | |
657 | worst_ratio = ratio; | |
658 | worst_min_ios = min_ios; | |
659 | worst_max_ios = max_ios; | |
660 | } | |
661 | ||
662 | sum += ratio; | |
663 | n++; | |
664 | } else if (faults == 2) { | |
665 | for (int f2 = f1 + 1; f2 < children; | |
666 | f2++) { | |
667 | faulted_devs[1] = f2; | |
668 | ||
669 | ratio = eval_resilver(map, | |
670 | groupwidth, nspares, | |
671 | faulted_devs, faults, | |
672 | &min_ios, &max_ios); | |
673 | ||
674 | if (ratio > worst_ratio) { | |
675 | worst_ratio = ratio; | |
676 | worst_min_ios = min_ios; | |
677 | worst_max_ios = max_ios; | |
678 | } | |
679 | ||
680 | sum += ratio; | |
681 | n++; | |
682 | } | |
683 | } | |
684 | } | |
685 | } | |
686 | } | |
687 | ||
688 | *worst_ratiop = worst_ratio; | |
689 | *avg_ratiop = sum / n; | |
690 | ||
691 | /* | |
692 | * Log the min/max io values for particularly unbalanced maps. | |
693 | * Since the maps are generated entirely randomly these are possible | |
694 | * be exceedingly unlikely. We log it for possible investigation. | |
695 | */ | |
696 | if (worst_ratio > 100.0) { | |
697 | dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2); | |
698 | printf("worst_min_ios=%d worst_max_ios=%d\n", | |
699 | worst_min_ios, worst_max_ios); | |
700 | } | |
701 | } | |
702 | ||
703 | static int | |
704 | eval_maps(uint64_t children, int passes, uint64_t *map_seed, | |
705 | draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop) | |
706 | { | |
707 | draid_map_t *best_map = NULL; | |
708 | double best_worst_ratio = 1000.0; | |
709 | double best_avg_ratio = 1000.0; | |
710 | ||
711 | /* | |
712 | * Perform the requested number of passes evaluating randomly | |
713 | * generated permutation maps. Only the best version is kept. | |
714 | */ | |
715 | for (int i = 0; i < passes; i++) { | |
716 | double worst_ratio, avg_ratio; | |
717 | draid_map_t *map; | |
718 | int error; | |
719 | ||
720 | /* | |
721 | * Calculate the next seed and generate a new candidate map. | |
722 | */ | |
723 | error = alloc_new_map(children, MAP_ROWS_DEFAULT, | |
724 | vdev_draid_rand(map_seed), &map); | |
f7bda2de RY |
725 | if (error) { |
726 | if (best_map != NULL) | |
727 | free_map(best_map); | |
b2255edc | 728 | return (error); |
f7bda2de | 729 | } |
b2255edc BB |
730 | |
731 | /* | |
732 | * Consider maps with a lower worst_ratio to be of higher | |
733 | * quality. Some maps may have a lower avg_ratio but they | |
734 | * are discarded since they might include some particularly | |
bf169e9f | 735 | * imbalanced permutations. The average is tracked to in |
b2255edc BB |
736 | * order to get a sense of the average permutation quality. |
737 | */ | |
738 | eval_decluster(map, &worst_ratio, &avg_ratio); | |
739 | ||
740 | if (best_map == NULL || worst_ratio < best_worst_ratio) { | |
741 | ||
742 | if (best_map != NULL) | |
743 | free_map(best_map); | |
744 | ||
745 | best_map = map; | |
746 | best_worst_ratio = worst_ratio; | |
747 | best_avg_ratio = avg_ratio; | |
748 | } else { | |
749 | free_map(map); | |
750 | } | |
751 | } | |
752 | ||
753 | /* | |
754 | * After determining the best map generate a checksum over the full | |
755 | * permutation array. This checksum is verified when opening a dRAID | |
756 | * pool to ensure the generated in memory permutations are correct. | |
757 | */ | |
758 | zio_cksum_t cksum; | |
759 | fletcher_4_native_varsize(best_map->dm_perms, | |
760 | sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms, | |
761 | &cksum); | |
762 | best_map->dm_checksum = cksum.zc_word[0]; | |
763 | ||
764 | *best_mapp = best_map; | |
765 | *best_ratiop = best_worst_ratio; | |
766 | *avg_ratiop = best_avg_ratio; | |
767 | ||
768 | return (0); | |
769 | } | |
770 | ||
771 | static int | |
772 | draid_generate(int argc, char *argv[]) | |
773 | { | |
861166b0 | 774 | char filename[MAXPATHLEN] = {0}; |
570ca444 | 775 | uint64_t map_seed[2]; |
b2255edc BB |
776 | int c, fd, error, verbose = 0, passes = 1, continuous = 0; |
777 | int min_children = VDEV_DRAID_MIN_CHILDREN; | |
778 | int max_children = VDEV_DRAID_MAX_CHILDREN; | |
779 | int restarts = 0; | |
780 | ||
781 | while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) { | |
782 | switch (c) { | |
783 | case 'c': | |
784 | continuous++; | |
785 | break; | |
786 | case 'm': | |
787 | min_children = (int)strtol(optarg, NULL, 0); | |
788 | if (min_children < VDEV_DRAID_MIN_CHILDREN) { | |
789 | (void) fprintf(stderr, "A minimum of 2 " | |
790 | "children are required.\n"); | |
791 | return (1); | |
792 | } | |
793 | ||
794 | break; | |
795 | case 'n': | |
796 | max_children = (int)strtol(optarg, NULL, 0); | |
797 | if (max_children > VDEV_DRAID_MAX_CHILDREN) { | |
798 | (void) fprintf(stderr, "A maximum of %d " | |
799 | "children are allowed.\n", | |
800 | VDEV_DRAID_MAX_CHILDREN); | |
801 | return (1); | |
802 | } | |
803 | break; | |
804 | case 'p': | |
805 | passes = (int)strtol(optarg, NULL, 0); | |
806 | break; | |
807 | case 'v': | |
808 | /* | |
809 | * 0 - Only log when a better map is added to the file. | |
810 | * 1 - Log the current best map for each child count. | |
811 | * Minimal output on a single summary line. | |
812 | * 2 - Log the current best map for each child count. | |
813 | * More verbose includes most map fields. | |
814 | * 3 - Log the current best map for each child count. | |
815 | * Very verbose all fields including the full map. | |
816 | */ | |
817 | verbose++; | |
818 | break; | |
819 | case ':': | |
820 | (void) fprintf(stderr, | |
821 | "missing argument for '%c' option\n", optopt); | |
822 | draid_usage(); | |
823 | break; | |
824 | case '?': | |
825 | (void) fprintf(stderr, "invalid option '%c'\n", | |
826 | optopt); | |
827 | draid_usage(); | |
828 | break; | |
829 | } | |
830 | } | |
831 | ||
861166b0 | 832 | if (argc > optind) |
7584fbe8 | 833 | strlcpy(filename, argv[optind], sizeof (filename)); |
861166b0 | 834 | else { |
b2255edc BB |
835 | (void) fprintf(stderr, "A FILE must be specified.\n"); |
836 | return (1); | |
837 | } | |
838 | ||
839 | restart: | |
840 | /* | |
841 | * Start with a fresh seed from /dev/urandom. | |
842 | */ | |
843 | fd = open("/dev/urandom", O_RDONLY); | |
844 | if (fd < 0) { | |
845 | printf("Unable to open /dev/urandom: %s\n:", strerror(errno)); | |
846 | return (1); | |
847 | } else { | |
848 | ssize_t bytes = sizeof (map_seed); | |
849 | ssize_t bytes_read = 0; | |
850 | ||
851 | while (bytes_read < bytes) { | |
570ca444 | 852 | ssize_t rc = read(fd, ((char *)map_seed) + bytes_read, |
b2255edc BB |
853 | bytes - bytes_read); |
854 | if (rc < 0) { | |
855 | printf("Unable to read /dev/urandom: %s\n:", | |
856 | strerror(errno)); | |
ebe1d036 | 857 | close(fd); |
b2255edc BB |
858 | return (1); |
859 | } | |
860 | bytes_read += rc; | |
861 | } | |
862 | ||
863 | (void) close(fd); | |
864 | } | |
865 | ||
866 | if (restarts == 0) | |
867 | printf("Writing generated mappings to '%s':\n", filename); | |
868 | ||
869 | /* | |
870 | * Generate maps for all requested child counts. The best map for | |
871 | * each child count is written out to the specified file. If the file | |
872 | * already contains a better mapping this map will not be added. | |
873 | */ | |
874 | for (uint64_t children = min_children; | |
875 | children <= max_children; children++) { | |
876 | char key[8] = { 0 }; | |
877 | draid_map_t *map; | |
878 | double worst_ratio = 1000.0; | |
879 | double avg_ratio = 1000.0; | |
880 | ||
570ca444 | 881 | error = eval_maps(children, passes, map_seed, &map, |
b2255edc BB |
882 | &worst_ratio, &avg_ratio); |
883 | if (error) { | |
884 | printf("Error eval_maps(): %s\n", strerror(error)); | |
885 | return (1); | |
886 | } | |
887 | ||
888 | if (worst_ratio < 1.0 || avg_ratio < 1.0) { | |
889 | printf("Error ratio < 1.0: worst_ratio = %2.03f " | |
890 | "avg_ratio = %2.03f\n", worst_ratio, avg_ratio); | |
891 | return (1); | |
892 | } | |
893 | ||
894 | snprintf(key, 7, "%llu", (u_longlong_t)children); | |
895 | error = write_map_key(filename, key, map, worst_ratio, | |
896 | avg_ratio); | |
897 | if (error == 0) { | |
898 | /* The new map was added to the file. */ | |
899 | dump_map(map, key, worst_ratio, avg_ratio, | |
900 | MAX(verbose, 1)); | |
901 | } else if (error == EEXIST) { | |
902 | /* The existing map was preferable and kept. */ | |
903 | if (verbose > 0) | |
904 | dump_map_key(filename, key, verbose); | |
905 | } else { | |
906 | printf("Error write_map_key(): %s\n", strerror(error)); | |
907 | return (1); | |
908 | } | |
909 | ||
910 | free_map(map); | |
911 | } | |
912 | ||
913 | /* | |
914 | * When the continuous option is set restart at the minimum number of | |
915 | * children instead of exiting. This option is useful as a mechanism | |
916 | * to continuous try and refine the discovered permutations. | |
917 | */ | |
918 | if (continuous) { | |
919 | restarts++; | |
920 | printf("Restarting by request (-c): %d\n", restarts); | |
921 | goto restart; | |
922 | } | |
923 | ||
924 | return (0); | |
925 | } | |
926 | ||
927 | /* | |
928 | * Verify each map in the file by generating its in-memory permutation array | |
929 | * and comfirming its checksum is correct. | |
930 | */ | |
931 | static int | |
932 | draid_verify(int argc, char *argv[]) | |
933 | { | |
861166b0 | 934 | char filename[MAXPATHLEN] = {0}; |
b2255edc BB |
935 | int n = 0, c, error, verbose = 1; |
936 | int check_ratios = 0; | |
937 | ||
938 | while ((c = getopt(argc, argv, ":rv")) != -1) { | |
939 | switch (c) { | |
940 | case 'r': | |
941 | check_ratios++; | |
942 | break; | |
943 | case 'v': | |
944 | verbose++; | |
945 | break; | |
946 | case ':': | |
947 | (void) fprintf(stderr, | |
948 | "missing argument for '%c' option\n", optopt); | |
949 | draid_usage(); | |
950 | break; | |
951 | case '?': | |
952 | (void) fprintf(stderr, "invalid option '%c'\n", | |
953 | optopt); | |
954 | draid_usage(); | |
955 | break; | |
956 | } | |
957 | } | |
958 | ||
959 | if (argc > optind) { | |
960 | char *abspath = malloc(MAXPATHLEN); | |
961 | if (abspath == NULL) | |
962 | return (ENOMEM); | |
963 | ||
b2255edc | 964 | if (realpath(argv[optind], abspath) != NULL) |
7584fbe8 | 965 | strlcpy(filename, abspath, sizeof (filename)); |
b2255edc | 966 | else |
7584fbe8 | 967 | strlcpy(filename, argv[optind], sizeof (filename)); |
b2255edc BB |
968 | |
969 | free(abspath); | |
970 | } else { | |
971 | (void) fprintf(stderr, "A FILE must be specified.\n"); | |
972 | return (1); | |
973 | } | |
974 | ||
975 | printf("Verifying permutation maps: '%s'\n", filename); | |
976 | ||
977 | /* | |
978 | * Lookup hardcoded permutation map for each valid number of children | |
979 | * and verify a generated map has the correct checksum. Then compare | |
980 | * the generated map values with the nvlist map values read from the | |
981 | * reference file to cross-check the permutation. | |
982 | */ | |
983 | for (uint64_t children = VDEV_DRAID_MIN_CHILDREN; | |
984 | children <= VDEV_DRAID_MAX_CHILDREN; | |
985 | children++) { | |
986 | draid_map_t *map; | |
861166b0 | 987 | char key[8] = {0}; |
b2255edc | 988 | |
b2255edc BB |
989 | snprintf(key, 8, "%llu", (u_longlong_t)children); |
990 | ||
991 | error = alloc_fixed_map(children, &map); | |
992 | if (error) { | |
993 | printf("Error alloc_fixed_map() failed: %s\n", | |
994 | error == ECKSUM ? "Invalid checksum" : | |
995 | strerror(error)); | |
996 | return (1); | |
997 | } | |
998 | ||
999 | uint64_t nv_seed, nv_checksum, nv_children, nv_nperms; | |
1000 | uint8_t *nv_perms; | |
1001 | nvlist_t *cfg; | |
1002 | uint_t c; | |
1003 | ||
1004 | error = read_map_key(filename, key, &cfg); | |
1005 | if (error != 0) { | |
1006 | printf("Error read_map_key() failed: %s\n", | |
1007 | strerror(error)); | |
1008 | free_map(map); | |
1009 | return (1); | |
1010 | } | |
1011 | ||
1012 | nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED); | |
1013 | nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); | |
1014 | nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); | |
1015 | nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); | |
1016 | nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms, &c); | |
1017 | ||
1018 | /* | |
1019 | * Compare draid_map_t and nvlist reference values. | |
1020 | */ | |
1021 | if (map->dm_seed != nv_seed) { | |
1022 | printf("Error different seeds: 0x%016llx != " | |
1023 | "0x%016llx\n", (u_longlong_t)map->dm_seed, | |
1024 | (u_longlong_t)nv_seed); | |
1025 | error = EINVAL; | |
1026 | } | |
1027 | ||
1028 | if (map->dm_checksum != nv_checksum) { | |
1029 | printf("Error different checksums: 0x%016llx " | |
1030 | "!= 0x%016llx\n", | |
1031 | (u_longlong_t)map->dm_checksum, | |
1032 | (u_longlong_t)nv_checksum); | |
1033 | error = EINVAL; | |
1034 | } | |
1035 | ||
1036 | if (map->dm_children != nv_children) { | |
1037 | printf("Error different children: %llu " | |
1038 | "!= %llu\n", (u_longlong_t)map->dm_children, | |
1039 | (u_longlong_t)nv_children); | |
1040 | error = EINVAL; | |
1041 | } | |
1042 | ||
1043 | if (map->dm_nperms != nv_nperms) { | |
1044 | printf("Error different nperms: %llu " | |
1045 | "!= %llu\n", (u_longlong_t)map->dm_nperms, | |
1046 | (u_longlong_t)nv_nperms); | |
1047 | error = EINVAL; | |
1048 | } | |
1049 | ||
1050 | for (uint64_t i = 0; i < nv_children * nv_nperms; i++) { | |
1051 | if (map->dm_perms[i] != nv_perms[i]) { | |
1052 | printf("Error different perms[%llu]: " | |
1053 | "%d != %d\n", (u_longlong_t)i, | |
1054 | (int)map->dm_perms[i], | |
1055 | (int)nv_perms[i]); | |
1056 | error = EINVAL; | |
1057 | break; | |
1058 | } | |
1059 | } | |
1060 | ||
1061 | /* | |
1062 | * For good measure recalculate the worst and average | |
1063 | * ratios and confirm they match the nvlist values. | |
1064 | */ | |
1065 | if (check_ratios) { | |
1066 | uint64_t nv_worst_ratio, nv_avg_ratio; | |
1067 | double worst_ratio, avg_ratio; | |
1068 | ||
1069 | eval_decluster(map, &worst_ratio, &avg_ratio); | |
1070 | ||
1071 | nv_worst_ratio = fnvlist_lookup_uint64(cfg, | |
1072 | MAP_WORST_RATIO); | |
1073 | nv_avg_ratio = fnvlist_lookup_uint64(cfg, | |
1074 | MAP_AVG_RATIO); | |
1075 | ||
1076 | if (worst_ratio < 1.0 || avg_ratio < 1.0) { | |
1077 | printf("Error ratio out of range %2.03f, " | |
1078 | "%2.03f\n", worst_ratio, avg_ratio); | |
1079 | error = EINVAL; | |
1080 | } | |
1081 | ||
1082 | if ((uint64_t)(worst_ratio * 1000.0) != | |
1083 | nv_worst_ratio) { | |
1084 | printf("Error different worst_ratio %2.03f " | |
1085 | "!= %2.03f\n", (double)nv_worst_ratio / | |
1086 | 1000.0, worst_ratio); | |
1087 | error = EINVAL; | |
1088 | } | |
1089 | ||
1090 | if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) { | |
1091 | printf("Error different average_ratio %2.03f " | |
1092 | "!= %2.03f\n", (double)nv_avg_ratio / | |
1093 | 1000.0, avg_ratio); | |
1094 | error = EINVAL; | |
1095 | } | |
1096 | } | |
1097 | ||
1098 | if (error) { | |
1099 | free_map(map); | |
1100 | nvlist_free(cfg); | |
1101 | return (1); | |
1102 | } | |
1103 | ||
1104 | if (verbose > 0) { | |
1105 | printf("- %llu children: good\n", | |
1106 | (u_longlong_t)children); | |
1107 | } | |
1108 | n++; | |
1109 | ||
1110 | free_map(map); | |
1111 | nvlist_free(cfg); | |
1112 | } | |
1113 | ||
1114 | if (n != (VDEV_DRAID_MAX_CHILDREN - 1)) { | |
1115 | printf("Error permutation maps missing: %d / %d checked\n", | |
1116 | n, VDEV_DRAID_MAX_CHILDREN - 1); | |
1117 | return (1); | |
1118 | } | |
1119 | ||
1120 | printf("Successfully verified %d / %d permutation maps\n", | |
1121 | n, VDEV_DRAID_MAX_CHILDREN - 1); | |
1122 | ||
1123 | return (0); | |
1124 | } | |
1125 | ||
1126 | /* | |
1127 | * Dump the contents of the specified mapping(s) for inspection. | |
1128 | */ | |
1129 | static int | |
1130 | draid_dump(int argc, char *argv[]) | |
1131 | { | |
861166b0 | 1132 | char filename[MAXPATHLEN] = {0}; |
b2255edc BB |
1133 | int c, error, verbose = 1; |
1134 | int min_children = VDEV_DRAID_MIN_CHILDREN; | |
1135 | int max_children = VDEV_DRAID_MAX_CHILDREN; | |
1136 | ||
1137 | while ((c = getopt(argc, argv, ":vm:n:")) != -1) { | |
1138 | switch (c) { | |
1139 | case 'm': | |
1140 | min_children = (int)strtol(optarg, NULL, 0); | |
1141 | if (min_children < 2) { | |
1142 | (void) fprintf(stderr, "A minimum of 2 " | |
1143 | "children are required.\n"); | |
1144 | return (1); | |
1145 | } | |
1146 | ||
1147 | break; | |
1148 | case 'n': | |
1149 | max_children = (int)strtol(optarg, NULL, 0); | |
1150 | if (max_children > VDEV_DRAID_MAX_CHILDREN) { | |
1151 | (void) fprintf(stderr, "A maximum of %d " | |
1152 | "children are allowed.\n", | |
1153 | VDEV_DRAID_MAX_CHILDREN); | |
1154 | return (1); | |
1155 | } | |
1156 | break; | |
1157 | case 'v': | |
1158 | verbose++; | |
1159 | break; | |
1160 | case ':': | |
1161 | (void) fprintf(stderr, | |
1162 | "missing argument for '%c' option\n", optopt); | |
1163 | draid_usage(); | |
1164 | break; | |
1165 | case '?': | |
1166 | (void) fprintf(stderr, "invalid option '%c'\n", | |
1167 | optopt); | |
1168 | draid_usage(); | |
1169 | break; | |
1170 | } | |
1171 | } | |
1172 | ||
861166b0 | 1173 | if (argc > optind) |
7584fbe8 | 1174 | strlcpy(filename, argv[optind], sizeof (filename)); |
861166b0 | 1175 | else { |
b2255edc BB |
1176 | (void) fprintf(stderr, "A FILE must be specified.\n"); |
1177 | return (1); | |
1178 | } | |
1179 | ||
1180 | /* | |
1181 | * Dump maps for the requested child counts. | |
1182 | */ | |
1183 | for (uint64_t children = min_children; | |
1184 | children <= max_children; children++) { | |
1185 | char key[8] = { 0 }; | |
1186 | ||
1187 | snprintf(key, 7, "%llu", (u_longlong_t)children); | |
1188 | error = dump_map_key(filename, key, verbose); | |
1189 | if (error) { | |
1190 | printf("Error dump_map_key(): %s\n", strerror(error)); | |
1191 | return (1); | |
1192 | } | |
1193 | } | |
1194 | ||
1195 | return (0); | |
1196 | } | |
1197 | ||
1198 | /* | |
bf169e9f AG |
1199 | * Print all of the mappings as a C formatted draid_map_t array. This table |
1200 | * is found in the module/zcommon/zfs_draid.c file and is the definitive | |
b2255edc BB |
1201 | * source for all mapping used by dRAID. It cannot be updated without |
1202 | * changing the dRAID on disk format. | |
1203 | */ | |
1204 | static int | |
1205 | draid_table(int argc, char *argv[]) | |
1206 | { | |
861166b0 | 1207 | char filename[MAXPATHLEN] = {0}; |
b2255edc BB |
1208 | int error; |
1209 | ||
861166b0 | 1210 | if (argc > optind) |
7584fbe8 | 1211 | strlcpy(filename, argv[optind], sizeof (filename)); |
861166b0 | 1212 | else { |
b2255edc BB |
1213 | (void) fprintf(stderr, "A FILE must be specified.\n"); |
1214 | return (1); | |
1215 | } | |
1216 | ||
1217 | printf("static const draid_map_t " | |
1218 | "draid_maps[VDEV_DRAID_MAX_MAPS] = {\n"); | |
1219 | ||
1220 | for (uint64_t children = VDEV_DRAID_MIN_CHILDREN; | |
1221 | children <= VDEV_DRAID_MAX_CHILDREN; | |
1222 | children++) { | |
1223 | uint64_t seed, checksum, nperms, avg_ratio; | |
1224 | nvlist_t *cfg; | |
861166b0 | 1225 | char key[8] = {0}; |
b2255edc | 1226 | |
b2255edc BB |
1227 | snprintf(key, 8, "%llu", (u_longlong_t)children); |
1228 | ||
1229 | error = read_map_key(filename, key, &cfg); | |
1230 | if (error != 0) { | |
1231 | printf("Error read_map_key() failed: %s\n", | |
1232 | strerror(error)); | |
1233 | return (1); | |
1234 | } | |
1235 | ||
1236 | seed = fnvlist_lookup_uint64(cfg, MAP_SEED); | |
1237 | checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); | |
1238 | children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); | |
1239 | nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); | |
1240 | avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); | |
1241 | ||
1242 | printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t" | |
1243 | "/* %2.03f */\n", (u_longlong_t)children, | |
1244 | (u_longlong_t)nperms, (u_longlong_t)seed, | |
1245 | (u_longlong_t)checksum, (double)avg_ratio / 1000.0); | |
1246 | ||
1247 | nvlist_free(cfg); | |
1248 | } | |
1249 | ||
1250 | printf("};\n"); | |
1251 | ||
1252 | return (0); | |
1253 | } | |
1254 | ||
1255 | static int | |
1256 | draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp) | |
1257 | { | |
1258 | nvlist_t *srccfgs; | |
1259 | nvpair_t *elem = NULL; | |
1260 | int error, merged = 0; | |
1261 | ||
1262 | error = read_map(srcfilename, &srccfgs); | |
1263 | if (error != 0) | |
1264 | return (error); | |
1265 | ||
1266 | while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) { | |
1267 | uint64_t nv_worst_ratio; | |
1268 | uint64_t allcfg_worst_ratio; | |
1269 | nvlist_t *cfg, *allcfg; | |
d1807f16 | 1270 | const char *key; |
b2255edc BB |
1271 | |
1272 | switch (nvpair_type(elem)) { | |
1273 | case DATA_TYPE_NVLIST: | |
1274 | ||
1275 | (void) nvpair_value_nvlist(elem, &cfg); | |
1276 | key = nvpair_name(elem); | |
1277 | ||
1278 | nv_worst_ratio = fnvlist_lookup_uint64(cfg, | |
1279 | MAP_WORST_RATIO); | |
1280 | ||
1281 | error = nvlist_lookup_nvlist(allcfgs, key, &allcfg); | |
1282 | if (error == 0) { | |
1283 | allcfg_worst_ratio = fnvlist_lookup_uint64( | |
1284 | allcfg, MAP_WORST_RATIO); | |
1285 | ||
1286 | if (nv_worst_ratio < allcfg_worst_ratio) { | |
1287 | fnvlist_remove(allcfgs, key); | |
6a42939f | 1288 | fnvlist_add_nvlist(allcfgs, key, cfg); |
b2255edc BB |
1289 | merged++; |
1290 | } | |
1291 | } else if (error == ENOENT) { | |
6a42939f | 1292 | fnvlist_add_nvlist(allcfgs, key, cfg); |
b2255edc BB |
1293 | merged++; |
1294 | } else { | |
1295 | return (error); | |
1296 | } | |
1297 | ||
1298 | break; | |
1299 | default: | |
1300 | continue; | |
1301 | } | |
1302 | } | |
1303 | ||
1304 | nvlist_free(srccfgs); | |
1305 | ||
1306 | *mergedp = merged; | |
1307 | ||
1308 | return (0); | |
1309 | } | |
1310 | ||
1311 | /* | |
1312 | * Merge the best map for each child count found in the listed files into | |
1313 | * a new file. This allows 'draid generate' to be run in parallel and for | |
1314 | * the results maps to be combined. | |
1315 | */ | |
1316 | static int | |
1317 | draid_merge(int argc, char *argv[]) | |
1318 | { | |
861166b0 | 1319 | char filename[MAXPATHLEN] = {0}; |
63cb3413 | 1320 | int c, error, total_merged = 0; |
b2255edc BB |
1321 | nvlist_t *allcfgs; |
1322 | ||
63cb3413 | 1323 | while ((c = getopt(argc, argv, ":")) != -1) { |
b2255edc | 1324 | switch (c) { |
b2255edc BB |
1325 | case ':': |
1326 | (void) fprintf(stderr, | |
1327 | "missing argument for '%c' option\n", optopt); | |
1328 | draid_usage(); | |
1329 | break; | |
1330 | case '?': | |
1331 | (void) fprintf(stderr, "invalid option '%c'\n", | |
1332 | optopt); | |
1333 | draid_usage(); | |
1334 | break; | |
1335 | } | |
1336 | } | |
1337 | ||
1338 | if (argc < 4) { | |
1339 | (void) fprintf(stderr, | |
1340 | "A FILE and multiple SRCs must be specified.\n"); | |
1341 | return (1); | |
1342 | } | |
1343 | ||
7584fbe8 | 1344 | strlcpy(filename, argv[optind], sizeof (filename)); |
b2255edc BB |
1345 | optind++; |
1346 | ||
1347 | error = read_map(filename, &allcfgs); | |
1348 | if (error == ENOENT) { | |
1349 | allcfgs = fnvlist_alloc(); | |
1350 | } else if (error != 0) { | |
1351 | printf("Error read_map(): %s\n", strerror(error)); | |
1352 | return (error); | |
1353 | } | |
1354 | ||
1355 | while (optind < argc) { | |
861166b0 | 1356 | char srcfilename[MAXPATHLEN] = {0}; |
b2255edc BB |
1357 | int merged = 0; |
1358 | ||
7584fbe8 | 1359 | strlcpy(srcfilename, argv[optind], sizeof (srcfilename)); |
b2255edc BB |
1360 | |
1361 | error = draid_merge_impl(allcfgs, srcfilename, &merged); | |
1362 | if (error) { | |
1363 | printf("Error draid_merge_impl(): %s\n", | |
1364 | strerror(error)); | |
1365 | nvlist_free(allcfgs); | |
1366 | return (1); | |
1367 | } | |
1368 | ||
1369 | total_merged += merged; | |
1370 | printf("Merged %d key(s) from '%s' into '%s'\n", merged, | |
1371 | srcfilename, filename); | |
1372 | ||
1373 | optind++; | |
1374 | } | |
1375 | ||
1376 | if (total_merged > 0) | |
1377 | write_map(filename, allcfgs); | |
1378 | ||
1379 | printf("Merged a total of %d key(s) into '%s'\n", total_merged, | |
1380 | filename); | |
1381 | ||
1382 | nvlist_free(allcfgs); | |
1383 | ||
1384 | return (0); | |
1385 | } | |
1386 | ||
1387 | int | |
1388 | main(int argc, char *argv[]) | |
1389 | { | |
1390 | if (argc < 2) | |
1391 | draid_usage(); | |
1392 | ||
1393 | char *subcommand = argv[1]; | |
1394 | ||
1395 | if (strcmp(subcommand, "generate") == 0) { | |
1396 | return (draid_generate(argc - 1, argv + 1)); | |
1397 | } else if (strcmp(subcommand, "verify") == 0) { | |
1398 | return (draid_verify(argc - 1, argv + 1)); | |
1399 | } else if (strcmp(subcommand, "dump") == 0) { | |
1400 | return (draid_dump(argc - 1, argv + 1)); | |
1401 | } else if (strcmp(subcommand, "table") == 0) { | |
1402 | return (draid_table(argc - 1, argv + 1)); | |
1403 | } else if (strcmp(subcommand, "merge") == 0) { | |
1404 | return (draid_merge(argc - 1, argv + 1)); | |
1405 | } else { | |
1406 | draid_usage(); | |
1407 | } | |
1408 | } |