]>
Commit | Line | Data |
---|---|---|
ab9f4b0b GN |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | ||
22 | /* | |
23 | * Copyright (C) 2016 Gvozden Nešković. All rights reserved. | |
24 | */ | |
25 | ||
26 | #include <sys/zfs_context.h> | |
27 | #include <sys/time.h> | |
28 | #include <sys/wait.h> | |
29 | #include <sys/zio.h> | |
30 | #include <umem.h> | |
31 | #include <sys/vdev_raidz.h> | |
32 | #include <sys/vdev_raidz_impl.h> | |
33 | #include <assert.h> | |
34 | #include <stdio.h> | |
35 | #include "raidz_test.h" | |
36 | ||
37 | static int *rand_data; | |
38 | raidz_test_opts_t rto_opts; | |
39 | ||
e7238382 | 40 | static char pid_s[16]; |
ab9f4b0b GN |
41 | |
42 | static void sig_handler(int signo) | |
43 | { | |
e7238382 | 44 | int old_errno = errno; |
ab9f4b0b GN |
45 | struct sigaction action; |
46 | /* | |
47 | * Restore default action and re-raise signal so SIGSEGV and | |
48 | * SIGABRT can trigger a core dump. | |
49 | */ | |
50 | action.sa_handler = SIG_DFL; | |
51 | sigemptyset(&action.sa_mask); | |
52 | action.sa_flags = 0; | |
53 | (void) sigaction(signo, &action, NULL); | |
54 | ||
e7238382 AZ |
55 | if (rto_opts.rto_gdb) { |
56 | pid_t pid = fork(); | |
57 | if (pid == 0) { | |
58 | execlp("gdb", "gdb", "-ex", "set pagination 0", | |
59 | "-p", pid_s, NULL); | |
60 | _exit(-1); | |
61 | } else if (pid > 0) | |
62 | while (waitpid(pid, NULL, 0) == -1 && errno == EINTR) | |
63 | ; | |
64 | } | |
ab9f4b0b GN |
65 | |
66 | raise(signo); | |
e7238382 | 67 | errno = old_errno; |
ab9f4b0b GN |
68 | } |
69 | ||
70 | static void print_opts(raidz_test_opts_t *opts, boolean_t force) | |
71 | { | |
72 | char *verbose; | |
73 | switch (opts->rto_v) { | |
b7c42ce5 | 74 | case D_ALL: |
ab9f4b0b GN |
75 | verbose = "no"; |
76 | break; | |
b7c42ce5 | 77 | case D_INFO: |
ab9f4b0b GN |
78 | verbose = "info"; |
79 | break; | |
b7c42ce5 | 80 | case D_DEBUG: |
ab9f4b0b GN |
81 | default: |
82 | verbose = "debug"; | |
83 | break; | |
84 | } | |
85 | ||
86 | if (force || opts->rto_v >= D_INFO) { | |
87 | (void) fprintf(stdout, DBLSEP "Running with options:\n" | |
88 | " (-a) zio ashift : %zu\n" | |
89 | " (-o) zio offset : 1 << %zu\n" | |
b2255edc BB |
90 | " (-e) expanded map : %s\n" |
91 | " (-r) reflow offset : %llx\n" | |
ab9f4b0b GN |
92 | " (-d) number of raidz data columns : %zu\n" |
93 | " (-s) size of DATA : 1 << %zu\n" | |
94 | " (-S) sweep parameters : %s \n" | |
95 | " (-v) verbose : %s \n\n", | |
b2255edc BB |
96 | opts->rto_ashift, /* -a */ |
97 | ilog2(opts->rto_offset), /* -o */ | |
98 | opts->rto_expand ? "yes" : "no", /* -e */ | |
99 | (u_longlong_t)opts->rto_expand_offset, /* -r */ | |
100 | opts->rto_dcols, /* -d */ | |
101 | ilog2(opts->rto_dsize), /* -s */ | |
102 | opts->rto_sweep ? "yes" : "no", /* -S */ | |
103 | verbose); /* -v */ | |
ab9f4b0b GN |
104 | } |
105 | } | |
106 | ||
107 | static void usage(boolean_t requested) | |
108 | { | |
109 | const raidz_test_opts_t *o = &rto_opts_defaults; | |
110 | ||
111 | FILE *fp = requested ? stdout : stderr; | |
112 | ||
113 | (void) fprintf(fp, "Usage:\n" | |
02730c33 BB |
114 | "\t[-a zio ashift (default: %zu)]\n" |
115 | "\t[-o zio offset, exponent radix 2 (default: %zu)]\n" | |
116 | "\t[-d number of raidz data columns (default: %zu)]\n" | |
117 | "\t[-s zio size, exponent radix 2 (default: %zu)]\n" | |
118 | "\t[-S parameter sweep (default: %s)]\n" | |
119 | "\t[-t timeout for parameter sweep test]\n" | |
120 | "\t[-B benchmark all raidz implementations]\n" | |
b2255edc BB |
121 | "\t[-e use expanded raidz map (default: %s)]\n" |
122 | "\t[-r expanded raidz map reflow offset (default: %llx)]\n" | |
b7c42ce5 | 123 | "\t[-v increase verbosity (default: %d)]\n" |
02730c33 BB |
124 | "\t[-h (print help)]\n" |
125 | "\t[-T test the test, see if failure would be detected]\n" | |
126 | "\t[-D debug (attach gdb on SIGSEGV)]\n" | |
127 | "", | |
128 | o->rto_ashift, /* -a */ | |
129 | ilog2(o->rto_offset), /* -o */ | |
130 | o->rto_dcols, /* -d */ | |
131 | ilog2(o->rto_dsize), /* -s */ | |
132 | rto_opts.rto_sweep ? "yes" : "no", /* -S */ | |
b2255edc BB |
133 | rto_opts.rto_expand ? "yes" : "no", /* -e */ |
134 | (u_longlong_t)o->rto_expand_offset, /* -r */ | |
b7c42ce5 | 135 | o->rto_v); /* -v */ |
ab9f4b0b GN |
136 | |
137 | exit(requested ? 0 : 1); | |
138 | } | |
139 | ||
140 | static void process_options(int argc, char **argv) | |
141 | { | |
142 | size_t value; | |
143 | int opt; | |
ab9f4b0b GN |
144 | raidz_test_opts_t *o = &rto_opts; |
145 | ||
861166b0 | 146 | memcpy(o, &rto_opts_defaults, sizeof (*o)); |
ab9f4b0b | 147 | |
b2255edc | 148 | while ((opt = getopt(argc, argv, "TDBSvha:er:o:d:s:t:")) != -1) { |
ab9f4b0b GN |
149 | value = 0; |
150 | ||
151 | switch (opt) { | |
152 | case 'a': | |
153 | value = strtoull(optarg, NULL, 0); | |
154 | o->rto_ashift = MIN(13, MAX(9, value)); | |
155 | break; | |
b2255edc BB |
156 | case 'e': |
157 | o->rto_expand = 1; | |
158 | break; | |
159 | case 'r': | |
160 | o->rto_expand_offset = strtoull(optarg, NULL, 0); | |
161 | break; | |
ab9f4b0b GN |
162 | case 'o': |
163 | value = strtoull(optarg, NULL, 0); | |
164 | o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9; | |
165 | break; | |
166 | case 'd': | |
167 | value = strtoull(optarg, NULL, 0); | |
168 | o->rto_dcols = MIN(255, MAX(1, value)); | |
169 | break; | |
170 | case 's': | |
171 | value = strtoull(optarg, NULL, 0); | |
172 | o->rto_dsize = 1ULL << MIN(SPA_MAXBLOCKSHIFT, | |
173 | MAX(SPA_MINBLOCKSHIFT, value)); | |
174 | break; | |
175 | case 't': | |
176 | value = strtoull(optarg, NULL, 0); | |
177 | o->rto_sweep_timeout = value; | |
178 | break; | |
179 | case 'v': | |
180 | o->rto_v++; | |
181 | break; | |
182 | case 'S': | |
183 | o->rto_sweep = 1; | |
184 | break; | |
185 | case 'B': | |
186 | o->rto_benchmark = 1; | |
187 | break; | |
188 | case 'D': | |
189 | o->rto_gdb = 1; | |
190 | break; | |
191 | case 'T': | |
192 | o->rto_sanity = 1; | |
193 | break; | |
194 | case 'h': | |
195 | usage(B_TRUE); | |
196 | break; | |
197 | case '?': | |
198 | default: | |
199 | usage(B_FALSE); | |
200 | break; | |
201 | } | |
202 | } | |
203 | } | |
204 | ||
b2255edc BB |
/*
 * Accessors for the columns of a raidz row.  DATA_COL* index relative to
 * the first data column (rr_firstdatacol); CODE_COL* index from column 0.
 * Every use of the `rr` argument is parenthesized so that any pointer
 * expression may be passed.
 */
#define	DATA_COL(rr, i) \
	((rr)->rr_col[(rr)->rr_firstdatacol + (i)].rc_abd)
#define	DATA_COL_SIZE(rr, i) \
	((rr)->rr_col[(rr)->rr_firstdatacol + (i)].rc_size)

#define	CODE_COL(rr, i) ((rr)->rr_col[(i)].rc_abd)
#define	CODE_COL_SIZE(rr, i) ((rr)->rr_col[(i)].rc_size)
ab9f4b0b GN |
210 | |
211 | static int | |
212 | cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity) | |
213 | { | |
b2255edc | 214 | int r, i, ret = 0; |
ab9f4b0b GN |
215 | |
216 | VERIFY(parity >= 1 && parity <= 3); | |
217 | ||
b2255edc BB |
218 | for (r = 0; r < rm->rm_nrows; r++) { |
219 | raidz_row_t * const rr = rm->rm_row[r]; | |
220 | raidz_row_t * const rrg = opts->rm_golden->rm_row[r]; | |
221 | for (i = 0; i < parity; i++) { | |
222 | if (CODE_COL_SIZE(rrg, i) == 0) { | |
223 | VERIFY0(CODE_COL_SIZE(rr, i)); | |
224 | continue; | |
225 | } | |
226 | ||
227 | if (abd_cmp(CODE_COL(rr, i), | |
228 | CODE_COL(rrg, i)) != 0) { | |
229 | ret++; | |
230 | LOG_OPT(D_DEBUG, opts, | |
231 | "\nParity block [%d] different!\n", i); | |
232 | } | |
ab9f4b0b GN |
233 | } |
234 | } | |
235 | return (ret); | |
236 | } | |
237 | ||
238 | static int | |
239 | cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm) | |
240 | { | |
b2255edc BB |
241 | int r, i, dcols, ret = 0; |
242 | ||
243 | for (r = 0; r < rm->rm_nrows; r++) { | |
244 | raidz_row_t *rr = rm->rm_row[r]; | |
245 | raidz_row_t *rrg = opts->rm_golden->rm_row[r]; | |
246 | dcols = opts->rm_golden->rm_row[0]->rr_cols - | |
247 | raidz_parity(opts->rm_golden); | |
248 | for (i = 0; i < dcols; i++) { | |
249 | if (DATA_COL_SIZE(rrg, i) == 0) { | |
250 | VERIFY0(DATA_COL_SIZE(rr, i)); | |
251 | continue; | |
252 | } | |
ab9f4b0b | 253 | |
b2255edc BB |
254 | if (abd_cmp(DATA_COL(rrg, i), |
255 | DATA_COL(rr, i)) != 0) { | |
256 | ret++; | |
ab9f4b0b | 257 | |
b2255edc BB |
258 | LOG_OPT(D_DEBUG, opts, |
259 | "\nData block [%d] different!\n", i); | |
260 | } | |
ab9f4b0b GN |
261 | } |
262 | } | |
263 | return (ret); | |
264 | } | |
265 | ||
cbf484f8 GN |
266 | static int |
267 | init_rand(void *data, size_t size, void *private) | |
268 | { | |
876b60dc AZ |
269 | (void) private; |
270 | memcpy(data, rand_data, size); | |
cbf484f8 GN |
271 | return (0); |
272 | } | |
273 | ||
ab9f4b0b GN |
274 | static void |
275 | corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt) | |
276 | { | |
b2255edc BB |
277 | for (int r = 0; r < rm->rm_nrows; r++) { |
278 | raidz_row_t *rr = rm->rm_row[r]; | |
279 | for (int i = 0; i < cnt; i++) { | |
280 | raidz_col_t *col = &rr->rr_col[tgts[i]]; | |
281 | abd_iterate_func(col->rc_abd, 0, col->rc_size, | |
282 | init_rand, NULL); | |
283 | } | |
ab9f4b0b GN |
284 | } |
285 | } | |
286 | ||
287 | void | |
cbf484f8 | 288 | init_zio_abd(zio_t *zio) |
ab9f4b0b | 289 | { |
cbf484f8 | 290 | abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL); |
ab9f4b0b GN |
291 | } |
292 | ||
293 | static void | |
294 | fini_raidz_map(zio_t **zio, raidz_map_t **rm) | |
295 | { | |
296 | vdev_raidz_map_free(*rm); | |
cbf484f8 | 297 | raidz_free((*zio)->io_abd, (*zio)->io_size); |
ab9f4b0b GN |
298 | umem_free(*zio, sizeof (zio_t)); |
299 | ||
300 | *zio = NULL; | |
301 | *rm = NULL; | |
302 | } | |
303 | ||
304 | static int | |
305 | init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) | |
306 | { | |
307 | int err = 0; | |
308 | zio_t *zio_test; | |
309 | raidz_map_t *rm_test; | |
310 | const size_t total_ncols = opts->rto_dcols + parity; | |
311 | ||
312 | if (opts->rm_golden) { | |
313 | fini_raidz_map(&opts->zio_golden, &opts->rm_golden); | |
314 | } | |
315 | ||
316 | opts->zio_golden = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); | |
317 | zio_test = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); | |
318 | ||
319 | opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset; | |
320 | opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize; | |
321 | ||
cbf484f8 GN |
322 | opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize); |
323 | zio_test->io_abd = raidz_alloc(opts->rto_dsize); | |
ab9f4b0b | 324 | |
cbf484f8 GN |
325 | init_zio_abd(opts->zio_golden); |
326 | init_zio_abd(zio_test); | |
ab9f4b0b GN |
327 | |
328 | VERIFY0(vdev_raidz_impl_set("original")); | |
329 | ||
b2255edc BB |
330 | if (opts->rto_expand) { |
331 | opts->rm_golden = | |
332 | vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd, | |
333 | opts->zio_golden->io_size, opts->zio_golden->io_offset, | |
334 | opts->rto_ashift, total_ncols+1, total_ncols, | |
335 | parity, opts->rto_expand_offset); | |
336 | rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd, | |
337 | zio_test->io_size, zio_test->io_offset, | |
338 | opts->rto_ashift, total_ncols+1, total_ncols, | |
339 | parity, opts->rto_expand_offset); | |
340 | } else { | |
341 | opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, | |
342 | opts->rto_ashift, total_ncols, parity); | |
343 | rm_test = vdev_raidz_map_alloc(zio_test, | |
344 | opts->rto_ashift, total_ncols, parity); | |
345 | } | |
ab9f4b0b GN |
346 | |
347 | VERIFY(opts->zio_golden); | |
348 | VERIFY(opts->rm_golden); | |
349 | ||
350 | vdev_raidz_generate_parity(opts->rm_golden); | |
351 | vdev_raidz_generate_parity(rm_test); | |
352 | ||
353 | /* sanity check */ | |
354 | err |= cmp_data(opts, rm_test); | |
355 | err |= cmp_code(opts, rm_test, parity); | |
356 | ||
357 | if (err) | |
358 | ERR("initializing the golden copy ... [FAIL]!\n"); | |
359 | ||
360 | /* tear down raidz_map of test zio */ | |
361 | fini_raidz_map(&zio_test, &rm_test); | |
362 | ||
363 | return (err); | |
364 | } | |
365 | ||
b2255edc BB |
366 | /* |
367 | * If reflow is not in progress, reflow_offset should be UINT64_MAX. | |
368 | * For each row, if the row is entirely before reflow_offset, it will | |
369 | * come from the new location. Otherwise this row will come from the | |
370 | * old location. Therefore, rows that straddle the reflow_offset will | |
371 | * come from the old location. | |
372 | * | |
373 | * NOTE: Until raidz expansion is implemented this function is only | |
374 | * needed by raidz_test.c to the multi-row raid_map_t functionality. | |
375 | */ | |
376 | raidz_map_t * | |
377 | vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, | |
378 | uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, | |
379 | uint64_t nparity, uint64_t reflow_offset) | |
380 | { | |
381 | /* The zio's size in units of the vdev's minimum sector size. */ | |
382 | uint64_t s = size >> ashift; | |
383 | uint64_t q, r, bc, devidx, asize = 0, tot; | |
384 | ||
385 | /* | |
386 | * "Quotient": The number of data sectors for this stripe on all but | |
387 | * the "big column" child vdevs that also contain "remainder" data. | |
388 | * AKA "full rows" | |
389 | */ | |
390 | q = s / (logical_cols - nparity); | |
391 | ||
392 | /* | |
393 | * "Remainder": The number of partial stripe data sectors in this I/O. | |
394 | * This will add a sector to some, but not all, child vdevs. | |
395 | */ | |
396 | r = s - q * (logical_cols - nparity); | |
397 | ||
398 | /* The number of "big columns" - those which contain remainder data. */ | |
399 | bc = (r == 0 ? 0 : r + nparity); | |
400 | ||
401 | /* | |
402 | * The total number of data and parity sectors associated with | |
403 | * this I/O. | |
404 | */ | |
405 | tot = s + nparity * (q + (r == 0 ? 0 : 1)); | |
406 | ||
407 | /* How many rows contain data (not skip) */ | |
408 | uint64_t rows = howmany(tot, logical_cols); | |
409 | int cols = MIN(tot, logical_cols); | |
410 | ||
411 | raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), | |
412 | KM_SLEEP); | |
413 | rm->rm_nrows = rows; | |
414 | ||
415 | for (uint64_t row = 0; row < rows; row++) { | |
416 | raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t, | |
417 | rr_col[cols]), KM_SLEEP); | |
418 | rm->rm_row[row] = rr; | |
419 | ||
420 | /* The starting RAIDZ (parent) vdev sector of the row. */ | |
421 | uint64_t b = (offset >> ashift) + row * logical_cols; | |
422 | ||
423 | /* | |
424 | * If we are in the middle of a reflow, and any part of this | |
425 | * row has not been copied, then use the old location of | |
426 | * this row. | |
427 | */ | |
428 | int row_phys_cols = physical_cols; | |
429 | if (b + (logical_cols - nparity) > reflow_offset >> ashift) | |
430 | row_phys_cols--; | |
431 | ||
432 | /* starting child of this row */ | |
433 | uint64_t child_id = b % row_phys_cols; | |
434 | /* The starting byte offset on each child vdev. */ | |
435 | uint64_t child_offset = (b / row_phys_cols) << ashift; | |
436 | ||
437 | /* | |
438 | * We set cols to the entire width of the block, even | |
439 | * if this row is shorter. This is needed because parity | |
440 | * generation (for Q and R) needs to know the entire width, | |
441 | * because it treats the short row as though it was | |
442 | * full-width (and the "phantom" sectors were zero-filled). | |
443 | * | |
444 | * Another approach to this would be to set cols shorter | |
445 | * (to just the number of columns that we might do i/o to) | |
446 | * and have another mechanism to tell the parity generation | |
447 | * about the "entire width". Reconstruction (at least | |
448 | * vdev_raidz_reconstruct_general()) would also need to | |
449 | * know about the "entire width". | |
450 | */ | |
451 | rr->rr_cols = cols; | |
452 | rr->rr_bigcols = bc; | |
453 | rr->rr_missingdata = 0; | |
454 | rr->rr_missingparity = 0; | |
455 | rr->rr_firstdatacol = nparity; | |
b2255edc BB |
456 | rr->rr_abd_empty = NULL; |
457 | rr->rr_nempty = 0; | |
458 | ||
459 | for (int c = 0; c < rr->rr_cols; c++, child_id++) { | |
460 | if (child_id >= row_phys_cols) { | |
461 | child_id -= row_phys_cols; | |
462 | child_offset += 1ULL << ashift; | |
463 | } | |
464 | rr->rr_col[c].rc_devidx = child_id; | |
465 | rr->rr_col[c].rc_offset = child_offset; | |
b2255edc BB |
466 | rr->rr_col[c].rc_orig_data = NULL; |
467 | rr->rr_col[c].rc_error = 0; | |
468 | rr->rr_col[c].rc_tried = 0; | |
469 | rr->rr_col[c].rc_skipped = 0; | |
470 | rr->rr_col[c].rc_need_orig_restore = B_FALSE; | |
471 | ||
472 | uint64_t dc = c - rr->rr_firstdatacol; | |
473 | if (c < rr->rr_firstdatacol) { | |
474 | rr->rr_col[c].rc_size = 1ULL << ashift; | |
475 | rr->rr_col[c].rc_abd = | |
476 | abd_alloc_linear(rr->rr_col[c].rc_size, | |
477 | B_TRUE); | |
478 | } else if (row == rows - 1 && bc != 0 && c >= bc) { | |
479 | /* | |
480 | * Past the end, this for parity generation. | |
481 | */ | |
482 | rr->rr_col[c].rc_size = 0; | |
483 | rr->rr_col[c].rc_abd = NULL; | |
484 | } else { | |
485 | /* | |
486 | * "data column" (col excluding parity) | |
487 | * Add an ASCII art diagram here | |
488 | */ | |
489 | uint64_t off; | |
490 | ||
491 | if (c < bc || r == 0) { | |
492 | off = dc * rows + row; | |
493 | } else { | |
494 | off = r * rows + | |
495 | (dc - r) * (rows - 1) + row; | |
496 | } | |
497 | rr->rr_col[c].rc_size = 1ULL << ashift; | |
e2af2acc MA |
498 | rr->rr_col[c].rc_abd = abd_get_offset_struct( |
499 | &rr->rr_col[c].rc_abdstruct, | |
500 | abd, off << ashift, 1 << ashift); | |
b2255edc BB |
501 | } |
502 | ||
503 | asize += rr->rr_col[c].rc_size; | |
504 | } | |
505 | /* | |
506 | * If all data stored spans all columns, there's a danger that | |
507 | * parity will always be on the same device and, since parity | |
508 | * isn't read during normal operation, that that device's I/O | |
509 | * bandwidth won't be used effectively. We therefore switch | |
510 | * the parity every 1MB. | |
511 | * | |
512 | * ...at least that was, ostensibly, the theory. As a practical | |
513 | * matter unless we juggle the parity between all devices | |
514 | * evenly, we won't see any benefit. Further, occasional writes | |
515 | * that aren't a multiple of the LCM of the number of children | |
516 | * and the minimum stripe width are sufficient to avoid pessimal | |
517 | * behavior. Unfortunately, this decision created an implicit | |
518 | * on-disk format requirement that we need to support for all | |
519 | * eternity, but only for single-parity RAID-Z. | |
520 | * | |
521 | * If we intend to skip a sector in the zeroth column for | |
522 | * padding we must make sure to note this swap. We will never | |
523 | * intend to skip the first column since at least one data and | |
524 | * one parity column must appear in each row. | |
525 | */ | |
526 | if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && | |
527 | (offset & (1ULL << 20))) { | |
528 | ASSERT(rr->rr_cols >= 2); | |
529 | ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); | |
530 | devidx = rr->rr_col[0].rc_devidx; | |
531 | uint64_t o = rr->rr_col[0].rc_offset; | |
532 | rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; | |
533 | rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; | |
534 | rr->rr_col[1].rc_devidx = devidx; | |
535 | rr->rr_col[1].rc_offset = o; | |
536 | } | |
537 | ||
538 | } | |
539 | ASSERT3U(asize, ==, tot << ashift); | |
540 | ||
541 | /* init RAIDZ parity ops */ | |
542 | rm->rm_ops = vdev_raidz_math_get_ops(); | |
543 | ||
544 | return (rm); | |
545 | } | |
546 | ||
ab9f4b0b GN |
547 | static raidz_map_t * |
548 | init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) | |
549 | { | |
550 | raidz_map_t *rm = NULL; | |
551 | const size_t alloc_dsize = opts->rto_dsize; | |
552 | const size_t total_ncols = opts->rto_dcols + parity; | |
553 | const int ccols[] = { 0, 1, 2 }; | |
554 | ||
555 | VERIFY(zio); | |
556 | VERIFY(parity <= 3 && parity >= 1); | |
557 | ||
558 | *zio = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); | |
559 | ||
560 | (*zio)->io_offset = 0; | |
561 | (*zio)->io_size = alloc_dsize; | |
cbf484f8 GN |
562 | (*zio)->io_abd = raidz_alloc(alloc_dsize); |
563 | init_zio_abd(*zio); | |
ab9f4b0b | 564 | |
b2255edc BB |
565 | if (opts->rto_expand) { |
566 | rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd, | |
567 | (*zio)->io_size, (*zio)->io_offset, | |
568 | opts->rto_ashift, total_ncols+1, total_ncols, | |
569 | parity, opts->rto_expand_offset); | |
570 | } else { | |
571 | rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, | |
572 | total_ncols, parity); | |
573 | } | |
ab9f4b0b GN |
574 | VERIFY(rm); |
575 | ||
576 | /* Make sure code columns are destroyed */ | |
577 | corrupt_colums(rm, ccols, parity); | |
578 | ||
579 | return (rm); | |
580 | } | |
581 | ||
582 | static int | |
583 | run_gen_check(raidz_test_opts_t *opts) | |
584 | { | |
585 | char **impl_name; | |
586 | int fn, err = 0; | |
587 | zio_t *zio_test; | |
588 | raidz_map_t *rm_test; | |
589 | ||
590 | err = init_raidz_golden_map(opts, PARITY_PQR); | |
591 | if (0 != err) | |
592 | return (err); | |
593 | ||
594 | LOG(D_INFO, DBLSEP); | |
595 | LOG(D_INFO, "Testing parity generation...\n"); | |
596 | ||
597 | for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL; | |
598 | impl_name++) { | |
599 | ||
600 | LOG(D_INFO, SEP); | |
601 | LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name); | |
602 | ||
603 | if (0 != vdev_raidz_impl_set(*impl_name)) { | |
604 | LOG(D_INFO, "[SKIP]\n"); | |
605 | continue; | |
606 | } else { | |
607 | LOG(D_INFO, "[SUPPORTED]\n"); | |
608 | } | |
609 | ||
610 | for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { | |
611 | ||
292d573e GN |
612 | /* Check if should stop */ |
613 | if (rto_opts.rto_should_stop) | |
614 | return (err); | |
615 | ||
ab9f4b0b GN |
616 | /* create suitable raidz_map */ |
617 | rm_test = init_raidz_map(opts, &zio_test, fn+1); | |
618 | VERIFY(rm_test); | |
619 | ||
620 | LOG(D_INFO, "\t\tTesting method [%s] ...", | |
621 | raidz_gen_name[fn]); | |
622 | ||
623 | if (!opts->rto_sanity) | |
624 | vdev_raidz_generate_parity(rm_test); | |
625 | ||
626 | if (cmp_code(opts, rm_test, fn+1) != 0) { | |
627 | LOG(D_INFO, "[FAIL]\n"); | |
628 | err++; | |
629 | } else | |
630 | LOG(D_INFO, "[PASS]\n"); | |
631 | ||
632 | fini_raidz_map(&zio_test, &rm_test); | |
633 | } | |
634 | } | |
635 | ||
636 | fini_raidz_map(&opts->zio_golden, &opts->rm_golden); | |
637 | ||
638 | return (err); | |
639 | } | |
640 | ||
641 | static int | |
642 | run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) | |
643 | { | |
644 | int x0, x1, x2; | |
645 | int tgtidx[3]; | |
646 | int err = 0; | |
647 | static const int rec_tgts[7][3] = { | |
648 | {1, 2, 3}, /* rec_p: bad QR & D[0] */ | |
649 | {0, 2, 3}, /* rec_q: bad PR & D[0] */ | |
650 | {0, 1, 3}, /* rec_r: bad PQ & D[0] */ | |
651 | {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ | |
652 | {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ | |
653 | {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ | |
654 | {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ | |
655 | }; | |
656 | ||
657 | memcpy(tgtidx, rec_tgts[fn], sizeof (tgtidx)); | |
658 | ||
659 | if (fn < RAIDZ_REC_PQ) { | |
660 | /* can reconstruct 1 failed data disk */ | |
661 | for (x0 = 0; x0 < opts->rto_dcols; x0++) { | |
b2255edc | 662 | if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) |
ab9f4b0b GN |
663 | continue; |
664 | ||
292d573e GN |
665 | /* Check if should stop */ |
666 | if (rto_opts.rto_should_stop) | |
667 | return (err); | |
668 | ||
ab9f4b0b GN |
669 | LOG(D_DEBUG, "[%d] ", x0); |
670 | ||
671 | tgtidx[2] = x0 + raidz_parity(rm); | |
672 | ||
673 | corrupt_colums(rm, tgtidx+2, 1); | |
674 | ||
675 | if (!opts->rto_sanity) | |
676 | vdev_raidz_reconstruct(rm, tgtidx, 3); | |
677 | ||
678 | if (cmp_data(opts, rm) != 0) { | |
679 | err++; | |
680 | LOG(D_DEBUG, "\nREC D[%d]... [FAIL]\n", x0); | |
681 | } | |
682 | } | |
683 | ||
684 | } else if (fn < RAIDZ_REC_PQR) { | |
685 | /* can reconstruct 2 failed data disk */ | |
686 | for (x0 = 0; x0 < opts->rto_dcols; x0++) { | |
b2255edc | 687 | if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) |
ab9f4b0b GN |
688 | continue; |
689 | for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { | |
b2255edc BB |
690 | if (x1 >= rm->rm_row[0]->rr_cols - |
691 | raidz_parity(rm)) | |
ab9f4b0b GN |
692 | continue; |
693 | ||
292d573e GN |
694 | /* Check if should stop */ |
695 | if (rto_opts.rto_should_stop) | |
696 | return (err); | |
697 | ||
ab9f4b0b GN |
698 | LOG(D_DEBUG, "[%d %d] ", x0, x1); |
699 | ||
700 | tgtidx[1] = x0 + raidz_parity(rm); | |
701 | tgtidx[2] = x1 + raidz_parity(rm); | |
702 | ||
703 | corrupt_colums(rm, tgtidx+1, 2); | |
704 | ||
705 | if (!opts->rto_sanity) | |
706 | vdev_raidz_reconstruct(rm, tgtidx, 3); | |
707 | ||
708 | if (cmp_data(opts, rm) != 0) { | |
709 | err++; | |
710 | LOG(D_DEBUG, "\nREC D[%d %d]... " | |
711 | "[FAIL]\n", x0, x1); | |
712 | } | |
713 | } | |
714 | } | |
715 | } else { | |
716 | /* can reconstruct 3 failed data disk */ | |
02730c33 | 717 | for (x0 = 0; x0 < opts->rto_dcols; x0++) { |
b2255edc | 718 | if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) |
ab9f4b0b | 719 | continue; |
02730c33 | 720 | for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { |
b2255edc BB |
721 | if (x1 >= rm->rm_row[0]->rr_cols - |
722 | raidz_parity(rm)) | |
ab9f4b0b | 723 | continue; |
02730c33 | 724 | for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) { |
b2255edc BB |
725 | if (x2 >= rm->rm_row[0]->rr_cols - |
726 | raidz_parity(rm)) | |
ab9f4b0b GN |
727 | continue; |
728 | ||
292d573e GN |
729 | /* Check if should stop */ |
730 | if (rto_opts.rto_should_stop) | |
731 | return (err); | |
732 | ||
ab9f4b0b GN |
733 | LOG(D_DEBUG, "[%d %d %d]", x0, x1, x2); |
734 | ||
735 | tgtidx[0] = x0 + raidz_parity(rm); | |
736 | tgtidx[1] = x1 + raidz_parity(rm); | |
737 | tgtidx[2] = x2 + raidz_parity(rm); | |
738 | ||
739 | corrupt_colums(rm, tgtidx, 3); | |
740 | ||
741 | if (!opts->rto_sanity) | |
742 | vdev_raidz_reconstruct(rm, | |
02730c33 | 743 | tgtidx, 3); |
ab9f4b0b GN |
744 | |
745 | if (cmp_data(opts, rm) != 0) { | |
746 | err++; | |
747 | LOG(D_DEBUG, | |
748 | "\nREC D[%d %d %d]... " | |
749 | "[FAIL]\n", x0, x1, x2); | |
750 | } | |
751 | } | |
752 | } | |
753 | } | |
754 | } | |
755 | return (err); | |
756 | } | |
757 | ||
758 | static int | |
759 | run_rec_check(raidz_test_opts_t *opts) | |
760 | { | |
761 | char **impl_name; | |
762 | unsigned fn, err = 0; | |
763 | zio_t *zio_test; | |
764 | raidz_map_t *rm_test; | |
765 | ||
766 | err = init_raidz_golden_map(opts, PARITY_PQR); | |
767 | if (0 != err) | |
768 | return (err); | |
769 | ||
770 | LOG(D_INFO, DBLSEP); | |
771 | LOG(D_INFO, "Testing data reconstruction...\n"); | |
772 | ||
773 | for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL; | |
774 | impl_name++) { | |
775 | ||
776 | LOG(D_INFO, SEP); | |
777 | LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name); | |
778 | ||
779 | if (vdev_raidz_impl_set(*impl_name) != 0) { | |
780 | LOG(D_INFO, "[SKIP]\n"); | |
781 | continue; | |
782 | } else | |
783 | LOG(D_INFO, "[SUPPORTED]\n"); | |
784 | ||
785 | ||
786 | /* create suitable raidz_map */ | |
787 | rm_test = init_raidz_map(opts, &zio_test, PARITY_PQR); | |
788 | /* generate parity */ | |
789 | vdev_raidz_generate_parity(rm_test); | |
790 | ||
791 | for (fn = 0; fn < RAIDZ_REC_NUM; fn++) { | |
792 | ||
793 | LOG(D_INFO, "\t\tTesting method [%s] ...", | |
02730c33 | 794 | raidz_rec_name[fn]); |
ab9f4b0b GN |
795 | |
796 | if (run_rec_check_impl(opts, rm_test, fn) != 0) { | |
797 | LOG(D_INFO, "[FAIL]\n"); | |
798 | err++; | |
799 | ||
800 | } else | |
801 | LOG(D_INFO, "[PASS]\n"); | |
802 | ||
803 | } | |
804 | /* tear down test raidz_map */ | |
805 | fini_raidz_map(&zio_test, &rm_test); | |
806 | } | |
807 | ||
808 | fini_raidz_map(&opts->zio_golden, &opts->rm_golden); | |
809 | ||
810 | return (err); | |
811 | } | |
812 | ||
813 | static int | |
814 | run_test(raidz_test_opts_t *opts) | |
815 | { | |
816 | int err = 0; | |
817 | ||
818 | if (opts == NULL) | |
819 | opts = &rto_opts; | |
820 | ||
821 | print_opts(opts, B_FALSE); | |
822 | ||
823 | err |= run_gen_check(opts); | |
824 | err |= run_rec_check(opts); | |
825 | ||
826 | return (err); | |
827 | } | |
828 | ||
829 | #define SWEEP_RUNNING 0 | |
830 | #define SWEEP_FINISHED 1 | |
831 | #define SWEEP_ERROR 2 | |
832 | #define SWEEP_TIMEOUT 3 | |
833 | ||
834 | static int sweep_state = 0; | |
835 | static raidz_test_opts_t failed_opts; | |
836 | ||
837 | static kmutex_t sem_mtx; | |
838 | static kcondvar_t sem_cv; | |
839 | static int max_free_slots; | |
840 | static int free_slots; | |
841 | ||
460748d4 | 842 | static __attribute__((noreturn)) void |
ab9f4b0b GN |
843 | sweep_thread(void *arg) |
844 | { | |
845 | int err = 0; | |
02730c33 | 846 | raidz_test_opts_t *opts = (raidz_test_opts_t *)arg; |
ab9f4b0b GN |
847 | VERIFY(opts != NULL); |
848 | ||
849 | err = run_test(opts); | |
850 | ||
851 | if (rto_opts.rto_sanity) { | |
852 | /* 25% chance that a sweep test fails */ | |
853 | if (rand() < (RAND_MAX/4)) | |
854 | err = 1; | |
855 | } | |
856 | ||
857 | if (0 != err) { | |
858 | mutex_enter(&sem_mtx); | |
859 | memcpy(&failed_opts, opts, sizeof (raidz_test_opts_t)); | |
860 | sweep_state = SWEEP_ERROR; | |
861 | mutex_exit(&sem_mtx); | |
862 | } | |
863 | ||
864 | umem_free(opts, sizeof (raidz_test_opts_t)); | |
865 | ||
866 | /* signal the next thread */ | |
867 | mutex_enter(&sem_mtx); | |
868 | free_slots++; | |
869 | cv_signal(&sem_cv); | |
870 | mutex_exit(&sem_mtx); | |
871 | ||
872 | thread_exit(); | |
873 | } | |
874 | ||
875 | static int | |
876 | run_sweep(void) | |
877 | { | |
292d573e GN |
878 | static const size_t dcols_v[] = { 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16 }; |
879 | static const size_t ashift_v[] = { 9, 12, 14 }; | |
ab9f4b0b GN |
880 | static const size_t size_v[] = { 1 << 9, 21 * (1 << 9), 13 * (1 << 12), |
881 | 1 << 17, (1 << 20) - (1 << 12), SPA_MAXBLOCKSIZE }; | |
882 | ||
883 | (void) setvbuf(stdout, NULL, _IONBF, 0); | |
884 | ||
885 | ulong_t total_comb = ARRAY_SIZE(size_v) * ARRAY_SIZE(ashift_v) * | |
292d573e | 886 | ARRAY_SIZE(dcols_v); |
ab9f4b0b GN |
887 | ulong_t tried_comb = 0; |
888 | hrtime_t time_diff, start_time = gethrtime(); | |
889 | raidz_test_opts_t *opts; | |
292d573e | 890 | int a, d, s; |
ab9f4b0b GN |
891 | |
892 | max_free_slots = free_slots = MAX(2, boot_ncpus); | |
893 | ||
894 | mutex_init(&sem_mtx, NULL, MUTEX_DEFAULT, NULL); | |
895 | cv_init(&sem_cv, NULL, CV_DEFAULT, NULL); | |
896 | ||
897 | for (s = 0; s < ARRAY_SIZE(size_v); s++) | |
898 | for (a = 0; a < ARRAY_SIZE(ashift_v); a++) | |
ab9f4b0b GN |
899 | for (d = 0; d < ARRAY_SIZE(dcols_v); d++) { |
900 | ||
292d573e | 901 | if (size_v[s] < (1 << ashift_v[a])) { |
ab9f4b0b GN |
902 | total_comb--; |
903 | continue; | |
904 | } | |
905 | ||
906 | if (++tried_comb % 20 == 0) | |
907 | LOG(D_ALL, "%lu/%lu... ", tried_comb, total_comb); | |
908 | ||
909 | /* wait for signal to start new thread */ | |
910 | mutex_enter(&sem_mtx); | |
911 | while (cv_timedwait_sig(&sem_cv, &sem_mtx, | |
912 | ddi_get_lbolt() + hz)) { | |
913 | ||
914 | /* check if should stop the test (timeout) */ | |
915 | time_diff = (gethrtime() - start_time) / NANOSEC; | |
916 | if (rto_opts.rto_sweep_timeout > 0 && | |
917 | time_diff >= rto_opts.rto_sweep_timeout) { | |
918 | sweep_state = SWEEP_TIMEOUT; | |
292d573e | 919 | rto_opts.rto_should_stop = B_TRUE; |
ab9f4b0b GN |
920 | mutex_exit(&sem_mtx); |
921 | goto exit; | |
922 | } | |
923 | ||
924 | /* check if should stop the test (error) */ | |
925 | if (sweep_state != SWEEP_RUNNING) { | |
926 | mutex_exit(&sem_mtx); | |
927 | goto exit; | |
928 | } | |
929 | ||
930 | /* exit loop if a slot is available */ | |
931 | if (free_slots > 0) { | |
932 | break; | |
933 | } | |
934 | } | |
935 | ||
936 | free_slots--; | |
937 | mutex_exit(&sem_mtx); | |
938 | ||
939 | opts = umem_zalloc(sizeof (raidz_test_opts_t), UMEM_NOFAIL); | |
940 | opts->rto_ashift = ashift_v[a]; | |
941 | opts->rto_dcols = dcols_v[d]; | |
292d573e | 942 | opts->rto_offset = (1 << ashift_v[a]) * rand(); |
ab9f4b0b | 943 | opts->rto_dsize = size_v[s]; |
b2255edc BB |
944 | opts->rto_expand = rto_opts.rto_expand; |
945 | opts->rto_expand_offset = rto_opts.rto_expand_offset; | |
ab9f4b0b GN |
946 | opts->rto_v = 0; /* be quiet */ |
947 | ||
c25b8f99 BB |
948 | VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts, |
949 | 0, NULL, TS_RUN, defclsyspri), !=, NULL); | |
ab9f4b0b GN |
950 | } |
951 | ||
952 | exit: | |
953 | LOG(D_ALL, "\nWaiting for test threads to finish...\n"); | |
954 | mutex_enter(&sem_mtx); | |
955 | VERIFY(free_slots <= max_free_slots); | |
956 | while (free_slots < max_free_slots) { | |
957 | (void) cv_wait(&sem_cv, &sem_mtx); | |
958 | } | |
959 | mutex_exit(&sem_mtx); | |
960 | ||
961 | if (sweep_state == SWEEP_ERROR) { | |
962 | ERR("Sweep test failed! Failed option: \n"); | |
963 | print_opts(&failed_opts, B_TRUE); | |
964 | } else { | |
965 | if (sweep_state == SWEEP_TIMEOUT) | |
966 | LOG(D_ALL, "Test timeout (%lus). Stopping...\n", | |
967 | (ulong_t)rto_opts.rto_sweep_timeout); | |
968 | ||
969 | LOG(D_ALL, "Sweep test succeeded on %lu raidz maps!\n", | |
970 | (ulong_t)tried_comb); | |
971 | } | |
972 | ||
c17486b2 GN |
973 | mutex_destroy(&sem_mtx); |
974 | ||
ab9f4b0b GN |
975 | return (sweep_state == SWEEP_ERROR ? SWEEP_ERROR : 0); |
976 | } | |
977 | ||
b2255edc | 978 | |
ab9f4b0b GN |
979 | int |
980 | main(int argc, char **argv) | |
981 | { | |
982 | size_t i; | |
983 | struct sigaction action; | |
984 | int err = 0; | |
985 | ||
e7238382 AZ |
986 | /* init gdb pid string early */ |
987 | (void) sprintf(pid_s, "%d", getpid()); | |
ab9f4b0b GN |
988 | |
989 | action.sa_handler = sig_handler; | |
990 | sigemptyset(&action.sa_mask); | |
991 | action.sa_flags = 0; | |
992 | ||
993 | if (sigaction(SIGSEGV, &action, NULL) < 0) { | |
994 | ERR("raidz_test: cannot catch SIGSEGV: %s.\n", strerror(errno)); | |
995 | exit(EXIT_FAILURE); | |
996 | } | |
997 | ||
998 | (void) setvbuf(stdout, NULL, _IOLBF, 0); | |
999 | ||
1000 | dprintf_setup(&argc, argv); | |
1001 | ||
1002 | process_options(argc, argv); | |
1003 | ||
da92d5cb | 1004 | kernel_init(SPA_MODE_READ); |
ab9f4b0b GN |
1005 | |
1006 | /* setup random data because rand() is not reentrant */ | |
02730c33 | 1007 | rand_data = (int *)umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); |
ab9f4b0b GN |
1008 | srand((unsigned)time(NULL) * getpid()); |
1009 | for (i = 0; i < SPA_MAXBLOCKSIZE / sizeof (int); i++) | |
1010 | rand_data[i] = rand(); | |
1011 | ||
1012 | mprotect(rand_data, SPA_MAXBLOCKSIZE, PROT_READ); | |
1013 | ||
1014 | if (rto_opts.rto_benchmark) { | |
1015 | run_raidz_benchmark(); | |
1016 | } else if (rto_opts.rto_sweep) { | |
1017 | err = run_sweep(); | |
1018 | } else { | |
1019 | err = run_test(NULL); | |
1020 | } | |
1021 | ||
1022 | umem_free(rand_data, SPA_MAXBLOCKSIZE); | |
1023 | kernel_fini(); | |
1024 | ||
1025 | return (err); | |
1026 | } |