/*
 * module/zfs/vdev_raidz_math.c
 * SIMD implementation of vdev_raidz generate and reconstruct routines
 * (mirror: git.proxmox.com mirror_zfs.git)
 */
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
23 */
24
25 #include <sys/zfs_context.h>
26 #include <sys/types.h>
27 #include <sys/zio.h>
28 #include <sys/debug.h>
29 #include <sys/zfs_debug.h>
30
31 #include <sys/vdev_raidz.h>
32 #include <sys/vdev_raidz_impl.h>
33
34 extern const raidz_impl_ops_t vdev_raidz_scalar_impl;
35 extern const raidz_impl_ops_t vdev_raidz_sse_impl;
36 extern const raidz_impl_ops_t vdev_raidz_avx2_impl;
37
/*
 * All compiled in implementations.
 * The scalar implementation is always present; SIMD variants are compiled
 * in only when the toolchain supports them (x86_64 only for now).
 * Runtime support is checked separately via each impl's is_supported().
 */
const raidz_impl_ops_t *raidz_all_maths[] = {
	&vdev_raidz_scalar_impl,	/* unconditional fallback */
#if defined(__x86_64) && defined(HAVE_SSSE3) /* only x86_64 for now */
	&vdev_raidz_sse_impl,
#endif
#if defined(__x86_64) && defined(HAVE_AVX2) /* only x86_64 for now */
	&vdev_raidz_avx2_impl
#endif
};
48
/* Indicate that benchmark has been completed (set by vdev_raidz_math_init) */
static boolean_t raidz_math_initialized = B_FALSE;

/*
 * Select raidz implementation.
 * Negative values are special selection modes; values >= 0 index into
 * raidz_supp_impl[] (IMPL_SCALAR is slot 0, which always holds scalar).
 */
static enum vdev_raidz_impl_sel {
	IMPL_FASTEST = -1,	/* use routines that won the benchmark */
	IMPL_ORIGINAL = -2,	/* use the original (pre-SIMD) code path */
	IMPL_CYCLE = -3,	/* userspace only: rotate through all impls */
	IMPL_SCALAR = 0,	/* portable scalar implementation */
} zfs_vdev_raidz_impl = IMPL_SCALAR;

/*
 * Selected implementation and its lock.
 * vdev_raidz_impl_lock guards vdev_raidz_used_impl and the selection
 * variables; readers take it in vdev_raidz_math_get_ops().
 * A NULL vdev_raidz_used_impl selects the original implementation.
 */
static krwlock_t vdev_raidz_impl_lock;
static raidz_impl_ops_t *vdev_raidz_used_impl =
	(raidz_impl_ops_t *) &vdev_raidz_scalar_impl;
/* B_TRUE once the user explicitly picked an impl via the module param */
static boolean_t vdev_raidz_impl_user_set = B_FALSE;

/*
 * RAIDZ op that contains the fastest routines; its gen[]/rec[] slots are
 * filled in per-method by benchmark_raidz_impl().
 */
static raidz_impl_ops_t vdev_raidz_fastest_impl = {
	.name = "fastest"
};

/*
 * Hold all supported implementations.
 * The list is NULL-terminated; the sentinel slot doubles as the
 * "original implementation" selector in cycle/benchmark code.
 */
size_t raidz_supp_impl_cnt = 1;
raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths) + 1] = {
	(raidz_impl_ops_t *) &vdev_raidz_scalar_impl, /* scalar is supported */
	NULL
};

/*
 * kstats values for supported impl & original methods
 * Values represent per disk throughput of 8 disk+parity raidz vdev (Bps)
 * (one extra slot for the "original" routines)
 */
static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];

/* kstat for benchmarked implementations */
static kstat_t *raidz_math_kstat = NULL;
86
87 /*
88 * Selects the raidz operation for raidz_map
89 * If rm_ops is set to NULL original raidz implementation will be used
90 */
91 void
92 vdev_raidz_math_get_ops(raidz_map_t *rm)
93 {
94 rw_enter(&vdev_raidz_impl_lock, RW_READER);
95
96 rm->rm_ops = vdev_raidz_used_impl;
97
98 #if !defined(_KERNEL)
99 if (zfs_vdev_raidz_impl == IMPL_CYCLE) {
100 static size_t cycle_impl_idx = 0;
101 size_t idx;
102 /*
103 * Cycle through all supported new implementations, and
104 * when idx == raidz_supp_impl_cnt, use the original
105 */
106 idx = (++cycle_impl_idx) % (raidz_supp_impl_cnt + 1);
107 rm->rm_ops = raidz_supp_impl[idx];
108 }
109 #endif
110
111 rw_exit(&vdev_raidz_impl_lock);
112 }
113
114 /*
115 * Select parity generation method for raidz_map
116 */
117 void
118 vdev_raidz_math_generate(raidz_map_t *rm)
119 {
120 raidz_gen_f gen_parity = NULL;
121
122 switch (raidz_parity(rm)) {
123 case 1:
124 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P];
125 break;
126 case 2:
127 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ];
128 break;
129 case 3:
130 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR];
131 break;
132 default:
133 gen_parity = NULL;
134 cmn_err(CE_PANIC, "invalid RAID-Z configuration %d",
135 raidz_parity(rm));
136 break;
137 }
138
139 ASSERT(gen_parity != NULL);
140
141 gen_parity(rm);
142 }
143
144 static raidz_rec_f
145 _reconstruct_fun_raidz1(raidz_map_t *rm, const int *parity_valid,
146 const int nbaddata)
147 {
148 if (nbaddata == 1 && parity_valid[CODE_P]) {
149 return (rm->rm_ops->rec[RAIDZ_REC_P]);
150 }
151 return ((raidz_rec_f) NULL);
152 }
153
154 static raidz_rec_f
155 _reconstruct_fun_raidz2(raidz_map_t *rm, const int *parity_valid,
156 const int nbaddata)
157 {
158 if (nbaddata == 1) {
159 if (parity_valid[CODE_P]) {
160 return (rm->rm_ops->rec[RAIDZ_REC_P]);
161 } else if (parity_valid[CODE_Q]) {
162 return (rm->rm_ops->rec[RAIDZ_REC_Q]);
163 }
164 } else if (nbaddata == 2 &&
165 parity_valid[CODE_P] && parity_valid[CODE_Q]) {
166 return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
167 }
168 return ((raidz_rec_f) NULL);
169 }
170
171 static raidz_rec_f
172 _reconstruct_fun_raidz3(raidz_map_t *rm, const int *parity_valid,
173 const int nbaddata)
174 {
175 if (nbaddata == 1) {
176 if (parity_valid[CODE_P]) {
177 return (rm->rm_ops->rec[RAIDZ_REC_P]);
178 } else if (parity_valid[CODE_Q]) {
179 return (rm->rm_ops->rec[RAIDZ_REC_Q]);
180 } else if (parity_valid[CODE_R]) {
181 return (rm->rm_ops->rec[RAIDZ_REC_R]);
182 }
183 } else if (nbaddata == 2) {
184 if (parity_valid[CODE_P] && parity_valid[CODE_Q]) {
185 return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
186 } else if (parity_valid[CODE_P] && parity_valid[CODE_R]) {
187 return (rm->rm_ops->rec[RAIDZ_REC_PR]);
188 } else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) {
189 return (rm->rm_ops->rec[RAIDZ_REC_QR]);
190 }
191 } else if (nbaddata == 3 &&
192 parity_valid[CODE_P] && parity_valid[CODE_Q] &&
193 parity_valid[CODE_R]) {
194 return (rm->rm_ops->rec[RAIDZ_REC_PQR]);
195 }
196 return ((raidz_rec_f) NULL);
197 }
198
199 /*
200 * Select data reconstruction method for raidz_map
201 * @parity_valid - Parity validity flag
202 * @dt - Failed data index array
203 * @nbaddata - Number of failed data columns
204 */
205 int
206 vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
207 const int *dt, const int nbaddata)
208 {
209 raidz_rec_f rec_data = NULL;
210
211 switch (raidz_parity(rm)) {
212 case 1:
213 rec_data = _reconstruct_fun_raidz1(rm, parity_valid,
214 nbaddata);
215 break;
216 case 2:
217 rec_data = _reconstruct_fun_raidz2(rm, parity_valid,
218 nbaddata);
219 break;
220 case 3:
221 rec_data = _reconstruct_fun_raidz3(rm, parity_valid,
222 nbaddata);
223 break;
224 default:
225 cmn_err(CE_PANIC, "invalid RAID-Z configuration %d",
226 raidz_parity(rm));
227 break;
228 }
229
230 ASSERT(rec_data != NULL);
231
232 return (rec_data(rm, dt));
233 }
234
/* Parity generation method names, indexed by RAIDZ_GEN_* (kstat suffixes) */
const char *raidz_gen_name[] = {
	"gen_p", "gen_pq", "gen_pqr"
};
/* Data reconstruction method names, indexed by RAIDZ_REC_* (kstat suffixes) */
const char *raidz_rec_name[] = {
	"rec_p", "rec_q", "rec_r",
	"rec_pq", "rec_pr", "rec_qr", "rec_pqr"
};
242
243 static void
244 init_raidz_kstat(raidz_impl_kstat_t *rs, const char *name)
245 {
246 int i;
247 const size_t impl_name_len = strnlen(name, KSTAT_STRLEN);
248 const size_t op_name_max = (KSTAT_STRLEN - 2) > impl_name_len ?
249 KSTAT_STRLEN - impl_name_len - 2 : 0;
250
251 for (i = 0; i < RAIDZ_GEN_NUM; i++) {
252 strncpy(rs->gen[i].name, name, impl_name_len);
253 strncpy(rs->gen[i].name + impl_name_len, "_", 1);
254 strncpy(rs->gen[i].name + impl_name_len + 1,
255 raidz_gen_name[i], op_name_max);
256
257 rs->gen[i].data_type = KSTAT_DATA_UINT64;
258 rs->gen[i].value.ui64 = 0;
259 }
260
261 for (i = 0; i < RAIDZ_REC_NUM; i++) {
262 strncpy(rs->rec[i].name, name, impl_name_len);
263 strncpy(rs->rec[i].name + impl_name_len, "_", 1);
264 strncpy(rs->rec[i].name + impl_name_len + 1,
265 raidz_rec_name[i], op_name_max);
266
267 rs->rec[i].data_type = KSTAT_DATA_UINT64;
268 rs->rec[i].value.ui64 = 0;
269 }
270 }
271
/* Geometry and duration of the boot-time benchmark */
#define BENCH_D_COLS (8ULL)				/* data columns */
#define BENCH_COLS (BENCH_D_COLS + PARITY_PQR)		/* data + 3 parity */
#define BENCH_ZIO_SIZE (2ULL << 17) /* 128 kiB */
#define BENCH_NS MSEC2NSEC(25) /* 25ms */

/* One benchmark iteration: run method @fn (gen or rec index) on @rm */
typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn);
278
/*
 * Benchmark one iteration of parity generation.  @fn is unused because
 * the parity level is already fixed by the raidz map's geometry.
 */
static void
benchmark_gen_impl(raidz_map_t *rm, const int fn)
{
	(void) fn;
	vdev_raidz_generate_parity(rm);
}
285
/*
 * Benchmark one iteration of data reconstruction.
 * @fn is a RAIDZ_REC_* index; rec_tgt[fn] lists three "failed" columns
 * chosen so that exactly the parities the method needs remain valid
 * (columns 0-2 are P/Q/R, 3+ are data in the benchmark map).
 */
static void
benchmark_rec_impl(raidz_map_t *rm, const int fn)
{
	static const int rec_tgt[7][3] = {
		{1, 2, 3},	/* rec_p:   bad QR & D[0] */
		{0, 2, 3},	/* rec_q:   bad PR & D[0] */
		{0, 1, 3},	/* rec_r:   bad PQ & D[0] */
		{2, 3, 4},	/* rec_pq:  bad R  & D[0][1] */
		{1, 3, 4},	/* rec_pr:  bad Q  & D[0][1] */
		{0, 3, 4},	/* rec_qr:  bad P  & D[0][1] */
		{3, 4, 5}	/* rec_pqr: bad    & D[0][1][2] */
	};

	vdev_raidz_reconstruct(rm, rec_tgt[fn], 3);
}
301
/*
 * Benchmarking of all supported implementations (raidz_supp_impl_cnt)
 * is performed by setting the rm_ops pointer and calling the top level
 * generate/reconstruct methods of bench_rm.
 *
 * For each implementation the method @fn is run repeatedly for at least
 * BENCH_NS, its per-disk throughput (Bps) is recorded in the kstats,
 * and the fastest routine is installed into vdev_raidz_fastest_impl.
 */
static void
benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
{
	uint64_t run_cnt, speed, best_speed = 0;
	hrtime_t t_start, t_diff;
	raidz_impl_ops_t *curr_impl;
	int impl, i;

	/*
	 * Use the sentinel (NULL) from the end of raidz_supp_impl_cnt
	 * to run "original" implementation (bench_rm->rm_ops = NULL),
	 * hence the inclusive "<=" loop bound.
	 */
	for (impl = 0; impl <= raidz_supp_impl_cnt; impl++) {
		/* set an implementation to benchmark */
		curr_impl = raidz_supp_impl[impl];
		bench_rm->rm_ops = curr_impl;

		run_cnt = 0;
		t_start = gethrtime();

		/* Batch 25 iterations per clock read to amortize timer cost */
		do {
			for (i = 0; i < 25; i++, run_cnt++)
				bench_fn(bench_rm, fn);

			t_diff = gethrtime() - t_start;
		} while (t_diff < BENCH_NS);

		/* bytes processed * NANOSEC / (elapsed ns * columns) = Bps/disk */
		speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC;
		speed /= (t_diff * BENCH_COLS);

		if (bench_fn == benchmark_gen_impl)
			raidz_impl_kstats[impl].gen[fn].value.ui64 = speed;
		else
			raidz_impl_kstats[impl].rec[fn].value.ui64 = speed;

		/* if curr_impl==NULL the original impl is benchmarked */
		if (curr_impl != NULL && speed > best_speed) {
			best_speed = speed;

			if (bench_fn == benchmark_gen_impl)
				vdev_raidz_fastest_impl.gen[fn] =
				    curr_impl->gen[fn];
			else
				vdev_raidz_fastest_impl.rec[fn] =
				    curr_impl->rec[fn];
		}
	}
}
355
/*
 * Module-init entry point for the raidz math framework.
 *
 * Probes which compiled-in implementations are supported on this CPU,
 * benchmarks each gen/rec method of each one (kernel only), fills in
 * vdev_raidz_fastest_impl with the winners, publishes the results via
 * a kstat, and selects the default implementation.
 */
void
vdev_raidz_math_init(void)
{
	raidz_impl_ops_t *curr_impl;
	zio_t *bench_zio = NULL;
	raidz_map_t *bench_rm = NULL;
	uint64_t bench_parity;
	int i, c, fn;

	/* init & vdev_raidz_impl_lock */
	rw_init(&vdev_raidz_impl_lock, NULL, RW_DEFAULT, NULL);

	/* move supported impl into raidz_supp_impl */
	for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
		curr_impl = (raidz_impl_ops_t *) raidz_all_maths[i];

		/* initialize impl (optional hook) */
		if (curr_impl->init)
			curr_impl->init();

		if (curr_impl->is_supported()) {
			/* init kstat for this supported impl */
			init_raidz_kstat(&raidz_impl_kstats[c],
			    curr_impl->name);
			raidz_supp_impl[c++] = (raidz_impl_ops_t *) curr_impl;
		}
	}
	raidz_supp_impl_cnt = c;	/* number of supported impl */
	raidz_supp_impl[c] = NULL;	/* sentinel */

	/* init kstat for original routines (extra slot past the impls) */
	init_raidz_kstat(&(raidz_impl_kstats[raidz_supp_impl_cnt]), "original");

#if !defined(_KERNEL)
	/*
	 * Skip benchmarking and use last implementation as fastest
	 */
	memcpy(&vdev_raidz_fastest_impl, raidz_supp_impl[raidz_supp_impl_cnt-1],
	    sizeof (vdev_raidz_fastest_impl));

	vdev_raidz_fastest_impl.name = "fastest";

	raidz_math_initialized = B_TRUE;

	/* Use 'cycle' math selection method for userspace */
	VERIFY0(vdev_raidz_impl_set("cycle"));
	return;
#endif

	/* Fake a zio and run the benchmark on it */
	bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
	bench_zio->io_offset = 0;
	bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */
	bench_zio->io_data = zio_data_buf_alloc(BENCH_ZIO_SIZE);
	VERIFY(bench_zio->io_data);

	/* Benchmark parity generation methods */
	for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
		bench_parity = fn + 1;
		/*
		 * New raidz_map is needed for each generate_p/q/r.
		 * NOTE(review): the "9" appears to be the ashift (512-byte
		 * sectors) — confirm against vdev_raidz_map_alloc().
		 */
		bench_rm = vdev_raidz_map_alloc(bench_zio, 9,
		    BENCH_D_COLS + bench_parity, bench_parity);

		benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl);

		vdev_raidz_map_free(bench_rm);
	}

	/* Benchmark data reconstruction methods (one PQR map for all) */
	bench_rm = vdev_raidz_map_alloc(bench_zio, 9, BENCH_COLS, PARITY_PQR);

	for (fn = 0; fn < RAIDZ_REC_NUM; fn++)
		benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);

	vdev_raidz_map_free(bench_rm);

	/* cleanup the bench zio */
	zio_data_buf_free(bench_zio->io_data, BENCH_ZIO_SIZE);
	kmem_free(bench_zio, sizeof (zio_t));

	/* install kstats for all impl (plus one slot for "original") */
	raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench",
	    "misc", KSTAT_TYPE_NAMED,
	    sizeof (raidz_impl_kstat_t) / sizeof (kstat_named_t) *
	    (raidz_supp_impl_cnt + 1), KSTAT_FLAG_VIRTUAL);

	if (raidz_math_kstat != NULL) {
		raidz_math_kstat->ks_data = raidz_impl_kstats;
		kstat_install(raidz_math_kstat);
	}

	/* Finish initialization; honor a user-set impl from the module param */
	raidz_math_initialized = B_TRUE;
	if (!vdev_raidz_impl_user_set)
		VERIFY0(vdev_raidz_impl_set("fastest"));
}
452
453 void
454 vdev_raidz_math_fini(void)
455 {
456 raidz_impl_ops_t const *curr_impl;
457 int i;
458
459 if (raidz_math_kstat != NULL) {
460 kstat_delete(raidz_math_kstat);
461 raidz_math_kstat = NULL;
462 }
463
464 rw_destroy(&vdev_raidz_impl_lock);
465
466 /* fini impl */
467 for (i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
468 curr_impl = raidz_all_maths[i];
469
470 if (curr_impl->fini)
471 curr_impl->fini();
472 }
473 }
474
/*
 * Named selection modes accepted by the module parameter, in addition
 * to the concrete implementation names in raidz_supp_impl[].
 * A NULL impl selects the original (pre-SIMD) code path.
 */
static const
struct {
	char *name;			/* user-visible option name */
	raidz_impl_ops_t *impl;		/* ops to install, or NULL */
	enum vdev_raidz_impl_sel sel;	/* selection mode value */
} math_impl_opts[] = {
	{ "fastest", &vdev_raidz_fastest_impl, IMPL_FASTEST },
	{ "original", NULL, IMPL_ORIGINAL },
#if !defined(_KERNEL)
	{ "cycle", NULL, IMPL_CYCLE },	/* userspace testing only */
#endif
};
487
488 /*
489 * Function sets desired raidz implementation.
490 * If called after module_init(), vdev_raidz_impl_lock must be held for writing.
491 *
492 * @val Name of raidz implementation to use
493 * @param Unused.
494 */
495 static int
496 zfs_vdev_raidz_impl_set(const char *val, struct kernel_param *kp)
497 {
498 size_t i;
499
500 /* Check mandatory options */
501 for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) {
502 if (strcmp(val, math_impl_opts[i].name) == 0) {
503 zfs_vdev_raidz_impl = math_impl_opts[i].sel;
504 vdev_raidz_used_impl = math_impl_opts[i].impl;
505 vdev_raidz_impl_user_set = B_TRUE;
506 return (0);
507 }
508 }
509
510 /* check all supported implementations */
511 for (i = 0; i < raidz_supp_impl_cnt; i++) {
512 if (strcmp(val, raidz_supp_impl[i]->name) == 0) {
513 zfs_vdev_raidz_impl = i;
514 vdev_raidz_used_impl = raidz_supp_impl[i];
515 vdev_raidz_impl_user_set = B_TRUE;
516 return (0);
517 }
518 }
519
520 return (-EINVAL);
521 }
522
523 int
524 vdev_raidz_impl_set(const char *val)
525 {
526 int err;
527
528 ASSERT(raidz_math_initialized);
529
530 rw_enter(&vdev_raidz_impl_lock, RW_WRITER);
531 err = zfs_vdev_raidz_impl_set(val, NULL);
532 rw_exit(&vdev_raidz_impl_lock);
533 return (err);
534 }
535
536 #if defined(_KERNEL) && defined(HAVE_SPL)
537 static int
538 zfs_vdev_raidz_impl_get(char *buffer, struct kernel_param *kp)
539 {
540 int i, cnt = 0;
541 char *fmt;
542
543 ASSERT(raidz_math_initialized);
544
545 rw_enter(&vdev_raidz_impl_lock, RW_READER);
546
547 /* list mandatory options */
548 for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) {
549 if (math_impl_opts[i].sel == zfs_vdev_raidz_impl)
550 fmt = "[%s] ";
551 else
552 fmt = "%s ";
553
554 cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name);
555 }
556
557 /* list all supported implementations */
558 for (i = 0; i < raidz_supp_impl_cnt; i++) {
559 fmt = (i == zfs_vdev_raidz_impl) ? "[%s] " : "%s ";
560 cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name);
561 }
562
563 rw_exit(&vdev_raidz_impl_lock);
564
565 return (cnt);
566 }
567
/* Expose zfs_vdev_raidz_impl as a read/write module parameter */
module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set,
	zfs_vdev_raidz_impl_get, NULL, 0644);
MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation.");
571 #endif