]> git.proxmox.com Git - mirror_zfs.git/blame - module/zfs/vdev_raidz_math.c
Wait iput_async before evict_inodes to prevent race
[mirror_zfs.git] / module / zfs / vdev_raidz_math.c
CommitLineData
ab9f4b0b
GN
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
23 */
24
25#include <sys/zfs_context.h>
26#include <sys/types.h>
27#include <sys/zio.h>
28#include <sys/debug.h>
29#include <sys/zfs_debug.h>
30
31#include <sys/vdev_raidz.h>
32#include <sys/vdev_raidz_impl.h>
33
ab9f4b0b
GN
/*
 * All compiled in implementations.
 * The scalar implementation is always first so it is always available;
 * SIMD variants are compiled in only on x86_64 with the matching
 * instruction-set support detected at configure time.
 */
const raidz_impl_ops_t *raidz_all_maths[] = {
	&vdev_raidz_scalar_impl,
#if defined(__x86_64) && defined(HAVE_SSE2) /* only x86_64 for now */
	&vdev_raidz_sse2_impl,
#endif
#if defined(__x86_64) && defined(HAVE_SSSE3) /* only x86_64 for now */
	&vdev_raidz_ssse3_impl,
#endif
#if defined(__x86_64) && defined(HAVE_AVX2) /* only x86_64 for now */
	&vdev_raidz_avx2_impl
#endif
};
47
/* Indicate that benchmark has been completed */
static boolean_t raidz_math_initialized = B_FALSE;

/*
 * Select raidz implementation.
 * Negative values are special selection modes; non-negative values
 * index into raidz_supp_impl[] directly.
 */
static enum vdev_raidz_impl_sel {
	IMPL_FASTEST = -1,	/* routines that benchmarked best per op */
	IMPL_ORIGINAL = -2,	/* original (non-vectorized) code path */
	IMPL_CYCLE = -3,	/* userspace only: rotate impl per raidz_map */
	IMPL_SCALAR = 0,	/* scalar impl, index 0 of raidz_supp_impl */
} zfs_vdev_raidz_impl = IMPL_SCALAR;

/* selected implementation and its lock */
static krwlock_t vdev_raidz_impl_lock;
static raidz_impl_ops_t *vdev_raidz_used_impl =
	(raidz_impl_ops_t *) &vdev_raidz_scalar_impl;
/* B_TRUE once a caller explicitly selected an implementation */
static boolean_t vdev_raidz_impl_user_set = B_FALSE;

/* RAIDZ op that contain the fastest routines */
static raidz_impl_ops_t vdev_raidz_fastest_impl = {
	.name = "fastest"
};

/* Hold all supported implementations */
size_t raidz_supp_impl_cnt = 1;
/* +1 slot holds the NULL sentinel used to select the original code path */
raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths) + 1] = {
	(raidz_impl_ops_t *) &vdev_raidz_scalar_impl, /* scalar is supported */
	NULL
};

/*
 * kstats values for supported impl & original methods
 * Values represent per disk throughput of 8 disk+parity raidz vdev (Bps)
 */
static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];

/* kstat for benchmarked implementations */
static kstat_t *raidz_math_kstat = NULL;
85
86/*
87 * Selects the raidz operation for raidz_map
88 * If rm_ops is set to NULL original raidz implementation will be used
89 */
90void
91vdev_raidz_math_get_ops(raidz_map_t *rm)
92{
93 rw_enter(&vdev_raidz_impl_lock, RW_READER);
94
95 rm->rm_ops = vdev_raidz_used_impl;
96
97#if !defined(_KERNEL)
98 if (zfs_vdev_raidz_impl == IMPL_CYCLE) {
99 static size_t cycle_impl_idx = 0;
100 size_t idx;
101 /*
102 * Cycle through all supported new implementations, and
103 * when idx == raidz_supp_impl_cnt, use the original
104 */
105 idx = (++cycle_impl_idx) % (raidz_supp_impl_cnt + 1);
106 rm->rm_ops = raidz_supp_impl[idx];
107 }
108#endif
109
110 rw_exit(&vdev_raidz_impl_lock);
111}
112
113/*
114 * Select parity generation method for raidz_map
115 */
116void
117vdev_raidz_math_generate(raidz_map_t *rm)
118{
119 raidz_gen_f gen_parity = NULL;
120
121 switch (raidz_parity(rm)) {
122 case 1:
123 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P];
124 break;
125 case 2:
126 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ];
127 break;
128 case 3:
129 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR];
130 break;
131 default:
132 gen_parity = NULL;
133 cmn_err(CE_PANIC, "invalid RAID-Z configuration %d",
134 raidz_parity(rm));
135 break;
136 }
137
138 ASSERT(gen_parity != NULL);
139
140 gen_parity(rm);
141}
142
143static raidz_rec_f
144_reconstruct_fun_raidz1(raidz_map_t *rm, const int *parity_valid,
145 const int nbaddata)
146{
147 if (nbaddata == 1 && parity_valid[CODE_P]) {
148 return (rm->rm_ops->rec[RAIDZ_REC_P]);
149 }
150 return ((raidz_rec_f) NULL);
151}
152
153static raidz_rec_f
154_reconstruct_fun_raidz2(raidz_map_t *rm, const int *parity_valid,
155 const int nbaddata)
156{
157 if (nbaddata == 1) {
158 if (parity_valid[CODE_P]) {
159 return (rm->rm_ops->rec[RAIDZ_REC_P]);
160 } else if (parity_valid[CODE_Q]) {
161 return (rm->rm_ops->rec[RAIDZ_REC_Q]);
162 }
163 } else if (nbaddata == 2 &&
164 parity_valid[CODE_P] && parity_valid[CODE_Q]) {
165 return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
166 }
167 return ((raidz_rec_f) NULL);
168}
169
170static raidz_rec_f
171_reconstruct_fun_raidz3(raidz_map_t *rm, const int *parity_valid,
172 const int nbaddata)
173{
174 if (nbaddata == 1) {
175 if (parity_valid[CODE_P]) {
176 return (rm->rm_ops->rec[RAIDZ_REC_P]);
177 } else if (parity_valid[CODE_Q]) {
178 return (rm->rm_ops->rec[RAIDZ_REC_Q]);
179 } else if (parity_valid[CODE_R]) {
180 return (rm->rm_ops->rec[RAIDZ_REC_R]);
181 }
182 } else if (nbaddata == 2) {
183 if (parity_valid[CODE_P] && parity_valid[CODE_Q]) {
184 return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
185 } else if (parity_valid[CODE_P] && parity_valid[CODE_R]) {
186 return (rm->rm_ops->rec[RAIDZ_REC_PR]);
187 } else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) {
188 return (rm->rm_ops->rec[RAIDZ_REC_QR]);
189 }
190 } else if (nbaddata == 3 &&
191 parity_valid[CODE_P] && parity_valid[CODE_Q] &&
192 parity_valid[CODE_R]) {
193 return (rm->rm_ops->rec[RAIDZ_REC_PQR]);
194 }
195 return ((raidz_rec_f) NULL);
196}
197
198/*
199 * Select data reconstruction method for raidz_map
200 * @parity_valid - Parity validity flag
201 * @dt - Failed data index array
202 * @nbaddata - Number of failed data columns
203 */
204int
205vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
206 const int *dt, const int nbaddata)
207{
208 raidz_rec_f rec_data = NULL;
209
210 switch (raidz_parity(rm)) {
211 case 1:
212 rec_data = _reconstruct_fun_raidz1(rm, parity_valid,
213 nbaddata);
214 break;
215 case 2:
216 rec_data = _reconstruct_fun_raidz2(rm, parity_valid,
217 nbaddata);
218 break;
219 case 3:
220 rec_data = _reconstruct_fun_raidz3(rm, parity_valid,
221 nbaddata);
222 break;
223 default:
224 cmn_err(CE_PANIC, "invalid RAID-Z configuration %d",
225 raidz_parity(rm));
226 break;
227 }
228
229 ASSERT(rec_data != NULL);
230
231 return (rec_data(rm, dt));
232}
233
/* Human-readable op names, indexed by RAIDZ_GEN_* / RAIDZ_REC_* */
const char *raidz_gen_name[] = {
	"gen_p", "gen_pq", "gen_pqr"
};
const char *raidz_rec_name[] = {
	"rec_p", "rec_q", "rec_r",
	"rec_pq", "rec_pr", "rec_qr", "rec_pqr"
};
241
242static void
243init_raidz_kstat(raidz_impl_kstat_t *rs, const char *name)
244{
245 int i;
246 const size_t impl_name_len = strnlen(name, KSTAT_STRLEN);
247 const size_t op_name_max = (KSTAT_STRLEN - 2) > impl_name_len ?
248 KSTAT_STRLEN - impl_name_len - 2 : 0;
249
250 for (i = 0; i < RAIDZ_GEN_NUM; i++) {
251 strncpy(rs->gen[i].name, name, impl_name_len);
252 strncpy(rs->gen[i].name + impl_name_len, "_", 1);
253 strncpy(rs->gen[i].name + impl_name_len + 1,
254 raidz_gen_name[i], op_name_max);
255
256 rs->gen[i].data_type = KSTAT_DATA_UINT64;
257 rs->gen[i].value.ui64 = 0;
258 }
259
260 for (i = 0; i < RAIDZ_REC_NUM; i++) {
261 strncpy(rs->rec[i].name, name, impl_name_len);
262 strncpy(rs->rec[i].name + impl_name_len, "_", 1);
263 strncpy(rs->rec[i].name + impl_name_len + 1,
264 raidz_rec_name[i], op_name_max);
265
266 rs->rec[i].data_type = KSTAT_DATA_UINT64;
267 rs->rec[i].value.ui64 = 0;
268 }
269}
270
/* Benchmark geometry: 8 data columns plus triple parity */
#define	BENCH_D_COLS	(8ULL)
#define	BENCH_COLS	(BENCH_D_COLS + PARITY_PQR)
#define	BENCH_ZIO_SIZE	(1ULL << SPA_OLD_MAXBLOCKSHIFT)	/* 128 kiB */
#define	BENCH_NS	MSEC2NSEC(25)			/* 25ms */

/* Signature shared by benchmark_gen_impl() and benchmark_rec_impl() */
typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn);
277
278static void
279benchmark_gen_impl(raidz_map_t *rm, const int fn)
280{
281 (void) fn;
282 vdev_raidz_generate_parity(rm);
283}
284
285static void
286benchmark_rec_impl(raidz_map_t *rm, const int fn)
287{
288 static const int rec_tgt[7][3] = {
289 {1, 2, 3}, /* rec_p: bad QR & D[0] */
290 {0, 2, 3}, /* rec_q: bad PR & D[0] */
291 {0, 1, 3}, /* rec_r: bad PQ & D[0] */
292 {2, 3, 4}, /* rec_pq: bad R & D[0][1] */
293 {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */
294 {0, 3, 4}, /* rec_qr: bad P & D[0][1] */
295 {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */
296 };
297
298 vdev_raidz_reconstruct(rm, rec_tgt[fn], 3);
299}
300
301/*
302 * Benchmarking of all supported implementations (raidz_supp_impl_cnt)
303 * is performed by setting the rm_ops pointer and calling the top level
304 * generate/reconstruct methods of bench_rm.
305 */
306static void
307benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
308{
309 uint64_t run_cnt, speed, best_speed = 0;
310 hrtime_t t_start, t_diff;
311 raidz_impl_ops_t *curr_impl;
312 int impl, i;
313
314 /*
315 * Use the sentinel (NULL) from the end of raidz_supp_impl_cnt
316 * to run "original" implementation (bench_rm->rm_ops = NULL)
317 */
318 for (impl = 0; impl <= raidz_supp_impl_cnt; impl++) {
319 /* set an implementation to benchmark */
320 curr_impl = raidz_supp_impl[impl];
321 bench_rm->rm_ops = curr_impl;
322
323 run_cnt = 0;
324 t_start = gethrtime();
325
326 do {
327 for (i = 0; i < 25; i++, run_cnt++)
328 bench_fn(bench_rm, fn);
329
330 t_diff = gethrtime() - t_start;
331 } while (t_diff < BENCH_NS);
332
333 speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC;
334 speed /= (t_diff * BENCH_COLS);
335
336 if (bench_fn == benchmark_gen_impl)
337 raidz_impl_kstats[impl].gen[fn].value.ui64 = speed;
338 else
339 raidz_impl_kstats[impl].rec[fn].value.ui64 = speed;
340
341 /* if curr_impl==NULL the original impl is benchmarked */
342 if (curr_impl != NULL && speed > best_speed) {
343 best_speed = speed;
344
345 if (bench_fn == benchmark_gen_impl)
346 vdev_raidz_fastest_impl.gen[fn] =
347 curr_impl->gen[fn];
348 else
349 vdev_raidz_fastest_impl.rec[fn] =
350 curr_impl->rec[fn];
351 }
352 }
353}
354
/*
 * Module init: detect supported implementations, benchmark them (kernel
 * builds only), publish the results via kstat, and select the default
 * implementation.  Must run before any vdev_raidz_math_get_ops() caller.
 */
void
vdev_raidz_math_init(void)
{
	raidz_impl_ops_t *curr_impl;
	zio_t *bench_zio = NULL;
	raidz_map_t *bench_rm = NULL;
	uint64_t bench_parity;
	int i, c, fn;

	/* init & vdev_raidz_impl_lock */
	rw_init(&vdev_raidz_impl_lock, NULL, RW_DEFAULT, NULL);

	/* move supported impl into raidz_supp_impl */
	for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
		curr_impl = (raidz_impl_ops_t *) raidz_all_maths[i];

		/* initialize impl */
		if (curr_impl->init)
			curr_impl->init();

		if (curr_impl->is_supported()) {
			/* init kstat */
			init_raidz_kstat(&raidz_impl_kstats[c],
			    curr_impl->name);
			raidz_supp_impl[c++] = (raidz_impl_ops_t *) curr_impl;
		}
	}
	raidz_supp_impl_cnt = c;	/* number of supported impl */
	raidz_supp_impl[c] = NULL;	/* sentinel */

	/* init kstat for original routines */
	init_raidz_kstat(&(raidz_impl_kstats[raidz_supp_impl_cnt]), "original");

#if !defined(_KERNEL)
	/*
	 * Skip benchmarking and use last implementation as fastest
	 */
	memcpy(&vdev_raidz_fastest_impl, raidz_supp_impl[raidz_supp_impl_cnt-1],
	    sizeof (vdev_raidz_fastest_impl));

	vdev_raidz_fastest_impl.name = "fastest";

	raidz_math_initialized = B_TRUE;

	/* Use 'cycle' math selection method for userspace */
	VERIFY0(vdev_raidz_impl_set("cycle"));
	return;
#endif

	/* Fake an zio and run the benchmark on it */
	bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
	bench_zio->io_offset = 0;
	bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */
	bench_zio->io_data = zio_data_buf_alloc(BENCH_ZIO_SIZE);
	VERIFY(bench_zio->io_data);

	/* Benchmark parity generation methods */
	for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
		bench_parity = fn + 1;
		/* New raidz_map is needed for each generate_p/q/r */
		bench_rm = vdev_raidz_map_alloc(bench_zio, 9,
		    BENCH_D_COLS + bench_parity, bench_parity);

		benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl);

		vdev_raidz_map_free(bench_rm);
	}

	/* Benchmark data reconstruction methods */
	bench_rm = vdev_raidz_map_alloc(bench_zio, 9, BENCH_COLS, PARITY_PQR);

	for (fn = 0; fn < RAIDZ_REC_NUM; fn++)
		benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);

	vdev_raidz_map_free(bench_rm);

	/* cleanup the bench zio */
	zio_data_buf_free(bench_zio->io_data, BENCH_ZIO_SIZE);
	kmem_free(bench_zio, sizeof (zio_t));

	/* install kstats for all impl (plus one slot for "original") */
	raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench",
	    "misc", KSTAT_TYPE_NAMED,
	    sizeof (raidz_impl_kstat_t) / sizeof (kstat_named_t) *
	    (raidz_supp_impl_cnt + 1), KSTAT_FLAG_VIRTUAL);

	if (raidz_math_kstat != NULL) {
		raidz_math_kstat->ks_data = raidz_impl_kstats;
		kstat_install(raidz_math_kstat);
	}

	/* Finish initialization; honor an earlier explicit user choice */
	raidz_math_initialized = B_TRUE;
	if (!vdev_raidz_impl_user_set)
		VERIFY0(vdev_raidz_impl_set("fastest"));
}
451
452void
453vdev_raidz_math_fini(void)
454{
455 raidz_impl_ops_t const *curr_impl;
456 int i;
457
458 if (raidz_math_kstat != NULL) {
459 kstat_delete(raidz_math_kstat);
460 raidz_math_kstat = NULL;
461 }
462
463 rw_destroy(&vdev_raidz_impl_lock);
464
465 /* fini impl */
466 for (i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
467 curr_impl = raidz_all_maths[i];
468
469 if (curr_impl->fini)
470 curr_impl->fini();
471 }
472}
473
/*
 * Mandatory selection keywords.  impl == NULL selects the original
 * (pre-vectorized) code path; "cycle" exists only in userspace builds.
 */
static const
struct {
	char *name;
	raidz_impl_ops_t *impl;
	enum vdev_raidz_impl_sel sel;
} math_impl_opts[] = {
	{ "fastest", &vdev_raidz_fastest_impl, IMPL_FASTEST },
	{ "original", NULL, IMPL_ORIGINAL },
#if !defined(_KERNEL)
	{ "cycle", NULL, IMPL_CYCLE },
#endif
};
486
487/*
488 * Function sets desired raidz implementation.
489 * If called after module_init(), vdev_raidz_impl_lock must be held for writing.
490 *
491 * @val Name of raidz implementation to use
492 * @param Unused.
493 */
494static int
495zfs_vdev_raidz_impl_set(const char *val, struct kernel_param *kp)
496{
497 size_t i;
498
499 /* Check mandatory options */
500 for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) {
501 if (strcmp(val, math_impl_opts[i].name) == 0) {
502 zfs_vdev_raidz_impl = math_impl_opts[i].sel;
503 vdev_raidz_used_impl = math_impl_opts[i].impl;
504 vdev_raidz_impl_user_set = B_TRUE;
505 return (0);
506 }
507 }
508
509 /* check all supported implementations */
510 for (i = 0; i < raidz_supp_impl_cnt; i++) {
511 if (strcmp(val, raidz_supp_impl[i]->name) == 0) {
512 zfs_vdev_raidz_impl = i;
513 vdev_raidz_used_impl = raidz_supp_impl[i];
514 vdev_raidz_impl_user_set = B_TRUE;
515 return (0);
516 }
517 }
518
519 return (-EINVAL);
520}
521
/*
 * Public wrapper around zfs_vdev_raidz_impl_set() that takes the impl
 * lock as writer.  May only be called after vdev_raidz_math_init() has
 * completed (enforced by the ASSERT).
 */
int
vdev_raidz_impl_set(const char *val)
{
	int err;

	ASSERT(raidz_math_initialized);

	rw_enter(&vdev_raidz_impl_lock, RW_WRITER);
	err = zfs_vdev_raidz_impl_set(val, NULL);
	rw_exit(&vdev_raidz_impl_lock);
	return (err);
}
534
#if defined(_KERNEL) && defined(HAVE_SPL)
/*
 * Module-parameter "show" handler: print every selectable implementation
 * name, surrounding the currently active one with brackets.
 */
static int
zfs_vdev_raidz_impl_get(char *buffer, struct kernel_param *kp)
{
	int i, off = 0;

	ASSERT(raidz_math_initialized);

	rw_enter(&vdev_raidz_impl_lock, RW_READER);

	/* list mandatory options */
	for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) {
		off += sprintf(buffer + off,
		    (math_impl_opts[i].sel == zfs_vdev_raidz_impl) ?
		    "[%s] " : "%s ", math_impl_opts[i].name);
	}

	/* list all supported implementations */
	for (i = 0; i < raidz_supp_impl_cnt; i++) {
		off += sprintf(buffer + off,
		    (i == zfs_vdev_raidz_impl) ? "[%s] " : "%s ",
		    raidz_supp_impl[i]->name);
	}

	rw_exit(&vdev_raidz_impl_lock);

	return (off);
}

module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set,
	zfs_vdev_raidz_impl_get, NULL, 0644);
MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation.");
#endif