4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
25 #include <sys/zfs_context.h>
26 #include <sys/types.h>
28 #include <sys/debug.h>
29 #include <sys/zfs_debug.h>
31 #include <sys/vdev_raidz.h>
32 #include <sys/vdev_raidz_impl.h>
34 extern boolean_t
raidz_will_scalar_work(void);
36 /* Opaque implementation with NULL methods to represent original methods */
37 static const raidz_impl_ops_t vdev_raidz_original_impl
= {
39 .is_supported
= raidz_will_scalar_work
,
42 /* RAIDZ parity op that contain the fastest methods */
43 static raidz_impl_ops_t vdev_raidz_fastest_impl
= {
47 /* All compiled in implementations */
48 const raidz_impl_ops_t
*raidz_all_maths
[] = {
49 &vdev_raidz_original_impl
,
50 &vdev_raidz_scalar_impl
,
51 #if defined(__x86_64) && defined(HAVE_SSE2) /* only x86_64 for now */
52 &vdev_raidz_sse2_impl
,
54 #if defined(__x86_64) && defined(HAVE_SSSE3) /* only x86_64 for now */
55 &vdev_raidz_ssse3_impl
,
57 #if defined(__x86_64) && defined(HAVE_AVX2) /* only x86_64 for now */
58 &vdev_raidz_avx2_impl
,
60 #if defined(__x86_64) && defined(HAVE_AVX512F) /* only x86_64 for now */
61 &vdev_raidz_avx512f_impl
,
63 #if defined(__x86_64) && defined(HAVE_AVX512BW) /* only x86_64 for now */
64 &vdev_raidz_avx512bw_impl
,
66 #if defined(__aarch64__)
67 &vdev_raidz_aarch64_neon_impl
,
68 &vdev_raidz_aarch64_neonx2_impl
,
/*
 * Set to B_TRUE at the end of vdev_raidz_math_init().  Until then
 * vdev_raidz_impl_set() only records the request in user_sel_impl.
 */
static boolean_t raidz_math_initialized = B_FALSE;

/*
 * Implementation selector values.
 *
 * IMPL_FASTEST and IMPL_CYCLE are special markers taken from the top of
 * the uint32_t range.  Any other value is used as an index into
 * raidz_supp_impl[]; IMPL_ORIGINAL and IMPL_SCALAR name the first two.
 */
#define	IMPL_FASTEST	(UINT32_MAX)
#define	IMPL_CYCLE	(UINT32_MAX - 1)
#define	IMPL_ORIGINAL	(0)
#define	IMPL_SCALAR	(1)

/* Read a selector variable through a volatile-qualified lvalue */
#define	RAIDZ_IMPL_READ(i)	(*(volatile uint32_t *) &(i))

/* Selector currently in effect; replaced with user_sel_impl by init */
static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR;
/* Selector requested before init completed; applied at the end of init */
static uint32_t user_sel_impl = IMPL_FASTEST;

/* Implementations whose is_supported() returned true, and their count */
static size_t raidz_supp_impl_cnt = 0;
static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];

/*
 * kstats values for supported implementations
 * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
 *
 * One slot per supported implementation, plus one extra slot (at index
 * raidz_supp_impl_cnt) that records, per method, the index of the fastest
 * implementation.
 */
static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];

/* kstat for benchmarked implementations; NULL while not installed */
static kstat_t *raidz_math_kstat = NULL;
100 * Selects the raidz operation for raidz_map
101 * If rm_ops is set to NULL original raidz implementation will be used
104 vdev_raidz_math_get_ops()
106 raidz_impl_ops_t
*ops
= NULL
;
107 const uint32_t impl
= RAIDZ_IMPL_READ(zfs_vdev_raidz_impl
);
111 ASSERT(raidz_math_initialized
);
112 ops
= &vdev_raidz_fastest_impl
;
114 #if !defined(_KERNEL)
117 ASSERT(raidz_math_initialized
);
118 ASSERT3U(raidz_supp_impl_cnt
, >, 0);
119 /* Cycle through all supported implementations */
120 static size_t cycle_impl_idx
= 0;
121 size_t idx
= (++cycle_impl_idx
) % raidz_supp_impl_cnt
;
122 ops
= raidz_supp_impl
[idx
];
127 ops
= (raidz_impl_ops_t
*)&vdev_raidz_original_impl
;
130 ops
= (raidz_impl_ops_t
*)&vdev_raidz_scalar_impl
;
133 ASSERT3U(impl
, <, raidz_supp_impl_cnt
);
134 ASSERT3U(raidz_supp_impl_cnt
, >, 0);
135 if (impl
< ARRAY_SIZE(raidz_all_maths
))
136 ops
= raidz_supp_impl
[impl
];
140 ASSERT3P(ops
, !=, NULL
);
146 * Select parity generation method for raidz_map
149 vdev_raidz_math_generate(raidz_map_t
*rm
)
151 raidz_gen_f gen_parity
= NULL
;
153 switch (raidz_parity(rm
)) {
155 gen_parity
= rm
->rm_ops
->gen
[RAIDZ_GEN_P
];
158 gen_parity
= rm
->rm_ops
->gen
[RAIDZ_GEN_PQ
];
161 gen_parity
= rm
->rm_ops
->gen
[RAIDZ_GEN_PQR
];
165 cmn_err(CE_PANIC
, "invalid RAID-Z configuration %d",
170 /* if method is NULL execute the original implementation */
171 if (gen_parity
== NULL
)
172 return (RAIDZ_ORIGINAL_IMPL
);
180 reconstruct_fun_p_sel(raidz_map_t
*rm
, const int *parity_valid
,
183 if (nbaddata
== 1 && parity_valid
[CODE_P
]) {
184 return (rm
->rm_ops
->rec
[RAIDZ_REC_P
]);
186 return ((raidz_rec_f
) NULL
);
190 reconstruct_fun_pq_sel(raidz_map_t
*rm
, const int *parity_valid
,
194 if (parity_valid
[CODE_P
]) {
195 return (rm
->rm_ops
->rec
[RAIDZ_REC_P
]);
196 } else if (parity_valid
[CODE_Q
]) {
197 return (rm
->rm_ops
->rec
[RAIDZ_REC_Q
]);
199 } else if (nbaddata
== 2 &&
200 parity_valid
[CODE_P
] && parity_valid
[CODE_Q
]) {
201 return (rm
->rm_ops
->rec
[RAIDZ_REC_PQ
]);
203 return ((raidz_rec_f
) NULL
);
207 reconstruct_fun_pqr_sel(raidz_map_t
*rm
, const int *parity_valid
,
211 if (parity_valid
[CODE_P
]) {
212 return (rm
->rm_ops
->rec
[RAIDZ_REC_P
]);
213 } else if (parity_valid
[CODE_Q
]) {
214 return (rm
->rm_ops
->rec
[RAIDZ_REC_Q
]);
215 } else if (parity_valid
[CODE_R
]) {
216 return (rm
->rm_ops
->rec
[RAIDZ_REC_R
]);
218 } else if (nbaddata
== 2) {
219 if (parity_valid
[CODE_P
] && parity_valid
[CODE_Q
]) {
220 return (rm
->rm_ops
->rec
[RAIDZ_REC_PQ
]);
221 } else if (parity_valid
[CODE_P
] && parity_valid
[CODE_R
]) {
222 return (rm
->rm_ops
->rec
[RAIDZ_REC_PR
]);
223 } else if (parity_valid
[CODE_Q
] && parity_valid
[CODE_R
]) {
224 return (rm
->rm_ops
->rec
[RAIDZ_REC_QR
]);
226 } else if (nbaddata
== 3 &&
227 parity_valid
[CODE_P
] && parity_valid
[CODE_Q
] &&
228 parity_valid
[CODE_R
]) {
229 return (rm
->rm_ops
->rec
[RAIDZ_REC_PQR
]);
231 return ((raidz_rec_f
) NULL
);
235 * Select data reconstruction method for raidz_map
236 * @parity_valid - Parity validity flag
237 * @dt - Failed data index array
238 * @nbaddata - Number of failed data columns
241 vdev_raidz_math_reconstruct(raidz_map_t
*rm
, const int *parity_valid
,
242 const int *dt
, const int nbaddata
)
244 raidz_rec_f rec_fn
= NULL
;
246 switch (raidz_parity(rm
)) {
248 rec_fn
= reconstruct_fun_p_sel(rm
, parity_valid
, nbaddata
);
251 rec_fn
= reconstruct_fun_pq_sel(rm
, parity_valid
, nbaddata
);
254 rec_fn
= reconstruct_fun_pqr_sel(rm
, parity_valid
, nbaddata
);
257 cmn_err(CE_PANIC
, "invalid RAID-Z configuration %d",
263 return (RAIDZ_ORIGINAL_IMPL
);
265 return (rec_fn(rm
, dt
));
/* Names of the parity generation methods, used as kstat column headers */
const char *raidz_gen_name[] = {
	"gen_p",
	"gen_pq",
	"gen_pqr"
};

/* Names of the reconstruction methods, used as kstat column headers */
const char *raidz_rec_name[] = {
	"rec_p",
	"rec_q",
	"rec_r",
	"rec_pq",
	"rec_pr",
	"rec_qr",
	"rec_pqr"
};
276 #define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1)
279 raidz_math_kstat_headers(char *buf
, size_t size
)
284 ASSERT3U(size
, >=, RAIDZ_KSTAT_LINE_LEN
);
286 off
= snprintf(buf
, size
, "%-17s", "implementation");
288 for (i
= 0; i
< ARRAY_SIZE(raidz_gen_name
); i
++)
289 off
+= snprintf(buf
+ off
, size
- off
, "%-16s",
292 for (i
= 0; i
< ARRAY_SIZE(raidz_rec_name
); i
++)
293 off
+= snprintf(buf
+ off
, size
- off
, "%-16s",
296 (void) snprintf(buf
+ off
, size
- off
, "\n");
302 raidz_math_kstat_data(char *buf
, size_t size
, void *data
)
304 raidz_impl_kstat_t
*fstat
= &raidz_impl_kstats
[raidz_supp_impl_cnt
];
305 raidz_impl_kstat_t
*cstat
= (raidz_impl_kstat_t
*)data
;
309 ASSERT3U(size
, >=, RAIDZ_KSTAT_LINE_LEN
);
311 if (cstat
== fstat
) {
312 off
+= snprintf(buf
+ off
, size
- off
, "%-17s", "fastest");
314 for (i
= 0; i
< ARRAY_SIZE(raidz_gen_name
); i
++) {
315 int id
= fstat
->gen
[i
];
316 off
+= snprintf(buf
+ off
, size
- off
, "%-16s",
317 raidz_supp_impl
[id
]->name
);
319 for (i
= 0; i
< ARRAY_SIZE(raidz_rec_name
); i
++) {
320 int id
= fstat
->rec
[i
];
321 off
+= snprintf(buf
+ off
, size
- off
, "%-16s",
322 raidz_supp_impl
[id
]->name
);
325 ptrdiff_t id
= cstat
- raidz_impl_kstats
;
327 off
+= snprintf(buf
+ off
, size
- off
, "%-17s",
328 raidz_supp_impl
[id
]->name
);
330 for (i
= 0; i
< ARRAY_SIZE(raidz_gen_name
); i
++)
331 off
+= snprintf(buf
+ off
, size
- off
, "%-16llu",
332 (u_longlong_t
)cstat
->gen
[i
]);
334 for (i
= 0; i
< ARRAY_SIZE(raidz_rec_name
); i
++)
335 off
+= snprintf(buf
+ off
, size
- off
, "%-16llu",
336 (u_longlong_t
)cstat
->rec
[i
]);
339 (void) snprintf(buf
+ off
, size
- off
, "\n");
345 raidz_math_kstat_addr(kstat_t
*ksp
, loff_t n
)
347 if (n
<= raidz_supp_impl_cnt
)
348 ksp
->ks_private
= (void *) (raidz_impl_kstats
+ n
);
350 ksp
->ks_private
= NULL
;
352 return (ksp
->ks_private
);
/* Geometry of the benchmark map: 8 data columns plus PARITY_PQR parity */
#define	BENCH_D_COLS	(8ULL)
#define	BENCH_COLS	(BENCH_D_COLS + PARITY_PQR)
/* Size of the fake zio used for benchmarking */
#define	BENCH_ZIO_SIZE	(1ULL << SPA_OLD_MAXBLOCKSHIFT)	/* 128 kiB */
/* Minimum time each method is exercised */
#define	BENCH_NS	MSEC2NSEC(25)			/* 25ms */

/*
 * Benchmark driver callback: run one operation on rm.  fn is the method
 * index being benchmarked.
 */
typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn);
363 benchmark_gen_impl(raidz_map_t
*rm
, const int fn
)
366 vdev_raidz_generate_parity(rm
);
370 benchmark_rec_impl(raidz_map_t
*rm
, const int fn
)
372 static const int rec_tgt
[7][3] = {
373 {1, 2, 3}, /* rec_p: bad QR & D[0] */
374 {0, 2, 3}, /* rec_q: bad PR & D[0] */
375 {0, 1, 3}, /* rec_r: bad PQ & D[0] */
376 {2, 3, 4}, /* rec_pq: bad R & D[0][1] */
377 {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */
378 {0, 3, 4}, /* rec_qr: bad P & D[0][1] */
379 {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */
382 vdev_raidz_reconstruct(rm
, rec_tgt
[fn
], 3);
386 * Benchmarking of all supported implementations (raidz_supp_impl_cnt)
387 * is performed by setting the rm_ops pointer and calling the top level
388 * generate/reconstruct methods of bench_rm.
391 benchmark_raidz_impl(raidz_map_t
*bench_rm
, const int fn
, benchmark_fn bench_fn
)
393 uint64_t run_cnt
, speed
, best_speed
= 0;
394 hrtime_t t_start
, t_diff
;
395 raidz_impl_ops_t
*curr_impl
;
396 raidz_impl_kstat_t
*fstat
= &raidz_impl_kstats
[raidz_supp_impl_cnt
];
399 for (impl
= 0; impl
< raidz_supp_impl_cnt
; impl
++) {
400 /* set an implementation to benchmark */
401 curr_impl
= raidz_supp_impl
[impl
];
402 bench_rm
->rm_ops
= curr_impl
;
405 t_start
= gethrtime();
408 for (i
= 0; i
< 25; i
++, run_cnt
++)
409 bench_fn(bench_rm
, fn
);
411 t_diff
= gethrtime() - t_start
;
412 } while (t_diff
< BENCH_NS
);
414 speed
= run_cnt
* BENCH_ZIO_SIZE
* NANOSEC
;
415 speed
/= (t_diff
* BENCH_COLS
);
417 if (bench_fn
== benchmark_gen_impl
)
418 raidz_impl_kstats
[impl
].gen
[fn
] = speed
;
420 raidz_impl_kstats
[impl
].rec
[fn
] = speed
;
422 /* Update fastest implementation method */
423 if (speed
> best_speed
) {
426 if (bench_fn
== benchmark_gen_impl
) {
427 fstat
->gen
[fn
] = impl
;
428 vdev_raidz_fastest_impl
.gen
[fn
] =
431 fstat
->rec
[fn
] = impl
;
432 vdev_raidz_fastest_impl
.rec
[fn
] =
440 vdev_raidz_math_init(void)
442 raidz_impl_ops_t
*curr_impl
;
443 zio_t
*bench_zio
= NULL
;
444 raidz_map_t
*bench_rm
= NULL
;
445 uint64_t bench_parity
;
448 /* move supported impl into raidz_supp_impl */
449 for (i
= 0, c
= 0; i
< ARRAY_SIZE(raidz_all_maths
); i
++) {
450 curr_impl
= (raidz_impl_ops_t
*)raidz_all_maths
[i
];
452 /* initialize impl */
456 if (curr_impl
->is_supported())
457 raidz_supp_impl
[c
++] = (raidz_impl_ops_t
*)curr_impl
;
459 membar_producer(); /* complete raidz_supp_impl[] init */
460 raidz_supp_impl_cnt
= c
; /* number of supported impl */
462 #if !defined(_KERNEL)
463 /* Skip benchmarking and use last implementation as fastest */
464 memcpy(&vdev_raidz_fastest_impl
, raidz_supp_impl
[raidz_supp_impl_cnt
-1],
465 sizeof (vdev_raidz_fastest_impl
));
466 strcpy(vdev_raidz_fastest_impl
.name
, "fastest");
468 raidz_math_initialized
= B_TRUE
;
470 /* Use 'cycle' math selection method for userspace */
471 VERIFY0(vdev_raidz_impl_set("cycle"));
475 /* Fake an zio and run the benchmark on a warmed up buffer */
476 bench_zio
= kmem_zalloc(sizeof (zio_t
), KM_SLEEP
);
477 bench_zio
->io_offset
= 0;
478 bench_zio
->io_size
= BENCH_ZIO_SIZE
; /* only data columns */
479 bench_zio
->io_abd
= abd_alloc_linear(BENCH_ZIO_SIZE
, B_TRUE
);
480 memset(abd_to_buf(bench_zio
->io_abd
), 0xAA, BENCH_ZIO_SIZE
);
482 /* Benchmark parity generation methods */
483 for (fn
= 0; fn
< RAIDZ_GEN_NUM
; fn
++) {
484 bench_parity
= fn
+ 1;
485 /* New raidz_map is needed for each generate_p/q/r */
486 bench_rm
= vdev_raidz_map_alloc(bench_zio
, SPA_MINBLOCKSHIFT
,
487 BENCH_D_COLS
+ bench_parity
, bench_parity
);
489 benchmark_raidz_impl(bench_rm
, fn
, benchmark_gen_impl
);
491 vdev_raidz_map_free(bench_rm
);
494 /* Benchmark data reconstruction methods */
495 bench_rm
= vdev_raidz_map_alloc(bench_zio
, SPA_MINBLOCKSHIFT
,
496 BENCH_COLS
, PARITY_PQR
);
498 for (fn
= 0; fn
< RAIDZ_REC_NUM
; fn
++)
499 benchmark_raidz_impl(bench_rm
, fn
, benchmark_rec_impl
);
501 vdev_raidz_map_free(bench_rm
);
503 /* cleanup the bench zio */
504 abd_free(bench_zio
->io_abd
);
505 kmem_free(bench_zio
, sizeof (zio_t
));
507 /* install kstats for all impl */
508 raidz_math_kstat
= kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
509 KSTAT_TYPE_RAW
, 0, KSTAT_FLAG_VIRTUAL
);
511 if (raidz_math_kstat
!= NULL
) {
512 raidz_math_kstat
->ks_data
= NULL
;
513 raidz_math_kstat
->ks_ndata
= UINT32_MAX
;
514 kstat_set_raw_ops(raidz_math_kstat
,
515 raidz_math_kstat_headers
,
516 raidz_math_kstat_data
,
517 raidz_math_kstat_addr
);
518 kstat_install(raidz_math_kstat
);
521 /* Finish initialization */
522 atomic_swap_32(&zfs_vdev_raidz_impl
, user_sel_impl
);
523 raidz_math_initialized
= B_TRUE
;
527 vdev_raidz_math_fini(void)
529 raidz_impl_ops_t
const *curr_impl
;
532 if (raidz_math_kstat
!= NULL
) {
533 kstat_delete(raidz_math_kstat
);
534 raidz_math_kstat
= NULL
;
538 for (i
= 0; i
< ARRAY_SIZE(raidz_all_maths
); i
++) {
539 curr_impl
= raidz_all_maths
[i
];
545 static const struct {
548 } math_impl_opts
[] = {
549 #if !defined(_KERNEL)
550 { "cycle", IMPL_CYCLE
},
552 { "fastest", IMPL_FASTEST
},
553 { "original", IMPL_ORIGINAL
},
554 { "scalar", IMPL_SCALAR
}
558 * Function sets desired raidz implementation.
560 * If we are called before init(), user preference will be saved in
561 * user_sel_impl, and applied in later init() call. This occurs when module
562 * parameter is specified on module load. Otherwise, directly update
563 * zfs_vdev_raidz_impl.
565 * @val Name of raidz implementation to use
569 vdev_raidz_impl_set(const char *val
)
572 char req_name
[RAIDZ_IMPL_NAME_MAX
];
573 uint32_t impl
= RAIDZ_IMPL_READ(user_sel_impl
);
577 i
= strnlen(val
, RAIDZ_IMPL_NAME_MAX
);
578 if (i
== 0 || i
== RAIDZ_IMPL_NAME_MAX
)
581 strlcpy(req_name
, val
, RAIDZ_IMPL_NAME_MAX
);
582 while (i
> 0 && !!isspace(req_name
[i
-1]))
586 /* Check mandatory options */
587 for (i
= 0; i
< ARRAY_SIZE(math_impl_opts
); i
++) {
588 if (strcmp(req_name
, math_impl_opts
[i
].name
) == 0) {
589 impl
= math_impl_opts
[i
].sel
;
595 /* check all supported impl if init() was already called */
596 if (err
!= 0 && raidz_math_initialized
) {
597 /* check all supported implementations */
598 for (i
= 0; i
< raidz_supp_impl_cnt
; i
++) {
599 if (strcmp(req_name
, raidz_supp_impl
[i
]->name
) == 0) {
608 if (raidz_math_initialized
)
609 atomic_swap_32(&zfs_vdev_raidz_impl
, impl
);
611 atomic_swap_32(&user_sel_impl
, impl
);
618 #include <linux/mod_compat.h>
621 zfs_vdev_raidz_impl_set(const char *val
, zfs_kernel_param_t
*kp
)
623 return (vdev_raidz_impl_set(val
));
627 zfs_vdev_raidz_impl_get(char *buffer
, zfs_kernel_param_t
*kp
)
631 const uint32_t impl
= RAIDZ_IMPL_READ(zfs_vdev_raidz_impl
);
633 ASSERT(raidz_math_initialized
);
635 /* list mandatory options */
636 for (i
= 0; i
< ARRAY_SIZE(math_impl_opts
) - 2; i
++) {
637 fmt
= (impl
== math_impl_opts
[i
].sel
) ? "[%s] " : "%s ";
638 cnt
+= sprintf(buffer
+ cnt
, fmt
, math_impl_opts
[i
].name
);
641 /* list all supported implementations */
642 for (i
= 0; i
< raidz_supp_impl_cnt
; i
++) {
643 fmt
= (i
== impl
) ? "[%s] " : "%s ";
644 cnt
+= sprintf(buffer
+ cnt
, fmt
, raidz_supp_impl
[i
]->name
);
650 module_param_call(zfs_vdev_raidz_impl
, zfs_vdev_raidz_impl_set
,
651 zfs_vdev_raidz_impl_get
, NULL
, 0644);
652 MODULE_PARM_DESC(zfs_vdev_raidz_impl
, "Select raidz implementation.");