4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
25 #include <sys/zfs_context.h>
26 #include <sys/types.h>
28 #include <sys/debug.h>
29 #include <sys/zfs_debug.h>
31 #include <sys/vdev_raidz.h>
32 #include <sys/vdev_raidz_impl.h>
/*
 * RAID-Z math implementations provided by other translation units.
 * Only the scalar implementation is unconditionally compiled in;
 * the SSE/AVX2 variants are guarded by the arch/feature #ifs below.
 */
34 extern const raidz_impl_ops_t vdev_raidz_scalar_impl
;
35 extern const raidz_impl_ops_t vdev_raidz_sse_impl
;
36 extern const raidz_impl_ops_t vdev_raidz_avx2_impl
;
38 /* All compiled in implementations */
/*
 * NOTE(review): in this excerpt the conditional array entries and the
 * matching #endif / closing "};" lines appear to be elided -- confirm
 * against the full source before relying on the array contents.
 */
39 const raidz_impl_ops_t
*raidz_all_maths
[] = {
40 &vdev_raidz_scalar_impl
,
41 #if defined(__x86_64) && defined(HAVE_SSSE3) /* only x86_64 for now */
44 #if defined(__x86_64) && defined(HAVE_AVX2) /* only x86_64 for now */
49 /* Indicate that benchmark has been completed */
/* Set to B_TRUE once init (and, in-kernel, benchmarking) has finished. */
50 static boolean_t raidz_math_initialized
= B_FALSE
;
52 /* Select raidz implementation */
/*
 * Implementation selector.  NOTE(review): the enumerator list (lines
 * between the brace and the closing "}") is elided in this excerpt;
 * visible uses imply at least IMPL_SCALAR, IMPL_CYCLE, IMPL_FASTEST and
 * IMPL_ORIGINAL exist -- confirm against the full source.
 */
53 static enum vdev_raidz_impl_sel
{
58 } zfs_vdev_raidz_impl
= IMPL_SCALAR
;
60 /* selected implementation and its lock */
/* Guards zfs_vdev_raidz_impl / vdev_raidz_used_impl (readers in hot path). */
61 static krwlock_t vdev_raidz_impl_lock
;
/* Currently selected ops table; defaults to scalar until init completes. */
62 static raidz_impl_ops_t
*vdev_raidz_used_impl
=
63 (raidz_impl_ops_t
*) &vdev_raidz_scalar_impl
;
/* B_TRUE once a user explicitly selected an implementation by name. */
64 static boolean_t vdev_raidz_impl_user_set
= B_FALSE
;
66 /* RAIDZ op that contain the fastest routines */
/* Composite ops table: per-function winners of the benchmark below. */
67 static raidz_impl_ops_t vdev_raidz_fastest_impl
= {
71 /* Hold all supported implementations */
/* Count and NULL-terminated array of implementations usable on this CPU. */
72 size_t raidz_supp_impl_cnt
= 1;
73 raidz_impl_ops_t
*raidz_supp_impl
[ARRAY_SIZE(raidz_all_maths
) + 1] = {
74 (raidz_impl_ops_t
*) &vdev_raidz_scalar_impl
, /* scalar is supported */
79 * kstats values for supported impl & original methods
80 * Values represent per disk throughput of 8 disk+parity raidz vdev (Bps)
/* One kstat record per supported impl plus one for the "original" code. */
82 static raidz_impl_kstat_t raidz_impl_kstats
[ARRAY_SIZE(raidz_all_maths
) + 1];
84 /* kstat for benchmarked implementations */
85 static kstat_t
*raidz_math_kstat
= NULL
;
88 * Selects the raidz operation for raidz_map
89 * If rm_ops is set to NULL original raidz implementation will be used
/*
 * Install the currently selected math ops into rm->rm_ops under the
 * read lock.  In IMPL_CYCLE mode (userspace testing) each call rotates
 * through every supported implementation plus the NULL sentinel, which
 * selects the original (non-vectorized) code path.
 */
92 vdev_raidz_math_get_ops(raidz_map_t
*rm
)
94 rw_enter(&vdev_raidz_impl_lock
, RW_READER
);
96 rm
->rm_ops
= vdev_raidz_used_impl
;
99 if (zfs_vdev_raidz_impl
== IMPL_CYCLE
) {
/* Persistent counter so successive maps use successive impls. */
100 static size_t cycle_impl_idx
= 0;
103 * Cycle through all supported new implementations, and
104 * when idx == raidz_supp_impl_cnt, use the original
/* NOTE(review): declaration of "idx" is elided in this excerpt. */
106 idx
= (++cycle_impl_idx
) % (raidz_supp_impl_cnt
+ 1);
107 rm
->rm_ops
= raidz_supp_impl
[idx
];
111 rw_exit(&vdev_raidz_impl_lock
);
115 * Select parity generation method for raidz_map
/*
 * Dispatch parity generation: pick the gen_p / gen_pq / gen_pqr routine
 * from rm->rm_ops based on the map's parity count, then invoke it.
 * NOTE(review): the "case"/"default" labels, trailing panic argument and
 * the final call through gen_parity are elided in this excerpt.
 */
118 vdev_raidz_math_generate(raidz_map_t
*rm
)
120 raidz_gen_f gen_parity
= NULL
;
122 switch (raidz_parity(rm
)) {
124 gen_parity
= rm
->rm_ops
->gen
[RAIDZ_GEN_P
];
127 gen_parity
= rm
->rm_ops
->gen
[RAIDZ_GEN_PQ
];
130 gen_parity
= rm
->rm_ops
->gen
[RAIDZ_GEN_PQR
];
/* Unsupported parity count is a programming error -- panic loudly. */
134 cmn_err(CE_PANIC
, "invalid RAID-Z configuration %d",
139 ASSERT(gen_parity
!= NULL
);
/*
 * Pick the reconstruction routine for a single-parity (RAIDZ1) map.
 * Only rec_p exists: usable iff exactly one data column failed and the
 * P parity column is valid.  Returns NULL when hardware-accelerated
 * reconstruction cannot be used (caller falls back to original code).
 * NOTE(review): the second signature line (dt/nbaddata parameters, cf.
 * vdev_raidz_math_reconstruct) is elided in this excerpt.
 */
145 _reconstruct_fun_raidz1(raidz_map_t
*rm
, const int *parity_valid
,
148 if (nbaddata
== 1 && parity_valid
[CODE_P
]) {
149 return (rm
->rm_ops
->rec
[RAIDZ_REC_P
]);
151 return ((raidz_rec_f
) NULL
);
/*
 * Pick the reconstruction routine for a double-parity (RAIDZ2) map.
 * Single data failure: prefer P, then Q.  Double failure: requires both
 * P and Q (rec_pq).  Returns NULL if no suitable routine applies.
 * NOTE(review): the nbaddata guard on the single-failure branch and the
 * closing braces are elided in this excerpt.
 */
155 _reconstruct_fun_raidz2(raidz_map_t
*rm
, const int *parity_valid
,
159 if (parity_valid
[CODE_P
]) {
160 return (rm
->rm_ops
->rec
[RAIDZ_REC_P
]);
161 } else if (parity_valid
[CODE_Q
]) {
162 return (rm
->rm_ops
->rec
[RAIDZ_REC_Q
]);
164 } else if (nbaddata
== 2 &&
165 parity_valid
[CODE_P
] && parity_valid
[CODE_Q
]) {
166 return (rm
->rm_ops
->rec
[RAIDZ_REC_PQ
]);
168 return ((raidz_rec_f
) NULL
);
/*
 * Pick the reconstruction routine for a triple-parity (RAIDZ3) map.
 * Single failure: P, then Q, then R.  Double failure: the pair of valid
 * parities selects rec_pq / rec_pr / rec_qr.  Triple failure: rec_pqr
 * needs all three parities valid.  Returns NULL otherwise.
 */
172 _reconstruct_fun_raidz3(raidz_map_t
*rm
, const int *parity_valid
,
176 if (parity_valid
[CODE_P
]) {
177 return (rm
->rm_ops
->rec
[RAIDZ_REC_P
]);
178 } else if (parity_valid
[CODE_Q
]) {
179 return (rm
->rm_ops
->rec
[RAIDZ_REC_Q
]);
180 } else if (parity_valid
[CODE_R
]) {
181 return (rm
->rm_ops
->rec
[RAIDZ_REC_R
]);
183 } else if (nbaddata
== 2) {
184 if (parity_valid
[CODE_P
] && parity_valid
[CODE_Q
]) {
185 return (rm
->rm_ops
->rec
[RAIDZ_REC_PQ
]);
186 } else if (parity_valid
[CODE_P
] && parity_valid
[CODE_R
]) {
187 return (rm
->rm_ops
->rec
[RAIDZ_REC_PR
]);
188 } else if (parity_valid
[CODE_Q
] && parity_valid
[CODE_R
]) {
189 return (rm
->rm_ops
->rec
[RAIDZ_REC_QR
]);
191 } else if (nbaddata
== 3 &&
192 parity_valid
[CODE_P
] && parity_valid
[CODE_Q
] &&
193 parity_valid
[CODE_R
]) {
194 return (rm
->rm_ops
->rec
[RAIDZ_REC_PQR
]);
196 return ((raidz_rec_f
) NULL
);
200 * Select data reconstruction method for raidz_map
201 * @parity_valid - Parity validity flag
202 * @dt - Failed data index array
203 * @nbaddata - Number of failed data columns
/*
 * Dispatch by parity level to the _reconstruct_fun_raidzN selector,
 * then invoke the chosen routine on the failed-column list.
 * NOTE(review): the "case"/"default" labels and the trailing argument
 * lines of each selector call are elided in this excerpt.
 */
206 vdev_raidz_math_reconstruct(raidz_map_t
*rm
, const int *parity_valid
,
207 const int *dt
, const int nbaddata
)
209 raidz_rec_f rec_data
= NULL
;
211 switch (raidz_parity(rm
)) {
213 rec_data
= _reconstruct_fun_raidz1(rm
, parity_valid
,
217 rec_data
= _reconstruct_fun_raidz2(rm
, parity_valid
,
221 rec_data
= _reconstruct_fun_raidz3(rm
, parity_valid
,
/* Unsupported parity count is a programming error -- panic loudly. */
225 cmn_err(CE_PANIC
, "invalid RAID-Z configuration %d",
230 ASSERT(rec_data
!= NULL
);
232 return (rec_data(rm
, dt
));
/*
 * Human-readable names for the generate/reconstruct method slots, used
 * as kstat name suffixes; order matches the RAIDZ_GEN_* / RAIDZ_REC_*
 * indices used throughout this file.
 */
235 const char *raidz_gen_name
[] = {
236 "gen_p", "gen_pq", "gen_pqr"
238 const char *raidz_rec_name
[] = {
239 "rec_p", "rec_q", "rec_r",
240 "rec_pq", "rec_pr", "rec_qr", "rec_pqr"
/*
 * Initialize one raidz_impl_kstat_t record: for every gen_* and rec_*
 * slot, build the kstat name "<impl>_<method>" (truncated to fit
 * KSTAT_STRLEN) and zero the uint64 counter.
 *
 * NOTE(review): the name is assembled with three piecewise strncpy()
 * calls and none of them writes a terminating NUL explicitly; this is
 * only safe if the kstat name buffers are pre-zeroed (static storage)
 * and the combined length stays below KSTAT_STRLEN.  op_name_max is
 * clamped for that purpose, but an snprintf() would make termination
 * self-evident -- confirm against the full source (CERT STR32-C).
 */
244 init_raidz_kstat(raidz_impl_kstat_t
*rs
, const char *name
)
/* Bound the impl-name length, then compute room left for "_<method>". */
247 const size_t impl_name_len
= strnlen(name
, KSTAT_STRLEN
);
248 const size_t op_name_max
= (KSTAT_STRLEN
- 2) > impl_name_len
?
249 KSTAT_STRLEN
- impl_name_len
- 2 : 0;
/* Parity-generation counters: name = "<impl>_gen_*", zeroed uint64. */
251 for (i
= 0; i
< RAIDZ_GEN_NUM
; i
++) {
252 strncpy(rs
->gen
[i
].name
, name
, impl_name_len
);
253 strncpy(rs
->gen
[i
].name
+ impl_name_len
, "_", 1);
254 strncpy(rs
->gen
[i
].name
+ impl_name_len
+ 1,
255 raidz_gen_name
[i
], op_name_max
);
257 rs
->gen
[i
].data_type
= KSTAT_DATA_UINT64
;
258 rs
->gen
[i
].value
.ui64
= 0;
/* Reconstruction counters: name = "<impl>_rec_*", zeroed uint64. */
261 for (i
= 0; i
< RAIDZ_REC_NUM
; i
++) {
262 strncpy(rs
->rec
[i
].name
, name
, impl_name_len
);
263 strncpy(rs
->rec
[i
].name
+ impl_name_len
, "_", 1);
264 strncpy(rs
->rec
[i
].name
+ impl_name_len
+ 1,
265 raidz_rec_name
[i
], op_name_max
);
267 rs
->rec
[i
].data_type
= KSTAT_DATA_UINT64
;
268 rs
->rec
[i
].value
.ui64
= 0;
/*
 * Benchmark geometry: 8 data columns plus up to PQR parity, a 128 KiB
 * zio, and a minimum measurement window of 25 ms per implementation.
 */
272 #define BENCH_D_COLS (8ULL)
273 #define BENCH_COLS (BENCH_D_COLS + PARITY_PQR)
274 #define BENCH_ZIO_SIZE (2ULL << 17) /* 128 kiB */
275 #define BENCH_NS MSEC2NSEC(25) /* 25ms */
/* Signature shared by benchmark_gen_impl / benchmark_rec_impl below. */
277 typedef void (*benchmark_fn
)(raidz_map_t
*rm
, const int fn
);
280 benchmark_gen_impl(raidz_map_t
*rm
, const int fn
)
283 vdev_raidz_generate_parity(rm
);
287 benchmark_rec_impl(raidz_map_t
*rm
, const int fn
)
289 static const int rec_tgt
[7][3] = {
290 {1, 2, 3}, /* rec_p: bad QR & D[0] */
291 {0, 2, 3}, /* rec_q: bad PR & D[0] */
292 {0, 1, 3}, /* rec_r: bad PQ & D[0] */
293 {2, 3, 4}, /* rec_pq: bad R & D[0][1] */
294 {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */
295 {0, 3, 4}, /* rec_qr: bad P & D[0][1] */
296 {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */
299 vdev_raidz_reconstruct(rm
, rec_tgt
[fn
], 3);
303 * Benchmarking of all supported implementations (raidz_supp_impl_cnt)
304 * is performed by setting the rm_ops pointer and calling the top level
305 * generate/reconstruct methods of bench_rm.
/*
 * Time method 'fn' of every supported implementation (plus the NULL
 * sentinel = original code) via bench_fn, record per-disk throughput
 * (Bps) in raidz_impl_kstats, and fold the fastest routine into
 * vdev_raidz_fastest_impl.
 * NOTE(review): loop-variable declarations, the run_cnt reset, the
 * do{ opener and the assignment targets inside the best-speed branch
 * are elided in this excerpt.
 */
308 benchmark_raidz_impl(raidz_map_t
*bench_rm
, const int fn
, benchmark_fn bench_fn
)
310 uint64_t run_cnt
, speed
, best_speed
= 0;
311 hrtime_t t_start
, t_diff
;
312 raidz_impl_ops_t
*curr_impl
;
316 * Use the sentinel (NULL) from the end of raidz_supp_impl_cnt
317 * to run "original" implementation (bench_rm->rm_ops = NULL)
319 for (impl
= 0; impl
<= raidz_supp_impl_cnt
; impl
++) {
320 /* set an implementation to benchmark */
321 curr_impl
= raidz_supp_impl
[impl
];
322 bench_rm
->rm_ops
= curr_impl
;
325 t_start
= gethrtime();
/* Batch 25 calls per clock read to amortize gethrtime() cost. */
328 for (i
= 0; i
< 25; i
++, run_cnt
++)
329 bench_fn(bench_rm
, fn
);
331 t_diff
= gethrtime() - t_start
;
332 } while (t_diff
< BENCH_NS
);
/* bytes processed * NANOSEC / (elapsed ns * columns) => per-disk Bps */
334 speed
= run_cnt
* BENCH_ZIO_SIZE
* NANOSEC
;
335 speed
/= (t_diff
* BENCH_COLS
);
337 if (bench_fn
== benchmark_gen_impl
)
338 raidz_impl_kstats
[impl
].gen
[fn
].value
.ui64
= speed
;
340 raidz_impl_kstats
[impl
].rec
[fn
].value
.ui64
= speed
;
342 /* if curr_impl==NULL the original impl is benchmarked */
343 if (curr_impl
!= NULL
&& speed
> best_speed
) {
346 if (bench_fn
== benchmark_gen_impl
)
347 vdev_raidz_fastest_impl
.gen
[fn
] =
350 vdev_raidz_fastest_impl
.rec
[fn
] =
/*
 * Module init: discover which implementations this CPU supports, then
 * (kernel only) benchmark every gen_*/rec_* method on a synthetic zio,
 * publish results via a kstat, and select the "fastest" composite impl
 * unless the user already forced one.  Userspace skips benchmarking and
 * pins the "cycle" selector for test coverage.
 * NOTE(review): loop-variable declarations, the impl init call and the
 * kstat-name argument, plus several braces, are elided in this excerpt.
 */
357 vdev_raidz_math_init(void)
359 raidz_impl_ops_t
*curr_impl
;
360 zio_t
*bench_zio
= NULL
;
361 raidz_map_t
*bench_rm
= NULL
;
362 uint64_t bench_parity
;
365 /* init & vdev_raidz_impl_lock */
366 rw_init(&vdev_raidz_impl_lock
, NULL
, RW_DEFAULT
, NULL
);
368 /* move supported impl into raidz_supp_impl */
/* Compact the CPU-supported subset of raidz_all_maths into index c. */
369 for (i
= 0, c
= 0; i
< ARRAY_SIZE(raidz_all_maths
); i
++) {
370 curr_impl
= (raidz_impl_ops_t
*) raidz_all_maths
[i
];
372 /* initialize impl */
376 if (curr_impl
->is_supported()) {
378 init_raidz_kstat(&raidz_impl_kstats
[c
],
380 raidz_supp_impl
[c
++] = (raidz_impl_ops_t
*) curr_impl
;
383 raidz_supp_impl_cnt
= c
; /* number of supported impl */
384 raidz_supp_impl
[c
] = NULL
; /* sentinel */
386 /* init kstat for original routines */
387 init_raidz_kstat(&(raidz_impl_kstats
[raidz_supp_impl_cnt
]), "original");
389 #if !defined(_KERNEL)
391 * Skip benchmarking and use last implementation as fastest
393 memcpy(&vdev_raidz_fastest_impl
, raidz_supp_impl
[raidz_supp_impl_cnt
-1],
394 sizeof (vdev_raidz_fastest_impl
));
396 vdev_raidz_fastest_impl
.name
= "fastest";
398 raidz_math_initialized
= B_TRUE
;
400 /* Use 'cycle' math selection method for userspace */
401 VERIFY0(vdev_raidz_impl_set("cycle"));
405 /* Fake an zio and run the benchmark on it */
406 bench_zio
= kmem_zalloc(sizeof (zio_t
), KM_SLEEP
);
407 bench_zio
->io_offset
= 0;
408 bench_zio
->io_size
= BENCH_ZIO_SIZE
; /* only data columns */
409 bench_zio
->io_data
= zio_data_buf_alloc(BENCH_ZIO_SIZE
);
410 VERIFY(bench_zio
->io_data
);
412 /* Benchmark parity generation methods */
413 for (fn
= 0; fn
< RAIDZ_GEN_NUM
; fn
++) {
414 bench_parity
= fn
+ 1;
415 /* New raidz_map is needed for each generate_p/q/r */
416 bench_rm
= vdev_raidz_map_alloc(bench_zio
, 9,
417 BENCH_D_COLS
+ bench_parity
, bench_parity
);
419 benchmark_raidz_impl(bench_rm
, fn
, benchmark_gen_impl
);
421 vdev_raidz_map_free(bench_rm
);
424 /* Benchmark data reconstruction methods */
/* One PQR map suffices: every rec_* method is exercised on it. */
425 bench_rm
= vdev_raidz_map_alloc(bench_zio
, 9, BENCH_COLS
, PARITY_PQR
);
427 for (fn
= 0; fn
< RAIDZ_REC_NUM
; fn
++)
428 benchmark_raidz_impl(bench_rm
, fn
, benchmark_rec_impl
);
430 vdev_raidz_map_free(bench_rm
);
432 /* cleanup the bench zio */
433 zio_data_buf_free(bench_zio
->io_data
, BENCH_ZIO_SIZE
);
434 kmem_free(bench_zio
, sizeof (zio_t
));
436 /* install kstats for all impl */
437 raidz_math_kstat
= kstat_create("zfs", 0, "vdev_raidz_bench",
438 "misc", KSTAT_TYPE_NAMED
,
439 sizeof (raidz_impl_kstat_t
) / sizeof (kstat_named_t
) *
440 (raidz_supp_impl_cnt
+ 1), KSTAT_FLAG_VIRTUAL
);
/* kstat_create may fail; benchmarking results are then simply unpublished. */
442 if (raidz_math_kstat
!= NULL
) {
443 raidz_math_kstat
->ks_data
= raidz_impl_kstats
;
444 kstat_install(raidz_math_kstat
);
447 /* Finish initialization */
448 raidz_math_initialized
= B_TRUE
;
/* Respect an explicit user selection made before init completed. */
449 if (!vdev_raidz_impl_user_set
)
450 VERIFY0(vdev_raidz_impl_set("fastest"));
/*
 * Module teardown: remove the benchmark kstat, destroy the selector
 * lock, and walk raidz_all_maths for per-implementation cleanup.
 * NOTE(review): the body of the per-impl loop (presumably an impl fini
 * hook) is elided in this excerpt -- confirm against the full source.
 */
454 vdev_raidz_math_fini(void)
456 raidz_impl_ops_t
const *curr_impl
;
459 if (raidz_math_kstat
!= NULL
) {
460 kstat_delete(raidz_math_kstat
);
461 raidz_math_kstat
= NULL
;
464 rw_destroy(&vdev_raidz_impl_lock
);
467 for (i
= 0; i
< ARRAY_SIZE(raidz_all_maths
); i
++) {
468 curr_impl
= raidz_all_maths
[i
];
/*
 * Table of the "mandatory" selector names accepted in addition to the
 * concrete implementation names: fastest / original / (userspace only)
 * cycle.  A NULL impl means "use the original code path".
 * NOTE(review): the struct header line and the closing #endif / "};"
 * are elided in this excerpt.
 */
478 raidz_impl_ops_t
*impl
;
479 enum vdev_raidz_impl_sel sel
;
480 } math_impl_opts
[] = {
481 { "fastest", &vdev_raidz_fastest_impl
, IMPL_FASTEST
},
482 { "original", NULL
, IMPL_ORIGINAL
},
483 #if !defined(_KERNEL)
484 { "cycle", NULL
, IMPL_CYCLE
},
489 * Function sets desired raidz implementation.
490 * If called after module_init(), vdev_raidz_impl_lock must be held for writing.
492 * @val Name of raidz implementation to use
/*
 * Match 'val' first against the mandatory selector table, then against
 * every supported implementation name; on a hit record both the enum
 * selector and the ops pointer and mark the choice user-set.
 * NOTE(review): the return statements / error value for "no match" are
 * elided in this excerpt.  Note zfs_vdev_raidz_impl doubles as a plain
 * index in the second loop -- the enum's explicit values must not
 * collide with valid indices; confirm against the full source.
 */
496 zfs_vdev_raidz_impl_set(const char *val
, struct kernel_param
*kp
)
500 /* Check mandatory options */
501 for (i
= 0; i
< ARRAY_SIZE(math_impl_opts
); i
++) {
502 if (strcmp(val
, math_impl_opts
[i
].name
) == 0) {
503 zfs_vdev_raidz_impl
= math_impl_opts
[i
].sel
;
504 vdev_raidz_used_impl
= math_impl_opts
[i
].impl
;
505 vdev_raidz_impl_user_set
= B_TRUE
;
510 /* check all supported implementations */
511 for (i
= 0; i
< raidz_supp_impl_cnt
; i
++) {
512 if (strcmp(val
, raidz_supp_impl
[i
]->name
) == 0) {
513 zfs_vdev_raidz_impl
= i
;
514 vdev_raidz_used_impl
= raidz_supp_impl
[i
];
515 vdev_raidz_impl_user_set
= B_TRUE
;
/*
 * Public wrapper: take the selector lock for writing and delegate to
 * zfs_vdev_raidz_impl_set().  Must only be called after init (asserted).
 * NOTE(review): declaration of "err" and the return are elided in this
 * excerpt.
 */
524 vdev_raidz_impl_set(const char *val
)
528 ASSERT(raidz_math_initialized
);
530 rw_enter(&vdev_raidz_impl_lock
, RW_WRITER
);
531 err
= zfs_vdev_raidz_impl_set(val
, NULL
);
532 rw_exit(&vdev_raidz_impl_lock
);
536 #if defined(_KERNEL) && defined(HAVE_SPL)
/*
 * module_param getter: print every selectable name into 'buffer',
 * bracketing the currently active one ("[fastest] original scalar ...").
 * NOTE(review): declarations of cnt/fmt, the fmt assignment in the first
 * loop and the return of cnt are elided in this excerpt.  The unbounded
 * sprintf() relies on the kernel's 4 KiB param-buffer convention --
 * confirm total output length stays well below that.
 */
538 zfs_vdev_raidz_impl_get(char *buffer
, struct kernel_param
*kp
)
543 ASSERT(raidz_math_initialized
);
545 rw_enter(&vdev_raidz_impl_lock
, RW_READER
);
547 /* list mandatory options */
548 for (i
= 0; i
< ARRAY_SIZE(math_impl_opts
); i
++) {
549 if (math_impl_opts
[i
].sel
== zfs_vdev_raidz_impl
)
554 cnt
+= sprintf(buffer
+ cnt
, fmt
, math_impl_opts
[i
].name
);
557 /* list all supported implementations */
558 for (i
= 0; i
< raidz_supp_impl_cnt
; i
++) {
559 fmt
= (i
== zfs_vdev_raidz_impl
) ? "[%s] " : "%s ";
560 cnt
+= sprintf(buffer
+ cnt
, fmt
, raidz_supp_impl
[i
]->name
);
563 rw_exit(&vdev_raidz_impl_lock
);
/*
 * Expose the implementation selector as a read/write (0644) Linux
 * module parameter wired to the get/set handlers above.
 */
568 module_param_call(zfs_vdev_raidz_impl
, zfs_vdev_raidz_impl_set
,
569 zfs_vdev_raidz_impl_get
, NULL
, 0644);
570 MODULE_PARM_DESC(zfs_vdev_raidz_impl
, "Select raidz implementation.");