4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
25 #include <sys/zfs_context.h>
26 #include <sys/types.h>
28 #include <sys/debug.h>
29 #include <sys/zfs_debug.h>
31 #include <sys/vdev_raidz.h>
32 #include <sys/vdev_raidz_impl.h>
34 extern boolean_t
raidz_will_scalar_work(void);
36 /* Opaque implementation with NULL methods to represent original methods */
37 static const raidz_impl_ops_t vdev_raidz_original_impl
= {
39 .is_supported
= raidz_will_scalar_work
,
42 /* RAIDZ parity op that contain the fastest methods */
43 static raidz_impl_ops_t vdev_raidz_fastest_impl
= {
47 /* All compiled in implementations */
48 const raidz_impl_ops_t
*raidz_all_maths
[] = {
49 &vdev_raidz_original_impl
,
50 &vdev_raidz_scalar_impl
,
51 #if defined(__x86_64) && defined(HAVE_SSE2) /* only x86_64 for now */
52 &vdev_raidz_sse2_impl
,
54 #if defined(__x86_64) && defined(HAVE_SSSE3) /* only x86_64 for now */
55 &vdev_raidz_ssse3_impl
,
57 #if defined(__x86_64) && defined(HAVE_AVX2) /* only x86_64 for now */
58 &vdev_raidz_avx2_impl
,
60 #if defined(__x86_64) && defined(HAVE_AVX512F) /* only x86_64 for now */
61 &vdev_raidz_avx512f_impl
,
63 #if defined(__x86_64) && defined(HAVE_AVX512BW) /* only x86_64 for now */
64 &vdev_raidz_avx512bw_impl
,
66 #if defined(__aarch64__)
67 &vdev_raidz_aarch64_neon_impl
,
68 &vdev_raidz_aarch64_neonx2_impl
,
72 /* Indicate that benchmark has been completed */
73 static boolean_t raidz_math_initialized
= B_FALSE
;
75 /* Select raidz implementation */
76 #define IMPL_FASTEST (UINT32_MAX)
77 #define IMPL_CYCLE (UINT32_MAX - 1)
78 #define IMPL_ORIGINAL (0)
79 #define IMPL_SCALAR (1)
81 #define RAIDZ_IMPL_READ(i) (*(volatile uint32_t *) &(i))
83 static uint32_t zfs_vdev_raidz_impl
= IMPL_SCALAR
;
84 static uint32_t user_sel_impl
= IMPL_FASTEST
;
86 /* Hold all supported implementations */
87 static size_t raidz_supp_impl_cnt
= 0;
88 static raidz_impl_ops_t
*raidz_supp_impl
[ARRAY_SIZE(raidz_all_maths
)];
91 * kstats values for supported implementations
92 * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
94 static raidz_impl_kstat_t raidz_impl_kstats
[ARRAY_SIZE(raidz_all_maths
) + 1];
96 /* kstat for benchmarked implementations */
97 static kstat_t
*raidz_math_kstat
= NULL
;
100 * Selects the raidz operation for raidz_map
101 * If rm_ops is set to NULL original raidz implementation will be used
104 vdev_raidz_math_get_ops()
106 raidz_impl_ops_t
*ops
= NULL
;
107 const uint32_t impl
= RAIDZ_IMPL_READ(zfs_vdev_raidz_impl
);
111 ASSERT(raidz_math_initialized
);
112 ops
= &vdev_raidz_fastest_impl
;
114 #if !defined(_KERNEL)
117 ASSERT(raidz_math_initialized
);
118 ASSERT3U(raidz_supp_impl_cnt
, >, 0);
119 /* Cycle through all supported implementations */
120 static size_t cycle_impl_idx
= 0;
121 size_t idx
= (++cycle_impl_idx
) % raidz_supp_impl_cnt
;
122 ops
= raidz_supp_impl
[idx
];
127 ops
= (raidz_impl_ops_t
*)&vdev_raidz_original_impl
;
130 ops
= (raidz_impl_ops_t
*)&vdev_raidz_scalar_impl
;
133 ASSERT3U(impl
, <, raidz_supp_impl_cnt
);
134 ASSERT3U(raidz_supp_impl_cnt
, >, 0);
135 ops
= raidz_supp_impl
[impl
];
139 ASSERT3P(ops
, !=, NULL
);
145 * Select parity generation method for raidz_map
148 vdev_raidz_math_generate(raidz_map_t
*rm
)
150 raidz_gen_f gen_parity
= NULL
;
152 switch (raidz_parity(rm
)) {
154 gen_parity
= rm
->rm_ops
->gen
[RAIDZ_GEN_P
];
157 gen_parity
= rm
->rm_ops
->gen
[RAIDZ_GEN_PQ
];
160 gen_parity
= rm
->rm_ops
->gen
[RAIDZ_GEN_PQR
];
164 cmn_err(CE_PANIC
, "invalid RAID-Z configuration %d",
169 /* if method is NULL execute the original implementation */
170 if (gen_parity
== NULL
)
171 return (RAIDZ_ORIGINAL_IMPL
);
179 reconstruct_fun_p_sel(raidz_map_t
*rm
, const int *parity_valid
,
182 if (nbaddata
== 1 && parity_valid
[CODE_P
]) {
183 return (rm
->rm_ops
->rec
[RAIDZ_REC_P
]);
185 return ((raidz_rec_f
) NULL
);
189 reconstruct_fun_pq_sel(raidz_map_t
*rm
, const int *parity_valid
,
193 if (parity_valid
[CODE_P
]) {
194 return (rm
->rm_ops
->rec
[RAIDZ_REC_P
]);
195 } else if (parity_valid
[CODE_Q
]) {
196 return (rm
->rm_ops
->rec
[RAIDZ_REC_Q
]);
198 } else if (nbaddata
== 2 &&
199 parity_valid
[CODE_P
] && parity_valid
[CODE_Q
]) {
200 return (rm
->rm_ops
->rec
[RAIDZ_REC_PQ
]);
202 return ((raidz_rec_f
) NULL
);
206 reconstruct_fun_pqr_sel(raidz_map_t
*rm
, const int *parity_valid
,
210 if (parity_valid
[CODE_P
]) {
211 return (rm
->rm_ops
->rec
[RAIDZ_REC_P
]);
212 } else if (parity_valid
[CODE_Q
]) {
213 return (rm
->rm_ops
->rec
[RAIDZ_REC_Q
]);
214 } else if (parity_valid
[CODE_R
]) {
215 return (rm
->rm_ops
->rec
[RAIDZ_REC_R
]);
217 } else if (nbaddata
== 2) {
218 if (parity_valid
[CODE_P
] && parity_valid
[CODE_Q
]) {
219 return (rm
->rm_ops
->rec
[RAIDZ_REC_PQ
]);
220 } else if (parity_valid
[CODE_P
] && parity_valid
[CODE_R
]) {
221 return (rm
->rm_ops
->rec
[RAIDZ_REC_PR
]);
222 } else if (parity_valid
[CODE_Q
] && parity_valid
[CODE_R
]) {
223 return (rm
->rm_ops
->rec
[RAIDZ_REC_QR
]);
225 } else if (nbaddata
== 3 &&
226 parity_valid
[CODE_P
] && parity_valid
[CODE_Q
] &&
227 parity_valid
[CODE_R
]) {
228 return (rm
->rm_ops
->rec
[RAIDZ_REC_PQR
]);
230 return ((raidz_rec_f
) NULL
);
234 * Select data reconstruction method for raidz_map
235 * @parity_valid - Parity validity flag
236 * @dt - Failed data index array
237 * @nbaddata - Number of failed data columns
240 vdev_raidz_math_reconstruct(raidz_map_t
*rm
, const int *parity_valid
,
241 const int *dt
, const int nbaddata
)
243 raidz_rec_f rec_fn
= NULL
;
245 switch (raidz_parity(rm
)) {
247 rec_fn
= reconstruct_fun_p_sel(rm
, parity_valid
, nbaddata
);
250 rec_fn
= reconstruct_fun_pq_sel(rm
, parity_valid
, nbaddata
);
253 rec_fn
= reconstruct_fun_pqr_sel(rm
, parity_valid
, nbaddata
);
256 cmn_err(CE_PANIC
, "invalid RAID-Z configuration %d",
262 return (RAIDZ_ORIGINAL_IMPL
);
264 return (rec_fn(rm
, dt
));
/* Column titles for the parity generation methods, in gen[] slot order */
const char *raidz_gen_name[] = {
	"gen_p",
	"gen_pq",
	"gen_pqr"
};
/* Column titles for the reconstruction methods, in rec[] slot order */
const char *raidz_rec_name[] = {
	"rec_p",
	"rec_q",
	"rec_r",
	"rec_pq",
	"rec_pr",
	"rec_qr",
	"rec_pqr"
};
275 #define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1)
278 raidz_math_kstat_headers(char *buf
, size_t size
)
283 ASSERT3U(size
, >=, RAIDZ_KSTAT_LINE_LEN
);
285 off
= snprintf(buf
, size
, "%-17s", "implementation");
287 for (i
= 0; i
< ARRAY_SIZE(raidz_gen_name
); i
++)
288 off
+= snprintf(buf
+ off
, size
- off
, "%-16s",
291 for (i
= 0; i
< ARRAY_SIZE(raidz_rec_name
); i
++)
292 off
+= snprintf(buf
+ off
, size
- off
, "%-16s",
295 (void) snprintf(buf
+ off
, size
- off
, "\n");
301 raidz_math_kstat_data(char *buf
, size_t size
, void *data
)
303 raidz_impl_kstat_t
*fstat
= &raidz_impl_kstats
[raidz_supp_impl_cnt
];
304 raidz_impl_kstat_t
*cstat
= (raidz_impl_kstat_t
*)data
;
308 ASSERT3U(size
, >=, RAIDZ_KSTAT_LINE_LEN
);
310 if (cstat
== fstat
) {
311 off
+= snprintf(buf
+ off
, size
- off
, "%-17s", "fastest");
313 for (i
= 0; i
< ARRAY_SIZE(raidz_gen_name
); i
++) {
314 int id
= fstat
->gen
[i
];
315 off
+= snprintf(buf
+ off
, size
- off
, "%-16s",
316 raidz_supp_impl
[id
]->name
);
318 for (i
= 0; i
< ARRAY_SIZE(raidz_rec_name
); i
++) {
319 int id
= fstat
->rec
[i
];
320 off
+= snprintf(buf
+ off
, size
- off
, "%-16s",
321 raidz_supp_impl
[id
]->name
);
324 ptrdiff_t id
= cstat
- raidz_impl_kstats
;
326 off
+= snprintf(buf
+ off
, size
- off
, "%-17s",
327 raidz_supp_impl
[id
]->name
);
329 for (i
= 0; i
< ARRAY_SIZE(raidz_gen_name
); i
++)
330 off
+= snprintf(buf
+ off
, size
- off
, "%-16llu",
331 (u_longlong_t
)cstat
->gen
[i
]);
333 for (i
= 0; i
< ARRAY_SIZE(raidz_rec_name
); i
++)
334 off
+= snprintf(buf
+ off
, size
- off
, "%-16llu",
335 (u_longlong_t
)cstat
->rec
[i
]);
338 (void) snprintf(buf
+ off
, size
- off
, "\n");
344 raidz_math_kstat_addr(kstat_t
*ksp
, loff_t n
)
346 if (n
<= raidz_supp_impl_cnt
)
347 ksp
->ks_private
= (void *) (raidz_impl_kstats
+ n
);
349 ksp
->ks_private
= NULL
;
351 return (ksp
->ks_private
);
354 #define BENCH_D_COLS (8ULL)
355 #define BENCH_COLS (BENCH_D_COLS + PARITY_PQR)
356 #define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */
357 #define BENCH_NS MSEC2NSEC(25) /* 25ms */
359 typedef void (*benchmark_fn
)(raidz_map_t
*rm
, const int fn
);
362 benchmark_gen_impl(raidz_map_t
*rm
, const int fn
)
365 vdev_raidz_generate_parity(rm
);
369 benchmark_rec_impl(raidz_map_t
*rm
, const int fn
)
371 static const int rec_tgt
[7][3] = {
372 {1, 2, 3}, /* rec_p: bad QR & D[0] */
373 {0, 2, 3}, /* rec_q: bad PR & D[0] */
374 {0, 1, 3}, /* rec_r: bad PQ & D[0] */
375 {2, 3, 4}, /* rec_pq: bad R & D[0][1] */
376 {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */
377 {0, 3, 4}, /* rec_qr: bad P & D[0][1] */
378 {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */
381 vdev_raidz_reconstruct(rm
, rec_tgt
[fn
], 3);
385 * Benchmarking of all supported implementations (raidz_supp_impl_cnt)
386 * is performed by setting the rm_ops pointer and calling the top level
387 * generate/reconstruct methods of bench_rm.
390 benchmark_raidz_impl(raidz_map_t
*bench_rm
, const int fn
, benchmark_fn bench_fn
)
392 uint64_t run_cnt
, speed
, best_speed
= 0;
393 hrtime_t t_start
, t_diff
;
394 raidz_impl_ops_t
*curr_impl
;
395 raidz_impl_kstat_t
*fstat
= &raidz_impl_kstats
[raidz_supp_impl_cnt
];
398 for (impl
= 0; impl
< raidz_supp_impl_cnt
; impl
++) {
399 /* set an implementation to benchmark */
400 curr_impl
= raidz_supp_impl
[impl
];
401 bench_rm
->rm_ops
= curr_impl
;
404 t_start
= gethrtime();
407 for (i
= 0; i
< 25; i
++, run_cnt
++)
408 bench_fn(bench_rm
, fn
);
410 t_diff
= gethrtime() - t_start
;
411 } while (t_diff
< BENCH_NS
);
413 speed
= run_cnt
* BENCH_ZIO_SIZE
* NANOSEC
;
414 speed
/= (t_diff
* BENCH_COLS
);
416 if (bench_fn
== benchmark_gen_impl
)
417 raidz_impl_kstats
[impl
].gen
[fn
] = speed
;
419 raidz_impl_kstats
[impl
].rec
[fn
] = speed
;
421 /* Update fastest implementation method */
422 if (speed
> best_speed
) {
425 if (bench_fn
== benchmark_gen_impl
) {
426 fstat
->gen
[fn
] = impl
;
427 vdev_raidz_fastest_impl
.gen
[fn
] =
430 fstat
->rec
[fn
] = impl
;
431 vdev_raidz_fastest_impl
.rec
[fn
] =
439 vdev_raidz_math_init(void)
441 raidz_impl_ops_t
*curr_impl
;
442 zio_t
*bench_zio
= NULL
;
443 raidz_map_t
*bench_rm
= NULL
;
444 uint64_t bench_parity
;
447 /* move supported impl into raidz_supp_impl */
448 for (i
= 0, c
= 0; i
< ARRAY_SIZE(raidz_all_maths
); i
++) {
449 curr_impl
= (raidz_impl_ops_t
*)raidz_all_maths
[i
];
451 /* initialize impl */
455 if (curr_impl
->is_supported())
456 raidz_supp_impl
[c
++] = (raidz_impl_ops_t
*)curr_impl
;
458 membar_producer(); /* complete raidz_supp_impl[] init */
459 raidz_supp_impl_cnt
= c
; /* number of supported impl */
461 #if !defined(_KERNEL)
462 /* Skip benchmarking and use last implementation as fastest */
463 memcpy(&vdev_raidz_fastest_impl
, raidz_supp_impl
[raidz_supp_impl_cnt
-1],
464 sizeof (vdev_raidz_fastest_impl
));
465 strcpy(vdev_raidz_fastest_impl
.name
, "fastest");
467 raidz_math_initialized
= B_TRUE
;
469 /* Use 'cycle' math selection method for userspace */
470 VERIFY0(vdev_raidz_impl_set("cycle"));
474 /* Fake an zio and run the benchmark on a warmed up buffer */
475 bench_zio
= kmem_zalloc(sizeof (zio_t
), KM_SLEEP
);
476 bench_zio
->io_offset
= 0;
477 bench_zio
->io_size
= BENCH_ZIO_SIZE
; /* only data columns */
478 bench_zio
->io_abd
= abd_alloc_linear(BENCH_ZIO_SIZE
, B_TRUE
);
479 memset(abd_to_buf(bench_zio
->io_abd
), 0xAA, BENCH_ZIO_SIZE
);
481 /* Benchmark parity generation methods */
482 for (fn
= 0; fn
< RAIDZ_GEN_NUM
; fn
++) {
483 bench_parity
= fn
+ 1;
484 /* New raidz_map is needed for each generate_p/q/r */
485 bench_rm
= vdev_raidz_map_alloc(bench_zio
, SPA_MINBLOCKSHIFT
,
486 BENCH_D_COLS
+ bench_parity
, bench_parity
);
488 benchmark_raidz_impl(bench_rm
, fn
, benchmark_gen_impl
);
490 vdev_raidz_map_free(bench_rm
);
493 /* Benchmark data reconstruction methods */
494 bench_rm
= vdev_raidz_map_alloc(bench_zio
, SPA_MINBLOCKSHIFT
,
495 BENCH_COLS
, PARITY_PQR
);
497 for (fn
= 0; fn
< RAIDZ_REC_NUM
; fn
++)
498 benchmark_raidz_impl(bench_rm
, fn
, benchmark_rec_impl
);
500 vdev_raidz_map_free(bench_rm
);
502 /* cleanup the bench zio */
503 abd_free(bench_zio
->io_abd
);
504 kmem_free(bench_zio
, sizeof (zio_t
));
506 /* install kstats for all impl */
507 raidz_math_kstat
= kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
508 KSTAT_TYPE_RAW
, 0, KSTAT_FLAG_VIRTUAL
);
510 if (raidz_math_kstat
!= NULL
) {
511 raidz_math_kstat
->ks_data
= NULL
;
512 raidz_math_kstat
->ks_ndata
= UINT32_MAX
;
513 kstat_set_raw_ops(raidz_math_kstat
,
514 raidz_math_kstat_headers
,
515 raidz_math_kstat_data
,
516 raidz_math_kstat_addr
);
517 kstat_install(raidz_math_kstat
);
520 /* Finish initialization */
521 atomic_swap_32(&zfs_vdev_raidz_impl
, user_sel_impl
);
522 raidz_math_initialized
= B_TRUE
;
526 vdev_raidz_math_fini(void)
528 raidz_impl_ops_t
const *curr_impl
;
531 if (raidz_math_kstat
!= NULL
) {
532 kstat_delete(raidz_math_kstat
);
533 raidz_math_kstat
= NULL
;
537 for (i
= 0; i
< ARRAY_SIZE(raidz_all_maths
); i
++) {
538 curr_impl
= raidz_all_maths
[i
];
544 static const struct {
547 } math_impl_opts
[] = {
548 #if !defined(_KERNEL)
549 { "cycle", IMPL_CYCLE
},
551 { "fastest", IMPL_FASTEST
},
552 { "original", IMPL_ORIGINAL
},
553 { "scalar", IMPL_SCALAR
}
557 * Function sets desired raidz implementation.
559 * If we are called before init(), user preference will be saved in
560 * user_sel_impl, and applied in later init() call. This occurs when module
561 * parameter is specified on module load. Otherwise, directly update
562 * zfs_vdev_raidz_impl.
564 * @val Name of raidz implementation to use
568 vdev_raidz_impl_set(const char *val
)
571 char req_name
[RAIDZ_IMPL_NAME_MAX
];
572 uint32_t impl
= RAIDZ_IMPL_READ(user_sel_impl
);
576 i
= strnlen(val
, RAIDZ_IMPL_NAME_MAX
);
577 if (i
== 0 || i
== RAIDZ_IMPL_NAME_MAX
)
580 strlcpy(req_name
, val
, RAIDZ_IMPL_NAME_MAX
);
581 while (i
> 0 && !!isspace(req_name
[i
-1]))
585 /* Check mandatory options */
586 for (i
= 0; i
< ARRAY_SIZE(math_impl_opts
); i
++) {
587 if (strcmp(req_name
, math_impl_opts
[i
].name
) == 0) {
588 impl
= math_impl_opts
[i
].sel
;
594 /* check all supported impl if init() was already called */
595 if (err
!= 0 && raidz_math_initialized
) {
596 /* check all supported implementations */
597 for (i
= 0; i
< raidz_supp_impl_cnt
; i
++) {
598 if (strcmp(req_name
, raidz_supp_impl
[i
]->name
) == 0) {
607 if (raidz_math_initialized
)
608 atomic_swap_32(&zfs_vdev_raidz_impl
, impl
);
610 atomic_swap_32(&user_sel_impl
, impl
);
616 #if defined(_KERNEL) && defined(HAVE_SPL)
617 #include <linux/mod_compat.h>
620 zfs_vdev_raidz_impl_set(const char *val
, zfs_kernel_param_t
*kp
)
622 return (vdev_raidz_impl_set(val
));
626 zfs_vdev_raidz_impl_get(char *buffer
, zfs_kernel_param_t
*kp
)
630 const uint32_t impl
= RAIDZ_IMPL_READ(zfs_vdev_raidz_impl
);
632 ASSERT(raidz_math_initialized
);
634 /* list mandatory options */
635 for (i
= 0; i
< ARRAY_SIZE(math_impl_opts
) - 2; i
++) {
636 fmt
= (impl
== math_impl_opts
[i
].sel
) ? "[%s] " : "%s ";
637 cnt
+= sprintf(buffer
+ cnt
, fmt
, math_impl_opts
[i
].name
);
640 /* list all supported implementations */
641 for (i
= 0; i
< raidz_supp_impl_cnt
; i
++) {
642 fmt
= (i
== impl
) ? "[%s] " : "%s ";
643 cnt
+= sprintf(buffer
+ cnt
, fmt
, raidz_supp_impl
[i
]->name
);
649 module_param_call(zfs_vdev_raidz_impl
, zfs_vdev_raidz_impl_set
,
650 zfs_vdev_raidz_impl_get
, NULL
, 0644);
651 MODULE_PARM_DESC(zfs_vdev_raidz_impl
, "Select raidz implementation.");