4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
25 #include <sys/zfs_context.h>
26 #include <sys/types.h>
28 #include <sys/debug.h>
29 #include <sys/zfs_debug.h>
31 #include <sys/vdev_raidz.h>
32 #include <sys/vdev_raidz_impl.h>
34 extern boolean_t
raidz_will_scalar_work(void);
36 /* Opaque implementation with NULL methods to represent original methods */
37 static const raidz_impl_ops_t vdev_raidz_original_impl
= {
39 .is_supported
= raidz_will_scalar_work
,
42 /* RAIDZ parity op that contain the fastest methods */
43 static raidz_impl_ops_t vdev_raidz_fastest_impl
= {
47 /* All compiled in implementations */
48 const raidz_impl_ops_t
*raidz_all_maths
[] = {
49 &vdev_raidz_original_impl
,
50 &vdev_raidz_scalar_impl
,
51 #if defined(__x86_64) && defined(HAVE_SSE2) /* only x86_64 for now */
52 &vdev_raidz_sse2_impl
,
54 #if defined(__x86_64) && defined(HAVE_SSSE3) /* only x86_64 for now */
55 &vdev_raidz_ssse3_impl
,
57 #if defined(__x86_64) && defined(HAVE_AVX2) /* only x86_64 for now */
58 &vdev_raidz_avx2_impl
,
60 #if defined(__aarch64__)
61 &vdev_raidz_aarch64_neon_impl
,
62 &vdev_raidz_aarch64_neonx2_impl
,
66 /* Indicate that benchmark has been completed */
67 static boolean_t raidz_math_initialized
= B_FALSE
;
69 /* Select raidz implementation */
70 #define IMPL_FASTEST (UINT32_MAX)
71 #define IMPL_CYCLE (UINT32_MAX - 1)
72 #define IMPL_ORIGINAL (0)
73 #define IMPL_SCALAR (1)
75 #define RAIDZ_IMPL_READ(i) (*(volatile uint32_t *) &(i))
77 static uint32_t zfs_vdev_raidz_impl
= IMPL_SCALAR
;
78 static uint32_t user_sel_impl
= IMPL_FASTEST
;
80 /* Hold all supported implementations */
81 static size_t raidz_supp_impl_cnt
= 0;
82 static raidz_impl_ops_t
*raidz_supp_impl
[ARRAY_SIZE(raidz_all_maths
)];
85 * kstats values for supported implementations
86 * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
88 static raidz_impl_kstat_t raidz_impl_kstats
[ARRAY_SIZE(raidz_all_maths
) + 1];
90 /* kstat for benchmarked implementations */
91 static kstat_t
*raidz_math_kstat
= NULL
;
94 * Selects the raidz operation for raidz_map
95 * If rm_ops is set to NULL original raidz implementation will be used
98 vdev_raidz_math_get_ops()
100 raidz_impl_ops_t
*ops
= NULL
;
101 const uint32_t impl
= RAIDZ_IMPL_READ(zfs_vdev_raidz_impl
);
105 ASSERT(raidz_math_initialized
);
106 ops
= &vdev_raidz_fastest_impl
;
108 #if !defined(_KERNEL)
111 ASSERT(raidz_math_initialized
);
112 ASSERT3U(raidz_supp_impl_cnt
, >, 0);
113 /* Cycle through all supported implementations */
114 static size_t cycle_impl_idx
= 0;
115 size_t idx
= (++cycle_impl_idx
) % raidz_supp_impl_cnt
;
116 ops
= raidz_supp_impl
[idx
];
121 ops
= (raidz_impl_ops_t
*) &vdev_raidz_original_impl
;
124 ops
= (raidz_impl_ops_t
*) &vdev_raidz_scalar_impl
;
127 ASSERT3U(impl
, <, raidz_supp_impl_cnt
);
128 ASSERT3U(raidz_supp_impl_cnt
, >, 0);
129 ops
= raidz_supp_impl
[impl
];
133 ASSERT3P(ops
, !=, NULL
);
139 * Select parity generation method for raidz_map
142 vdev_raidz_math_generate(raidz_map_t
*rm
)
144 raidz_gen_f gen_parity
= NULL
;
146 switch (raidz_parity(rm
)) {
148 gen_parity
= rm
->rm_ops
->gen
[RAIDZ_GEN_P
];
151 gen_parity
= rm
->rm_ops
->gen
[RAIDZ_GEN_PQ
];
154 gen_parity
= rm
->rm_ops
->gen
[RAIDZ_GEN_PQR
];
158 cmn_err(CE_PANIC
, "invalid RAID-Z configuration %d",
163 /* if method is NULL execute the original implementation */
164 if (gen_parity
== NULL
)
165 return (RAIDZ_ORIGINAL_IMPL
);
173 reconstruct_fun_p_sel(raidz_map_t
*rm
, const int *parity_valid
,
176 if (nbaddata
== 1 && parity_valid
[CODE_P
]) {
177 return (rm
->rm_ops
->rec
[RAIDZ_REC_P
]);
179 return ((raidz_rec_f
) NULL
);
183 reconstruct_fun_pq_sel(raidz_map_t
*rm
, const int *parity_valid
,
187 if (parity_valid
[CODE_P
]) {
188 return (rm
->rm_ops
->rec
[RAIDZ_REC_P
]);
189 } else if (parity_valid
[CODE_Q
]) {
190 return (rm
->rm_ops
->rec
[RAIDZ_REC_Q
]);
192 } else if (nbaddata
== 2 &&
193 parity_valid
[CODE_P
] && parity_valid
[CODE_Q
]) {
194 return (rm
->rm_ops
->rec
[RAIDZ_REC_PQ
]);
196 return ((raidz_rec_f
) NULL
);
200 reconstruct_fun_pqr_sel(raidz_map_t
*rm
, const int *parity_valid
,
204 if (parity_valid
[CODE_P
]) {
205 return (rm
->rm_ops
->rec
[RAIDZ_REC_P
]);
206 } else if (parity_valid
[CODE_Q
]) {
207 return (rm
->rm_ops
->rec
[RAIDZ_REC_Q
]);
208 } else if (parity_valid
[CODE_R
]) {
209 return (rm
->rm_ops
->rec
[RAIDZ_REC_R
]);
211 } else if (nbaddata
== 2) {
212 if (parity_valid
[CODE_P
] && parity_valid
[CODE_Q
]) {
213 return (rm
->rm_ops
->rec
[RAIDZ_REC_PQ
]);
214 } else if (parity_valid
[CODE_P
] && parity_valid
[CODE_R
]) {
215 return (rm
->rm_ops
->rec
[RAIDZ_REC_PR
]);
216 } else if (parity_valid
[CODE_Q
] && parity_valid
[CODE_R
]) {
217 return (rm
->rm_ops
->rec
[RAIDZ_REC_QR
]);
219 } else if (nbaddata
== 3 &&
220 parity_valid
[CODE_P
] && parity_valid
[CODE_Q
] &&
221 parity_valid
[CODE_R
]) {
222 return (rm
->rm_ops
->rec
[RAIDZ_REC_PQR
]);
224 return ((raidz_rec_f
) NULL
);
228 * Select data reconstruction method for raidz_map
229 * @parity_valid - Parity validity flag
230 * @dt - Failed data index array
231 * @nbaddata - Number of failed data columns
234 vdev_raidz_math_reconstruct(raidz_map_t
*rm
, const int *parity_valid
,
235 const int *dt
, const int nbaddata
)
237 raidz_rec_f rec_data
= NULL
;
239 switch (raidz_parity(rm
)) {
241 rec_data
= reconstruct_fun_p_sel(rm
, parity_valid
, nbaddata
);
244 rec_data
= reconstruct_fun_pq_sel(rm
, parity_valid
, nbaddata
);
247 rec_data
= reconstruct_fun_pqr_sel(rm
, parity_valid
, nbaddata
);
250 cmn_err(CE_PANIC
, "invalid RAID-Z configuration %d",
255 if (rec_data
== NULL
)
256 return (RAIDZ_ORIGINAL_IMPL
);
258 return (rec_data(rm
, dt
));
/* Column labels for the parity generation methods, in gen[] index order. */
const char *raidz_gen_name[] = {
	"gen_p", "gen_pq", "gen_pqr"
};

/* Column labels for the data reconstruction methods, in rec[] index order. */
const char *raidz_rec_name[] = {
	"rec_p", "rec_q", "rec_r",
	"rec_pq", "rec_pr", "rec_qr", "rec_pqr"
};
269 #define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1)
272 raidz_math_kstat_headers(char *buf
, size_t size
)
277 ASSERT3U(size
, >=, RAIDZ_KSTAT_LINE_LEN
);
279 off
= snprintf(buf
, size
, "%-17s", "implementation");
281 for (i
= 0; i
< ARRAY_SIZE(raidz_gen_name
); i
++)
282 off
+= snprintf(buf
+ off
, size
- off
, "%-16s",
285 for (i
= 0; i
< ARRAY_SIZE(raidz_rec_name
); i
++)
286 off
+= snprintf(buf
+ off
, size
- off
, "%-16s",
289 (void) snprintf(buf
+ off
, size
- off
, "\n");
295 raidz_math_kstat_data(char *buf
, size_t size
, void *data
)
297 raidz_impl_kstat_t
* fstat
= &raidz_impl_kstats
[raidz_supp_impl_cnt
];
298 raidz_impl_kstat_t
* cstat
= (raidz_impl_kstat_t
*) data
;
302 ASSERT3U(size
, >=, RAIDZ_KSTAT_LINE_LEN
);
304 if (cstat
== fstat
) {
305 off
+= snprintf(buf
+ off
, size
- off
, "%-17s", "fastest");
307 for (i
= 0; i
< ARRAY_SIZE(raidz_gen_name
); i
++) {
308 int id
= fstat
->gen
[i
];
309 off
+= snprintf(buf
+ off
, size
- off
, "%-16s",
310 raidz_supp_impl
[id
]->name
);
312 for (i
= 0; i
< ARRAY_SIZE(raidz_rec_name
); i
++) {
313 int id
= fstat
->rec
[i
];
314 off
+= snprintf(buf
+ off
, size
- off
, "%-16s",
315 raidz_supp_impl
[id
]->name
);
318 ptrdiff_t id
= cstat
- raidz_impl_kstats
;
320 off
+= snprintf(buf
+ off
, size
- off
, "%-17s",
321 raidz_supp_impl
[id
]->name
);
323 for (i
= 0; i
< ARRAY_SIZE(raidz_gen_name
); i
++)
324 off
+= snprintf(buf
+ off
, size
- off
, "%-16llu",
325 (u_longlong_t
) cstat
->gen
[i
]);
327 for (i
= 0; i
< ARRAY_SIZE(raidz_rec_name
); i
++)
328 off
+= snprintf(buf
+ off
, size
- off
, "%-16llu",
329 (u_longlong_t
) cstat
->rec
[i
]);
332 (void) snprintf(buf
+ off
, size
- off
, "\n");
338 raidz_math_kstat_addr(kstat_t
*ksp
, loff_t n
)
340 if (n
<= raidz_supp_impl_cnt
)
341 ksp
->ks_private
= (void *) (raidz_impl_kstats
+ n
);
343 ksp
->ks_private
= NULL
;
345 return (ksp
->ks_private
);
348 #define BENCH_D_COLS (8ULL)
349 #define BENCH_COLS (BENCH_D_COLS + PARITY_PQR)
350 #define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */
351 #define BENCH_NS MSEC2NSEC(25) /* 25ms */
353 typedef void (*benchmark_fn
)(raidz_map_t
*rm
, const int fn
);
356 benchmark_gen_impl(raidz_map_t
*rm
, const int fn
)
359 vdev_raidz_generate_parity(rm
);
363 benchmark_rec_impl(raidz_map_t
*rm
, const int fn
)
365 static const int rec_tgt
[7][3] = {
366 {1, 2, 3}, /* rec_p: bad QR & D[0] */
367 {0, 2, 3}, /* rec_q: bad PR & D[0] */
368 {0, 1, 3}, /* rec_r: bad PQ & D[0] */
369 {2, 3, 4}, /* rec_pq: bad R & D[0][1] */
370 {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */
371 {0, 3, 4}, /* rec_qr: bad P & D[0][1] */
372 {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */
375 vdev_raidz_reconstruct(rm
, rec_tgt
[fn
], 3);
379 * Benchmarking of all supported implementations (raidz_supp_impl_cnt)
380 * is performed by setting the rm_ops pointer and calling the top level
381 * generate/reconstruct methods of bench_rm.
384 benchmark_raidz_impl(raidz_map_t
*bench_rm
, const int fn
, benchmark_fn bench_fn
)
386 uint64_t run_cnt
, speed
, best_speed
= 0;
387 hrtime_t t_start
, t_diff
;
388 raidz_impl_ops_t
*curr_impl
;
389 raidz_impl_kstat_t
* fstat
= &raidz_impl_kstats
[raidz_supp_impl_cnt
];
392 for (impl
= 0; impl
< raidz_supp_impl_cnt
; impl
++) {
393 /* set an implementation to benchmark */
394 curr_impl
= raidz_supp_impl
[impl
];
395 bench_rm
->rm_ops
= curr_impl
;
398 t_start
= gethrtime();
401 for (i
= 0; i
< 25; i
++, run_cnt
++)
402 bench_fn(bench_rm
, fn
);
404 t_diff
= gethrtime() - t_start
;
405 } while (t_diff
< BENCH_NS
);
407 speed
= run_cnt
* BENCH_ZIO_SIZE
* NANOSEC
;
408 speed
/= (t_diff
* BENCH_COLS
);
410 if (bench_fn
== benchmark_gen_impl
)
411 raidz_impl_kstats
[impl
].gen
[fn
] = speed
;
413 raidz_impl_kstats
[impl
].rec
[fn
] = speed
;
415 /* Update fastest implementation method */
416 if (speed
> best_speed
) {
419 if (bench_fn
== benchmark_gen_impl
) {
420 fstat
->gen
[fn
] = impl
;
421 vdev_raidz_fastest_impl
.gen
[fn
] =
424 fstat
->rec
[fn
] = impl
;
425 vdev_raidz_fastest_impl
.rec
[fn
] =
433 vdev_raidz_math_init(void)
435 raidz_impl_ops_t
*curr_impl
;
436 zio_t
*bench_zio
= NULL
;
437 raidz_map_t
*bench_rm
= NULL
;
438 uint64_t bench_parity
;
441 /* move supported impl into raidz_supp_impl */
442 for (i
= 0, c
= 0; i
< ARRAY_SIZE(raidz_all_maths
); i
++) {
443 curr_impl
= (raidz_impl_ops_t
*) raidz_all_maths
[i
];
445 /* initialize impl */
449 if (curr_impl
->is_supported())
450 raidz_supp_impl
[c
++] = (raidz_impl_ops_t
*) curr_impl
;
452 membar_producer(); /* complete raidz_supp_impl[] init */
453 raidz_supp_impl_cnt
= c
; /* number of supported impl */
455 #if !defined(_KERNEL)
456 /* Skip benchmarking and use last implementation as fastest */
457 memcpy(&vdev_raidz_fastest_impl
, raidz_supp_impl
[raidz_supp_impl_cnt
-1],
458 sizeof (vdev_raidz_fastest_impl
));
459 strcpy(vdev_raidz_fastest_impl
.name
, "fastest");
461 raidz_math_initialized
= B_TRUE
;
463 /* Use 'cycle' math selection method for userspace */
464 VERIFY0(vdev_raidz_impl_set("cycle"));
468 /* Fake an zio and run the benchmark on it */
469 bench_zio
= kmem_zalloc(sizeof (zio_t
), KM_SLEEP
);
470 bench_zio
->io_offset
= 0;
471 bench_zio
->io_size
= BENCH_ZIO_SIZE
; /* only data columns */
472 bench_zio
->io_data
= zio_data_buf_alloc(BENCH_ZIO_SIZE
);
473 VERIFY(bench_zio
->io_data
);
474 memset(bench_zio
->io_data
, 0xAA, BENCH_ZIO_SIZE
); /* warm up */
476 /* Benchmark parity generation methods */
477 for (fn
= 0; fn
< RAIDZ_GEN_NUM
; fn
++) {
478 bench_parity
= fn
+ 1;
479 /* New raidz_map is needed for each generate_p/q/r */
480 bench_rm
= vdev_raidz_map_alloc(bench_zio
, SPA_MINBLOCKSHIFT
,
481 BENCH_D_COLS
+ bench_parity
, bench_parity
);
483 benchmark_raidz_impl(bench_rm
, fn
, benchmark_gen_impl
);
485 vdev_raidz_map_free(bench_rm
);
488 /* Benchmark data reconstruction methods */
489 bench_rm
= vdev_raidz_map_alloc(bench_zio
, SPA_MINBLOCKSHIFT
,
490 BENCH_COLS
, PARITY_PQR
);
492 for (fn
= 0; fn
< RAIDZ_REC_NUM
; fn
++)
493 benchmark_raidz_impl(bench_rm
, fn
, benchmark_rec_impl
);
495 vdev_raidz_map_free(bench_rm
);
497 /* cleanup the bench zio */
498 zio_data_buf_free(bench_zio
->io_data
, BENCH_ZIO_SIZE
);
499 kmem_free(bench_zio
, sizeof (zio_t
));
501 /* install kstats for all impl */
502 raidz_math_kstat
= kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
503 KSTAT_TYPE_RAW
, 0, KSTAT_FLAG_VIRTUAL
);
505 if (raidz_math_kstat
!= NULL
) {
506 raidz_math_kstat
->ks_data
= NULL
;
507 raidz_math_kstat
->ks_ndata
= UINT32_MAX
;
508 kstat_set_raw_ops(raidz_math_kstat
,
509 raidz_math_kstat_headers
,
510 raidz_math_kstat_data
,
511 raidz_math_kstat_addr
);
512 kstat_install(raidz_math_kstat
);
515 /* Finish initialization */
516 atomic_swap_32(&zfs_vdev_raidz_impl
, user_sel_impl
);
517 raidz_math_initialized
= B_TRUE
;
521 vdev_raidz_math_fini(void)
523 raidz_impl_ops_t
const *curr_impl
;
526 if (raidz_math_kstat
!= NULL
) {
527 kstat_delete(raidz_math_kstat
);
528 raidz_math_kstat
= NULL
;
532 for (i
= 0; i
< ARRAY_SIZE(raidz_all_maths
); i
++) {
533 curr_impl
= raidz_all_maths
[i
];
539 static const struct {
542 } math_impl_opts
[] = {
543 #if !defined(_KERNEL)
544 { "cycle", IMPL_CYCLE
},
546 { "fastest", IMPL_FASTEST
},
547 { "original", IMPL_ORIGINAL
},
548 { "scalar", IMPL_SCALAR
}
552 * Function sets desired raidz implementation.
554 * If we are called before init(), user preference will be saved in
555 * user_sel_impl, and applied in later init() call. This occurs when module
556 * parameter is specified on module load. Otherwise, directly update
557 * zfs_vdev_raidz_impl.
559 * @val Name of raidz implementation to use
563 vdev_raidz_impl_set(const char *val
)
566 char req_name
[RAIDZ_IMPL_NAME_MAX
];
567 uint32_t impl
= RAIDZ_IMPL_READ(user_sel_impl
);
571 i
= strnlen(val
, RAIDZ_IMPL_NAME_MAX
);
572 if (i
== 0 || i
== RAIDZ_IMPL_NAME_MAX
)
575 strlcpy(req_name
, val
, RAIDZ_IMPL_NAME_MAX
);
576 while (i
> 0 && !!isspace(req_name
[i
-1]))
580 /* Check mandatory options */
581 for (i
= 0; i
< ARRAY_SIZE(math_impl_opts
); i
++) {
582 if (strcmp(req_name
, math_impl_opts
[i
].name
) == 0) {
583 impl
= math_impl_opts
[i
].sel
;
589 /* check all supported impl if init() was already called */
590 if (err
!= 0 && raidz_math_initialized
) {
591 /* check all supported implementations */
592 for (i
= 0; i
< raidz_supp_impl_cnt
; i
++) {
593 if (strcmp(req_name
, raidz_supp_impl
[i
]->name
) == 0) {
602 if (raidz_math_initialized
)
603 atomic_swap_32(&zfs_vdev_raidz_impl
, impl
);
605 atomic_swap_32(&user_sel_impl
, impl
);
#if defined(_KERNEL) && defined(HAVE_SPL)
#include <linux/mod_compat.h>

/*
 * Module parameter "set" handler: forwards the requested name to
 * vdev_raidz_impl_set(). kp is unused.
 */
static int
zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp)
{
	return (vdev_raidz_impl_set(val));
}

/*
 * Module parameter "get" handler: writes the selectable implementation
 * names into buffer, separated by spaces, with the active one in brackets.
 * kp is unused.
 *
 * Returns the number of characters written.
 *
 * NOTE(review): return type, locals and "return (cnt)" were dropped from
 * the recovered listing and are restored -- confirm.
 */
static int
zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp)
{
	int i, cnt = 0;
	char *fmt;
	const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);

	ASSERT(raidz_math_initialized);

	/*
	 * list mandatory options; the last two entries of math_impl_opts[]
	 * are skipped here, presumably because the loop below lists them
	 * again as supported implementations -- TODO confirm
	 */
	for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) {
		fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s ";
		cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name);
	}

	/* list all supported implementations */
	for (i = 0; i < raidz_supp_impl_cnt; i++) {
		fmt = (i == impl) ? "[%s] " : "%s ";
		cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name);
	}

	return (cnt);
}

module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set,
    zfs_vdev_raidz_impl_get, NULL, 0644);
MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation.");
#endif