4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
25 #include <sys/zfs_context.h>
26 #include <sys/types.h>
28 #include <sys/debug.h>
29 #include <sys/zfs_debug.h>
31 #include <sys/vdev_raidz.h>
32 #include <sys/vdev_raidz_impl.h>
/*
 * Support predicate defined outside this file; it is installed below as the
 * .is_supported callback of vdev_raidz_original_impl.
 */
34 extern boolean_t
raidz_will_scalar_work(void);
36 /* Opaque implementation with NULL methods to represent original methods */
/*
 * The visible initializer sets only .is_supported. vdev_raidz_math_generate()
 * and vdev_raidz_math_reconstruct() return RAIDZ_ORIGINAL_IMPL when the method
 * they look up is NULL, which is how selecting this entry falls back to the
 * original code path.
 * NOTE(review): the embedded line numbers skip entries inside this
 * initializer, so further members may be missing from this copy.
 */
37 static const raidz_impl_ops_t vdev_raidz_original_impl
= {
39 .is_supported
= raidz_will_scalar_work
,
42 /* RAIDZ parity ops that contain the fastest methods */
/*
 * Not const: benchmark_raidz_impl() assigns .gen[fn] or .rec[fn] whenever it
 * measures a new best speed, and the userspace (!_KERNEL) path of
 * vdev_raidz_math_init() overwrites the whole struct with memcpy().
 * NOTE(review): the initializer body is not visible in this copy.
 */
43 static raidz_impl_ops_t vdev_raidz_fastest_impl
= {
47 /* All compiled in implementations */
/*
 * vdev_raidz_math_init() walks this table in order and copies the supported
 * entries into raidz_supp_impl[]. The first two entries line up with
 * IMPL_ORIGINAL (0) and IMPL_SCALAR (1) defined below.
 * NOTE(review): that correspondence carries over to raidz_supp_impl[] only if
 * both entries report is_supported() -- confirm.
 * The SIMD entries are listed only when __x86_64 and the matching HAVE_ macro
 * are both defined. NOTE(review): the tail of the table (the entry guarded by
 * HAVE_AVX2, the #endif lines and the closing brace) is missing from this copy.
 */
48 const raidz_impl_ops_t
*raidz_all_maths
[] = {
49 &vdev_raidz_original_impl
,
50 &vdev_raidz_scalar_impl
,
51 #if defined(__x86_64) && defined(HAVE_SSE2) /* only x86_64 for now */
52 &vdev_raidz_sse2_impl
,
54 #if defined(__x86_64) && defined(HAVE_SSSE3) /* only x86_64 for now */
55 &vdev_raidz_ssse3_impl
,
57 #if defined(__x86_64) && defined(HAVE_AVX2) /* only x86_64 for now */
62 /* Indicate that benchmark has been completed */
/*
 * Set to B_TRUE by vdev_raidz_math_init(). The userspace (!_KERNEL) section of
 * init also sets it, even though that section skips the benchmark. The
 * selection code asserts on this flag before using benchmark results.
 */
63 static boolean_t raidz_math_initialized
= B_FALSE
;
65 /* Select raidz implementation */
/*
 * Values stored in zfs_vdev_raidz_impl and user_sel_impl. IMPL_FASTEST and
 * IMPL_CYCLE are sentinels at the top of the uint32_t range; other values are
 * used as an index (vdev_raidz_math_get_ops() reads raidz_supp_impl[impl]).
 */
66 #define IMPL_FASTEST (UINT32_MAX)
67 #define IMPL_CYCLE (UINT32_MAX - 1)
68 #define IMPL_ORIGINAL (0)
69 #define IMPL_SCALAR (1)
/*
 * Loads i through a volatile-qualified uint32_t lvalue.
 * NOTE(review): presumably so the selector is sampled exactly once per use
 * while another thread may be swapping it -- confirm intent.
 */
71 #define RAIDZ_IMPL_READ(i) (*(volatile uint32_t *) &(i))
/*
 * zfs_vdev_raidz_impl is the active selection read by
 * vdev_raidz_math_get_ops(); it starts as IMPL_SCALAR.
 * user_sel_impl holds the requested selection (default IMPL_FASTEST):
 * zfs_vdev_raidz_impl_set() stores into it, and vdev_raidz_math_init() copies
 * it into zfs_vdev_raidz_impl with atomic_swap_32() near its end.
 */
73 static uint32_t zfs_vdev_raidz_impl
= IMPL_SCALAR
;
74 static uint32_t user_sel_impl
= IMPL_FASTEST
;
76 /* Hold all supported implementations */
/*
 * Filled by vdev_raidz_math_init() with the raidz_all_maths[] entries whose
 * is_supported() returns true. The array is sized for the worst case (every
 * compiled-in implementation supported); raidz_supp_impl_cnt is stored after
 * a membar_producer() once the array has been written.
 */
77 static size_t raidz_supp_impl_cnt
= 0;
78 static raidz_impl_ops_t
*raidz_supp_impl
[ARRAY_SIZE(raidz_all_maths
)];
81 * kstats values for supported implementations
82 * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
/*
 * One row per raidz_all_maths[] entry plus one extra. The row at index
 * raidz_supp_impl_cnt is used as the "fastest" row: see the fstat pointers in
 * raidz_math_kstat_data() and benchmark_raidz_impl().
 */
84 static raidz_impl_kstat_t raidz_impl_kstats
[ARRAY_SIZE(raidz_all_maths
) + 1];
86 /* kstat for benchmarked implementations */
/*
 * Created with kstat_create() in vdev_raidz_math_init(); deleted and reset to
 * NULL in vdev_raidz_math_fini(). Stays NULL if creation failed.
 */
87 static kstat_t
*raidz_math_kstat
= NULL
;
90 * Selects the raidz operation for raidz_map
91 * If rm_ops is set to NULL original raidz implementation will be used
/*
 * Chooses the raidz_impl_ops_t to use, based on one sample of
 * zfs_vdev_raidz_impl taken through RAIDZ_IMPL_READ(). The result is asserted
 * to be non-NULL.
 *
 * NOTE(review): the control-flow lines (switch, case labels, breaks, braces
 * and the return) are not visible in this copy. The visible assignments
 * suggest one branch each for IMPL_FASTEST, IMPL_CYCLE (userspace only),
 * IMPL_ORIGINAL and IMPL_SCALAR, plus a fall-back that indexes
 * raidz_supp_impl[impl] -- confirm against the upstream file.
 */
94 vdev_raidz_math_get_ops()
96 raidz_impl_ops_t
*ops
= NULL
;
97 const uint32_t impl
= RAIDZ_IMPL_READ(zfs_vdev_raidz_impl
);
/* The fastest table is only meaningful once init has populated it. */
101 ASSERT(raidz_math_initialized
);
102 ops
= &vdev_raidz_fastest_impl
;
104 #if !defined(_KERNEL)
107 ASSERT(raidz_math_initialized
);
/* The modulo below requires a non-zero count. */
108 ASSERT3U(raidz_supp_impl_cnt
, >, 0);
109 /* Cycle through all supported implementations */
/*
 * The counter is a function-local static, so it persists across calls. It is
 * pre-incremented with a plain (non-atomic) ++, and the first call therefore
 * yields index 1 % raidz_supp_impl_cnt rather than 0.
 */
110 static size_t cycle_impl_idx
= 0;
111 size_t idx
= (++cycle_impl_idx
) % raidz_supp_impl_cnt
;
112 ops
= raidz_supp_impl
[idx
];
/* The casts below drop the const qualifier of the original implementation. */
117 ops
= (raidz_impl_ops_t
*) &vdev_raidz_original_impl
;
120 ops
= (raidz_impl_ops_t
*) &vdev_raidz_scalar_impl
;
/* Any other value is treated as a direct index into raidz_supp_impl[]. */
123 ASSERT(raidz_math_initialized
);
124 ASSERT3U(impl
, <, raidz_supp_impl_cnt
);
125 ASSERT3U(raidz_supp_impl_cnt
, >, 0);
126 ops
= raidz_supp_impl
[impl
];
130 ASSERT3P(ops
, !=, NULL
);
136 * Select parity generation method for raidz_map
/*
 * Looks up a parity generation method in rm->rm_ops->gen[] based on
 * raidz_parity(rm): one of the RAIDZ_GEN_P, RAIDZ_GEN_PQ or RAIDZ_GEN_PQR
 * slots. An unexpected parity value reaches cmn_err(CE_PANIC, ...).
 * Returns RAIDZ_ORIGINAL_IMPL when the selected slot is NULL.
 *
 * NOTE(review): the case labels, the remaining cmn_err() argument and the
 * code that runs when gen_parity is non-NULL are not visible in this copy;
 * presumably parity 1/2/3 map to P/PQ/PQR in that order -- confirm.
 */
139 vdev_raidz_math_generate(raidz_map_t
*rm
)
141 raidz_gen_f gen_parity
= NULL
;
143 switch (raidz_parity(rm
)) {
145 gen_parity
= rm
->rm_ops
->gen
[RAIDZ_GEN_P
];
148 gen_parity
= rm
->rm_ops
->gen
[RAIDZ_GEN_PQ
];
151 gen_parity
= rm
->rm_ops
->gen
[RAIDZ_GEN_PQR
];
155 cmn_err(CE_PANIC
, "invalid RAID-Z configuration %d",
160 /* if method is NULL execute the original implementation */
161 if (gen_parity
== NULL
)
162 return (RAIDZ_ORIGINAL_IMPL
);
/*
 * Reconstruction selector used by vdev_raidz_math_reconstruct() as the first
 * of its three selectors. Returns rm->rm_ops->rec[RAIDZ_REC_P] when exactly
 * one data column is bad and P is valid; otherwise returns NULL.
 *
 * NOTE(review): the return type and the declaration of the nbaddata parameter
 * are not visible in this copy.
 */
170 reconstruct_fun_p_sel(raidz_map_t
*rm
, const int *parity_valid
,
173 if (nbaddata
== 1 && parity_valid
[CODE_P
]) {
174 return (rm
->rm_ops
->rec
[RAIDZ_REC_P
]);
/* No usable method: the caller maps NULL to RAIDZ_ORIGINAL_IMPL. */
176 return ((raidz_rec_f
) NULL
);
/*
 * Reconstruction selector used by vdev_raidz_math_reconstruct() as the second
 * of its three selectors. Visible choices: RAIDZ_REC_P if P is valid, else
 * RAIDZ_REC_Q if Q is valid; RAIDZ_REC_PQ when two data columns are bad and
 * both P and Q are valid; NULL otherwise.
 *
 * NOTE(review): read literally, the nbaddata == 2 branch could never be taken,
 * because it needs parity_valid[CODE_P], which the first test already
 * consumed. The embedded numbering skips lines just before the first test, so
 * an enclosing "nbaddata == 1" check is presumably missing from this copy --
 * confirm against the upstream file. The return type and the nbaddata
 * parameter declaration are not visible either.
 */
180 reconstruct_fun_pq_sel(raidz_map_t
*rm
, const int *parity_valid
,
184 if (parity_valid
[CODE_P
]) {
185 return (rm
->rm_ops
->rec
[RAIDZ_REC_P
]);
186 } else if (parity_valid
[CODE_Q
]) {
187 return (rm
->rm_ops
->rec
[RAIDZ_REC_Q
]);
189 } else if (nbaddata
== 2 &&
190 parity_valid
[CODE_P
] && parity_valid
[CODE_Q
]) {
191 return (rm
->rm_ops
->rec
[RAIDZ_REC_PQ
]);
/* No usable method: the caller maps NULL to RAIDZ_ORIGINAL_IMPL. */
193 return ((raidz_rec_f
) NULL
);
/*
 * Reconstruction selector used by vdev_raidz_math_reconstruct() as the third
 * of its three selectors. Visible choices:
 *   - a single-parity method (P, then Q, then R) for the first valid parity;
 *   - with two bad data columns, the first valid pair among PQ, PR, QR;
 *   - with three bad data columns and P, Q, R all valid, RAIDZ_REC_PQR;
 *   - NULL otherwise.
 *
 * NOTE(review): as in the PQ selector, the single-parity tests are presumably
 * wrapped in an "nbaddata == 1" check whose line is missing from this copy
 * (the embedded numbering skips lines before the first test); without it the
 * later branches would be unreachable whenever any parity is valid. The
 * return type and the nbaddata parameter declaration are not visible either.
 */
197 reconstruct_fun_pqr_sel(raidz_map_t
*rm
, const int *parity_valid
,
201 if (parity_valid
[CODE_P
]) {
202 return (rm
->rm_ops
->rec
[RAIDZ_REC_P
]);
203 } else if (parity_valid
[CODE_Q
]) {
204 return (rm
->rm_ops
->rec
[RAIDZ_REC_Q
]);
205 } else if (parity_valid
[CODE_R
]) {
206 return (rm
->rm_ops
->rec
[RAIDZ_REC_R
]);
/* Two bad data columns: need any two valid parity columns. */
208 } else if (nbaddata
== 2) {
209 if (parity_valid
[CODE_P
] && parity_valid
[CODE_Q
]) {
210 return (rm
->rm_ops
->rec
[RAIDZ_REC_PQ
]);
211 } else if (parity_valid
[CODE_P
] && parity_valid
[CODE_R
]) {
212 return (rm
->rm_ops
->rec
[RAIDZ_REC_PR
]);
213 } else if (parity_valid
[CODE_Q
] && parity_valid
[CODE_R
]) {
214 return (rm
->rm_ops
->rec
[RAIDZ_REC_QR
]);
/* Three bad data columns: all three parity columns must be valid. */
216 } else if (nbaddata
== 3 &&
217 parity_valid
[CODE_P
] && parity_valid
[CODE_Q
] &&
218 parity_valid
[CODE_R
]) {
219 return (rm
->rm_ops
->rec
[RAIDZ_REC_PQR
]);
/* No usable method: the caller maps NULL to RAIDZ_ORIGINAL_IMPL. */
221 return ((raidz_rec_f
) NULL
);
225 * Select data reconstruction method for raidz_map
226 * @parity_valid - Parity validity flag
227 * @dt - Failed data index array
228 * @nbaddata - Number of failed data columns
/*
 * Picks a reconstruction method with one of the reconstruct_fun_p_sel(),
 * reconstruct_fun_pq_sel() or reconstruct_fun_pqr_sel() helpers, chosen by
 * raidz_parity(rm), then runs it as rec_data(rm, dt) and returns its result.
 * Returns RAIDZ_ORIGINAL_IMPL when no method was selected (rec_data == NULL).
 * An unexpected parity value reaches cmn_err(CE_PANIC, ...).
 *
 * NOTE(review): the return type, the case labels and the remaining cmn_err()
 * argument are not visible in this copy.
 */
231 vdev_raidz_math_reconstruct(raidz_map_t
*rm
, const int *parity_valid
,
232 const int *dt
, const int nbaddata
)
234 raidz_rec_f rec_data
= NULL
;
236 switch (raidz_parity(rm
)) {
238 rec_data
= reconstruct_fun_p_sel(rm
, parity_valid
, nbaddata
);
241 rec_data
= reconstruct_fun_pq_sel(rm
, parity_valid
, nbaddata
);
244 rec_data
= reconstruct_fun_pqr_sel(rm
, parity_valid
, nbaddata
);
247 cmn_err(CE_PANIC
, "invalid RAID-Z configuration %d",
/* A NULL method tells the caller to fall back to the original code path. */
252 if (rec_data
== NULL
)
253 return (RAIDZ_ORIGINAL_IMPL
);
255 return (rec_data(rm
, dt
));
/*
 * Column labels for the benchmark kstat. raidz_math_kstat_headers() and
 * raidz_math_kstat_data() use ARRAY_SIZE() of these arrays as the number of
 * generation (3) and reconstruction (7) columns.
 * NOTE(review): presumably the entry order must match the RAIDZ_GEN_x and
 * RAIDZ_REC_x index constants -- confirm against their definitions. The
 * closing braces of both initializers are missing from this copy.
 */
258 const char *raidz_gen_name
[] = {
259 "gen_p", "gen_pq", "gen_pqr"
261 const char *raidz_rec_name
[] = {
262 "rec_p", "rec_q", "rec_r",
263 "rec_pq", "rec_pr", "rec_qr", "rec_pqr"
/*
 * Minimum buffer size asserted by the kstat formatters below: a 17-character
 * name column, ten 12-character columns (3 gen + 7 rec names) and one more
 * byte.
 * NOTE(review): the final "+ 1" covers either the newline or the terminating
 * NUL but not both; snprintf() truncates rather than overflows, so at exactly
 * this size the trailing newline would be dropped -- confirm callers pass a
 * larger buffer.
 */
266 #define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1)
/*
 * Raw-kstat header callback (registered via kstat_set_raw_ops() in
 * vdev_raidz_math_init()). Writes one line into buf: the left-aligned label
 * "implementation", then one 12-wide column per raidz_gen_name[] and
 * raidz_rec_name[] entry, then a newline.
 *
 * NOTE(review): the return type, the declarations of i and off, and the
 * trailing snprintf() arguments in both loops are not visible in this copy.
 */
269 raidz_math_kstat_headers(char *buf
, size_t size
)
274 ASSERT3U(size
, >=, RAIDZ_KSTAT_LINE_LEN
);
/* off tracks how many characters have been produced so far. */
276 off
= snprintf(buf
, size
, "%-17s", "implementation");
278 for (i
= 0; i
< ARRAY_SIZE(raidz_gen_name
); i
++)
279 off
+= snprintf(buf
+ off
, size
- off
, "%-12s",
282 for (i
= 0; i
< ARRAY_SIZE(raidz_rec_name
); i
++)
283 off
+= snprintf(buf
+ off
, size
- off
, "%-12s",
286 (void) snprintf(buf
+ off
, size
- off
, "\n");
/*
 * Raw-kstat data callback (registered via kstat_set_raw_ops() in
 * vdev_raidz_math_init()). Formats one row of raidz_impl_kstats[] into buf.
 *
 * data points at the row to print (it is what raidz_math_kstat_addr() stored
 * in ks_private). Two layouts are produced:
 *   - for the extra "fastest" row, each column holds the NAME of the
 *     implementation whose index is stored in that slot;
 *   - for any other row, the first column is the implementation name and the
 *     remaining columns are the stored values printed as unsigned numbers.
 *
 * NOTE(review): the return type, the declarations of i and off, and the
 * "else" that separates the two layouts are not visible in this copy.
 */
292 raidz_math_kstat_data(char *buf
, size_t size
, void *data
)
/* fstat: the extra row one past the last supported implementation. */
294 raidz_impl_kstat_t
* fstat
= &raidz_impl_kstats
[raidz_supp_impl_cnt
];
295 raidz_impl_kstat_t
* cstat
= (raidz_impl_kstat_t
*) data
;
299 ASSERT3U(size
, >=, RAIDZ_KSTAT_LINE_LEN
);
301 if (cstat
== fstat
) {
302 off
+= snprintf(buf
+ off
, size
- off
, "%-17s", "fastest");
304 for (i
= 0; i
< ARRAY_SIZE(raidz_gen_name
); i
++) {
/* In the fastest row the slot holds an index into raidz_supp_impl[]. */
305 int id
= fstat
->gen
[i
];
306 off
+= snprintf(buf
+ off
, size
- off
, "%-12s",
307 raidz_supp_impl
[id
]->name
);
309 for (i
= 0; i
< ARRAY_SIZE(raidz_rec_name
); i
++) {
310 int id
= fstat
->rec
[i
];
311 off
+= snprintf(buf
+ off
, size
- off
, "%-12s",
312 raidz_supp_impl
[id
]->name
);
/* Row index recovered by pointer subtraction; it doubles as the impl index. */
315 ptrdiff_t id
= cstat
- raidz_impl_kstats
;
317 off
+= snprintf(buf
+ off
, size
- off
, "%-17s",
318 raidz_supp_impl
[id
]->name
);
320 for (i
= 0; i
< ARRAY_SIZE(raidz_gen_name
); i
++)
321 off
+= snprintf(buf
+ off
, size
- off
, "%-12llu",
322 (u_longlong_t
) cstat
->gen
[i
]);
324 for (i
= 0; i
< ARRAY_SIZE(raidz_rec_name
); i
++)
325 off
+= snprintf(buf
+ off
, size
- off
, "%-12llu",
326 (u_longlong_t
) cstat
->rec
[i
]);
329 (void) snprintf(buf
+ off
, size
- off
, "\n");
/*
 * Raw-kstat address callback (registered via kstat_set_raw_ops() in
 * vdev_raidz_math_init()). Maps row number n to the address of
 * raidz_impl_kstats[n], stores it in ksp->ks_private and returns it.
 * The bound is inclusive (n <= raidz_supp_impl_cnt) because the row at index
 * raidz_supp_impl_cnt is the extra "fastest" row. Out-of-range n stores and
 * returns NULL.
 *
 * NOTE(review): the return type and the "else" keyword between the two
 * assignments are not visible in this copy.
 */
335 raidz_math_kstat_addr(kstat_t
*ksp
, loff_t n
)
337 if (n
<= raidz_supp_impl_cnt
)
338 ksp
->ks_private
= (void *) (raidz_impl_kstats
+ n
);
340 ksp
->ks_private
= NULL
;
342 return (ksp
->ks_private
);
/*
 * Benchmark geometry and duration:
 *   BENCH_D_COLS   - number of data columns in the benchmark map
 *   BENCH_COLS     - data columns plus PARITY_PQR parity columns
 *   BENCH_ZIO_SIZE - size of the fake zio's data buffer
 *   BENCH_NS       - minimum time each measurement loop keeps running
 */
345 #define BENCH_D_COLS (8ULL)
346 #define BENCH_COLS (BENCH_D_COLS + PARITY_PQR)
347 #define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */
348 #define BENCH_NS MSEC2NSEC(25) /* 25ms */
/*
 * Signature shared by benchmark_gen_impl() and benchmark_rec_impl(); fn is the
 * index of the method being measured.
 */
350 typedef void (*benchmark_fn
)(raidz_map_t
*rm
, const int fn
);
/*
 * benchmark_fn callback for parity generation: runs
 * vdev_raidz_generate_parity() once on rm. fn is not used by the visible
 * statement.
 * NOTE(review): presumably the map's parity count, not fn, selects which
 * generation method runs -- confirm. Lines are missing from this copy around
 * the call.
 */
353 benchmark_gen_impl(raidz_map_t
*rm
, const int fn
)
356 vdev_raidz_generate_parity(rm
);
/*
 * benchmark_fn callback for reconstruction: runs vdev_raidz_reconstruct() once
 * on rm with the three "bad" column indices listed for method fn.
 *
 * Each rec_tgt row names three failed columns. Going by the row comments,
 * columns 0, 1, 2 are P, Q, R and columns 3, 4, 5 are D[0], D[1], D[2], so a
 * row marks as bad every parity column the method must NOT use plus the data
 * columns it has to rebuild.
 * NOTE(review): fn indexes rec_tgt without a range check; the visible caller
 * loops fn below RAIDZ_REC_NUM, presumably 7 -- confirm.
 */
360 benchmark_rec_impl(raidz_map_t
*rm
, const int fn
)
362 static const int rec_tgt
[7][3] = {
363 {1, 2, 3}, /* rec_p: bad QR & D[0] */
364 {0, 2, 3}, /* rec_q: bad PR & D[0] */
365 {0, 1, 3}, /* rec_r: bad PQ & D[0] */
366 {2, 3, 4}, /* rec_pq: bad R & D[0][1] */
367 {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */
368 {0, 3, 4}, /* rec_qr: bad P & D[0][1] */
369 {3, 4, 5} /* rec_pqr: no bad parity, bad D[0][1][2] */
372 vdev_raidz_reconstruct(rm
, rec_tgt
[fn
], 3);
376 * Benchmarking of all supported implementations (raidz_supp_impl_cnt)
377 * is performed by setting the rm_ops pointer and calling the top level
378 * generate/reconstruct methods of bench_rm.
/*
 * Measures method fn of every supported implementation on bench_rm and
 * records the results.
 *
 * For each entry of raidz_supp_impl[] the function installs it as
 * bench_rm->rm_ops, calls bench_fn in batches of 25 until at least BENCH_NS
 * has elapsed, and converts the run count into a speed value stored in
 * raidz_impl_kstats[impl].gen[fn] or .rec[fn] (chosen by comparing bench_fn
 * with benchmark_gen_impl). When a speed beats the best seen so far, the
 * "fastest" kstat row records the implementation index and the matching slot
 * of vdev_raidz_fastest_impl is assigned.
 *
 * NOTE(review): several lines are missing from this copy: the return type,
 * the declarations of impl and i, the "do {" with the run_cnt reset, the
 * update of best_speed, and the right-hand sides of the two
 * vdev_raidz_fastest_impl assignments.
 */
381 benchmark_raidz_impl(raidz_map_t
*bench_rm
, const int fn
, benchmark_fn bench_fn
)
383 uint64_t run_cnt
, speed
, best_speed
= 0;
384 hrtime_t t_start
, t_diff
;
385 raidz_impl_ops_t
*curr_impl
;
/* fstat: the extra "fastest" row one past the last supported implementation. */
386 raidz_impl_kstat_t
* fstat
= &raidz_impl_kstats
[raidz_supp_impl_cnt
];
389 for (impl
= 0; impl
< raidz_supp_impl_cnt
; impl
++) {
390 /* set an implementation to benchmark */
391 curr_impl
= raidz_supp_impl
[impl
];
392 bench_rm
->rm_ops
= curr_impl
;
395 t_start
= gethrtime();
/* Batches of 25 calls keep the gethrtime() overhead out of the inner loop. */
398 for (i
= 0; i
< 25; i
++, run_cnt
++)
399 bench_fn(bench_rm
, fn
);
401 t_diff
= gethrtime() - t_start
;
402 } while (t_diff
< BENCH_NS
);
/*
 * Bytes processed scaled to one second, then divided by elapsed nanoseconds
 * and by the column count, giving a per-column figure.
 */
404 speed
= run_cnt
* BENCH_ZIO_SIZE
* NANOSEC
;
405 speed
/= (t_diff
* BENCH_COLS
);
407 if (bench_fn
== benchmark_gen_impl
)
408 raidz_impl_kstats
[impl
].gen
[fn
] = speed
;
410 raidz_impl_kstats
[impl
].rec
[fn
] = speed
;
412 /* Update fastest implementation method */
413 if (speed
> best_speed
) {
416 if (bench_fn
== benchmark_gen_impl
) {
/* The fastest row stores the implementation index, not a speed. */
417 fstat
->gen
[fn
] = impl
;
418 vdev_raidz_fastest_impl
.gen
[fn
] =
421 fstat
->rec
[fn
] = impl
;
422 vdev_raidz_fastest_impl
.rec
[fn
] =
/*
 * One-time setup of the raidz math layer.
 *
 * Visible steps:
 *   1. Copy every raidz_all_maths[] entry whose is_supported() returns true
 *      into raidz_supp_impl[], then publish the count.
 *   2. Userspace (!_KERNEL) only: skip benchmarking, copy the last supported
 *      implementation into vdev_raidz_fastest_impl, rename it "fastest", mark
 *      the layer initialized and select the "cycle" method.
 *   3. Otherwise build a fake zio, benchmark every generation method and every
 *      reconstruction method with benchmark_raidz_impl(), and free the zio.
 *   4. Create and install the "vdev_raidz_bench" raw kstat.
 *   5. Copy user_sel_impl into zfs_vdev_raidz_impl and mark the layer
 *      initialized.
 *
 * NOTE(review): the return type, the declarations of i, c and fn, the
 * statements under the "initialize impl" comment, and the lines that end the
 * userspace section (presumably an early return and #endif) are not visible
 * in this copy.
 */
430 vdev_raidz_math_init(void)
432 raidz_impl_ops_t
*curr_impl
;
433 zio_t
*bench_zio
= NULL
;
434 raidz_map_t
*bench_rm
= NULL
;
435 uint64_t bench_parity
;
438 /* move supported impl into raidz_supp_impl */
439 for (i
= 0, c
= 0; i
< ARRAY_SIZE(raidz_all_maths
); i
++) {
/* The cast drops the const qualifier of the table entries. */
440 curr_impl
= (raidz_impl_ops_t
*) raidz_all_maths
[i
];
442 /* initialize impl */
446 if (curr_impl
->is_supported())
447 raidz_supp_impl
[c
++] = (raidz_impl_ops_t
*) curr_impl
;
449 membar_producer(); /* complete raidz_supp_impl[] init */
450 raidz_supp_impl_cnt
= c
; /* number of supported impl */
452 #if !defined(_KERNEL)
453 /* Skip benchmarking and use last implementation as fastest */
/*
 * NOTE(review): indexes raidz_supp_impl_cnt - 1, so this relies on at least
 * one implementation being supported -- confirm.
 */
454 memcpy(&vdev_raidz_fastest_impl
, raidz_supp_impl
[raidz_supp_impl_cnt
-1],
455 sizeof (vdev_raidz_fastest_impl
));
/* The memcpy() above also copied the source's name; restore "fastest". */
456 strcpy(vdev_raidz_fastest_impl
.name
, "fastest");
/* Must be set before vdev_raidz_impl_set(), which asserts on it. */
458 raidz_math_initialized
= B_TRUE
;
460 /* Use 'cycle' math selection method for userspace */
461 VERIFY0(vdev_raidz_impl_set("cycle"));
465 /* Fake a zio and run the benchmark on it */
466 bench_zio
= kmem_zalloc(sizeof (zio_t
), KM_SLEEP
);
467 bench_zio
->io_offset
= 0;
468 bench_zio
->io_size
= BENCH_ZIO_SIZE
; /* only data columns */
469 bench_zio
->io_data
= zio_data_buf_alloc(BENCH_ZIO_SIZE
);
470 VERIFY(bench_zio
->io_data
);
471 memset(bench_zio
->io_data
, 0xAA, BENCH_ZIO_SIZE
); /* warm up */
473 /* Benchmark parity generation methods */
474 for (fn
= 0; fn
< RAIDZ_GEN_NUM
; fn
++) {
/* Generation method index fn is benchmarked on a map with fn + 1 parity columns. */
475 bench_parity
= fn
+ 1;
476 /* New raidz_map is needed for each generate_p/q/r */
477 bench_rm
= vdev_raidz_map_alloc(bench_zio
, SPA_MINBLOCKSHIFT
,
478 BENCH_D_COLS
+ bench_parity
, bench_parity
);
480 benchmark_raidz_impl(bench_rm
, fn
, benchmark_gen_impl
);
482 vdev_raidz_map_free(bench_rm
);
485 /* Benchmark data reconstruction methods */
/* A single triple-parity map is reused for all reconstruction methods. */
486 bench_rm
= vdev_raidz_map_alloc(bench_zio
, SPA_MINBLOCKSHIFT
,
487 BENCH_COLS
, PARITY_PQR
);
489 for (fn
= 0; fn
< RAIDZ_REC_NUM
; fn
++)
490 benchmark_raidz_impl(bench_rm
, fn
, benchmark_rec_impl
);
492 vdev_raidz_map_free(bench_rm
);
494 /* cleanup the bench zio */
495 zio_data_buf_free(bench_zio
->io_data
, BENCH_ZIO_SIZE
);
496 kmem_free(bench_zio
, sizeof (zio_t
));
498 /* install kstats for all impl */
499 raidz_math_kstat
= kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
500 KSTAT_TYPE_RAW
, 0, KSTAT_FLAG_VIRTUAL
);
/* kstat creation failure is tolerated: the kstat is simply not published. */
502 if (raidz_math_kstat
!= NULL
) {
503 raidz_math_kstat
->ks_data
= NULL
;
504 raidz_math_kstat
->ks_ndata
= UINT32_MAX
;
505 kstat_set_raw_ops(raidz_math_kstat
,
506 raidz_math_kstat_headers
,
507 raidz_math_kstat_data
,
508 raidz_math_kstat_addr
);
509 kstat_install(raidz_math_kstat
);
512 /* Finish initialization */
/* Apply whatever selection was requested before init ran. */
513 atomic_swap_32(&zfs_vdev_raidz_impl
, user_sel_impl
);
514 raidz_math_initialized
= B_TRUE
;
/*
 * Teardown counterpart of vdev_raidz_math_init(): deletes the benchmark kstat
 * if it was created and resets the pointer to NULL, then walks every
 * raidz_all_maths[] entry.
 *
 * NOTE(review): the return type, the declaration of i and the body of the
 * loop after the curr_impl assignment are not visible in this copy;
 * presumably each implementation gets a per-impl cleanup call -- confirm.
 */
518 vdev_raidz_math_fini(void)
520 raidz_impl_ops_t
const *curr_impl
;
523 if (raidz_math_kstat
!= NULL
) {
524 kstat_delete(raidz_math_kstat
);
525 raidz_math_kstat
= NULL
;
529 for (i
= 0; i
< ARRAY_SIZE(raidz_all_maths
); i
++) {
530 curr_impl
= raidz_all_maths
[i
];
/*
 * Name-to-selector table for the options that are always accepted by
 * zfs_vdev_raidz_impl_set(); "cycle" exists only in userspace builds.
 * Keep "original" and "scalar" as the last two entries:
 * zfs_vdev_raidz_impl_get() lists only the first ARRAY_SIZE - 2 rows.
 *
 * NOTE(review): the struct member declarations (used elsewhere as .name and
 * .sel) and the closing lines are not visible in this copy.
 */
536 static const struct {
539 } math_impl_opts
[] = {
540 #if !defined(_KERNEL)
541 { "cycle", IMPL_CYCLE
},
543 { "fastest", IMPL_FASTEST
},
544 { "original", IMPL_ORIGINAL
},
545 { "scalar", IMPL_SCALAR
}
549 * Function sets desired raidz implementation.
551 * If we are called before init(), user preference will be saved in
552 * user_sel_impl, and applied in later init() call. This occurs when module
553 * parameter is specified on module load. Otherwise, directly update
554 * zfs_vdev_raidz_impl.
556 * @val Name of raidz implementation to use
/*
 * Parses an implementation name and records the matching selector.
 *
 * val is copied into a local buffer of RAIDZ_IMPL_NAME_MAX bytes and trailing
 * whitespace is scanned from the end. The name is first looked up in
 * math_impl_opts[]; if that did not succeed and init has already run, it is
 * compared against the names in raidz_supp_impl[]. The selector is then
 * stored with atomic_swap_32(): into zfs_vdev_raidz_impl when the layer is
 * initialized, and into user_sel_impl.
 *
 * kp is not referenced by the visible code.
 *
 * NOTE(review): many lines are missing from this copy: the return type, the
 * declarations of i and err, every return statement, the body of the
 * whitespace-trimming loop, the bodies of both name-match branches, and
 * whatever guards the final stores (presumably "if (err == 0)" with an "else"
 * between the two swaps) -- confirm against the upstream file.
 */
560 zfs_vdev_raidz_impl_set(const char *val
, struct kernel_param
*kp
)
563 char req_name
[RAIDZ_IMPL_NAME_MAX
];
/* Default to the current user selection if nothing below overrides it. */
564 uint32_t impl
= RAIDZ_IMPL_READ(user_sel_impl
);
/* Reject empty names and names too long to fit with a terminator. */
568 i
= strnlen(val
, RAIDZ_IMPL_NAME_MAX
);
569 if (i
== 0 || i
== RAIDZ_IMPL_NAME_MAX
)
572 strlcpy(req_name
, val
, RAIDZ_IMPL_NAME_MAX
);
/*
 * NOTE(review): isspace() receives a plain char here; if char is signed and
 * the input can contain bytes above 0x7f, a cast to unsigned char is needed
 * with the standard C library version -- confirm which isspace() is in use.
 */
573 while (i
> 0 && !!isspace(req_name
[i
-1]))
577 /* Check mandatory options */
578 for (i
= 0; i
< ARRAY_SIZE(math_impl_opts
); i
++) {
579 if (strcmp(req_name
, math_impl_opts
[i
].name
) == 0) {
580 impl
= math_impl_opts
[i
].sel
;
586 /* check all supported impl if init() was already called */
587 if (err
!= 0 && raidz_math_initialized
) {
588 /* check all supported implementations */
589 for (i
= 0; i
< raidz_supp_impl_cnt
; i
++) {
590 if (strcmp(req_name
, raidz_supp_impl
[i
]->name
) == 0) {
599 if (raidz_math_initialized
)
600 atomic_swap_32(&zfs_vdev_raidz_impl
, impl
);
602 atomic_swap_32(&user_sel_impl
, impl
);
/*
 * Convenience wrapper: asserts that the math layer is initialized, then
 * forwards val to zfs_vdev_raidz_impl_set() with a NULL kernel_param and
 * returns its result. Used by vdev_raidz_math_init() to select "cycle" in
 * userspace builds.
 * NOTE(review): the return type line is not visible in this copy.
 */
609 vdev_raidz_impl_set(const char *val
)
611 ASSERT(raidz_math_initialized
);
613 return (zfs_vdev_raidz_impl_set(val
, NULL
));
616 #if defined(_KERNEL) && defined(HAVE_SPL)
/*
 * Module-parameter getter (registered with module_param_call() below).
 * Writes a space-separated list of selectable names into buffer and wraps the
 * currently active one in square brackets.
 *
 * The first loop prints math_impl_opts[] except its last two rows
 * ("original" and "scalar"). NOTE(review): presumably because those two names
 * are already printed by the second loop from raidz_supp_impl[] -- confirm.
 * The second loop prints every supported implementation and brackets the one
 * whose index equals the active selector.
 *
 * kp is not referenced by the visible code.
 *
 * NOTE(review): sprintf() is unbounded here; presumably buffer is the
 * page-sized buffer the kernel passes to parameter getters -- confirm. The
 * return type, the declarations of i, cnt and fmt, and the return statement
 * are not visible in this copy.
 */
618 zfs_vdev_raidz_impl_get(char *buffer
, struct kernel_param
*kp
)
622 const uint32_t impl
= RAIDZ_IMPL_READ(zfs_vdev_raidz_impl
);
624 ASSERT(raidz_math_initialized
);
626 /* list mandatory options */
627 for (i
= 0; i
< ARRAY_SIZE(math_impl_opts
) - 2; i
++) {
628 fmt
= (impl
== math_impl_opts
[i
].sel
) ? "[%s] " : "%s ";
629 cnt
+= sprintf(buffer
+ cnt
, fmt
, math_impl_opts
[i
].name
);
632 /* list all supported implementations */
633 for (i
= 0; i
< raidz_supp_impl_cnt
; i
++) {
634 fmt
= (i
== impl
) ? "[%s] " : "%s ";
635 cnt
+= sprintf(buffer
+ cnt
, fmt
, raidz_supp_impl
[i
]->name
);
/*
 * Registers the zfs_vdev_raidz_impl module parameter with
 * zfs_vdev_raidz_impl_set() and zfs_vdev_raidz_impl_get() as its accessors,
 * no backing argument (NULL) and permission bits 0644.
 */
641 module_param_call(zfs_vdev_raidz_impl
, zfs_vdev_raidz_impl_set
,
642 zfs_vdev_raidz_impl_get
, NULL
, 0644);
643 MODULE_PARM_DESC(zfs_vdev_raidz_impl
, "Select raidz implementation.");