]> git.proxmox.com Git - mirror_zfs.git/blame - module/zcommon/zfs_fletcher.c
zstreamdump needs to initialize fletcher 4 support
[mirror_zfs.git] / module / zcommon / zfs_fletcher.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
9babb374 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
34dc7c2f 23 * Use is subject to license terms.
fc897b24 24 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
34dc7c2f 25 */
3c67d83a
TH
26/*
27 * Copyright 2013 Saso Kiselkov. All rights reserved.
28 */
34dc7c2f 29
9babb374
BB
30/*
31 * Fletcher Checksums
32 * ------------------
33 *
34 * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
35 * recurrence relations:
36 *
 *	a_i = a_(i-1) + f_(i-1)
 *
 *	b_i = b_(i-1) + a_i
 *
 *	c_i = c_(i-1) + b_i		(fletcher-4 only)
 *
 *	d_i = d_(i-1) + c_i		(fletcher-4 only)
48 *
49 * Where
50 * a_0 = b_0 = c_0 = d_0 = 0
51 * and
52 * f_0 .. f_(n-1) are the input data.
53 *
54 * Using standard techniques, these translate into the following series:
55 *
 *	a_n = SUM(i = 1 .. n)  f_(n-i)
 *
 *	b_n = SUM(i = 1 .. n)  i * f_(n-i)
 *
 *	c_n = SUM(i = 1 .. n)  (i*(i+1) / 2) * f_(n-i)
 *
 *	d_n = SUM(i = 1 .. n)  (i*(i+1)*(i+2) / 6) * f_(n-i)
68 *
69 * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
70 * Since the additions are done mod (2^64), errors in the high bits may not
71 * be noticed. For this reason, fletcher-2 is deprecated.
72 *
73 * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
74 * A conservative estimate of how big the buffer can get before we overflow
75 * can be estimated using f_i = 0xffffffff for all i:
76 *
77 * % bc
78 * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
79 * 2264
80 * quit
81 * %
82 *
83 * So blocks of up to 2k will not overflow. Our largest block size is
84 * 128k, which has 32k 4-byte words, so we can compute the largest possible
85 * accumulators, then divide by 2^64 to figure the max amount of overflow:
86 *
87 * % bc
88 * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
89 * a/2^64;b/2^64;c/2^64;d/2^64
90 * 0
91 * 0
92 * 1365
93 * 11186858
94 * quit
95 * %
96 *
97 * So a and b cannot overflow. To make sure each bit of input has some
98 * effect on the contents of c and d, we can look at what the factors of
99 * the coefficients in the equations for c_n and d_n are. The number of 2s
100 * in the factors determines the lowest set bit in the multiplier. Running
101 * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
102 * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow
 * the 64-bit accumulators, every bit of every f_i affects every accumulator,
104 * even for 128k blocks.
105 *
106 * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
107 * we could do our calculations mod (2^32 - 1) by adding in the carries
108 * periodically, and store the number of carries in the top 32-bits.
109 *
110 * --------------------
111 * Checksum Performance
112 * --------------------
113 *
114 * There are two interesting components to checksum performance: cached and
115 * uncached performance. With cached data, fletcher-2 is about four times
116 * faster than fletcher-4. With uncached data, the performance difference is
117 * negligible, since the cost of a cache fill dominates the processing time.
118 * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
119 * efficient pass over the data.
120 *
121 * In normal operation, the data which is being checksummed is in a buffer
122 * which has been filled either by:
123 *
124 * 1. a compression step, which will be mostly cached, or
125 * 2. a bcopy() or copyin(), which will be uncached (because the
126 * copy is cache-bypassing).
127 *
128 * For both cached and uncached data, both fletcher checksums are much faster
129 * than sha-256, and slower than 'off', which doesn't touch the data at all.
130 */
34dc7c2f
BB
131
132#include <sys/types.h>
133#include <sys/sysmacros.h>
134#include <sys/byteorder.h>
135#include <sys/spa.h>
fc897b24 136#include <sys/zio_checksum.h>
1eeb4562
JX
137#include <sys/zfs_context.h>
138#include <zfs_fletcher.h>
139
fc897b24 140
5bf703b8
GN
141static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx);
142static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp);
143static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx,
144 const void *buf, uint64_t size);
145static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx,
146 const void *buf, uint64_t size);
1eeb4562
JX
147static boolean_t fletcher_4_scalar_valid(void);
148
149static const fletcher_4_ops_t fletcher_4_scalar_ops = {
fc897b24 150 .init_native = fletcher_4_scalar_init,
5bf703b8 151 .fini_native = fletcher_4_scalar_fini,
fc897b24
GN
152 .compute_native = fletcher_4_scalar_native,
153 .init_byteswap = fletcher_4_scalar_init,
5bf703b8 154 .fini_byteswap = fletcher_4_scalar_fini,
1eeb4562
JX
155 .compute_byteswap = fletcher_4_scalar_byteswap,
156 .valid = fletcher_4_scalar_valid,
157 .name = "scalar"
158};
159
fc897b24
GN
160static fletcher_4_ops_t fletcher_4_fastest_impl = {
161 .name = "fastest",
162 .valid = fletcher_4_scalar_valid
163};
164
165static const fletcher_4_ops_t *fletcher_4_impls[] = {
1eeb4562 166 &fletcher_4_scalar_ops,
7f319493
RD
167 &fletcher_4_superscalar_ops,
168 &fletcher_4_superscalar4_ops,
35a76a03
TS
169#if defined(HAVE_SSE2)
170 &fletcher_4_sse2_ops,
171#endif
172#if defined(HAVE_SSE2) && defined(HAVE_SSSE3)
173 &fletcher_4_ssse3_ops,
174#endif
1eeb4562
JX
175#if defined(HAVE_AVX) && defined(HAVE_AVX2)
176 &fletcher_4_avx2_ops,
177#endif
70b258fc
GN
178#if defined(__x86_64) && defined(HAVE_AVX512F)
179 &fletcher_4_avx512f_ops,
180#endif
24cdeaf1
RD
181#if defined(__aarch64__)
182 &fletcher_4_aarch64_neon_ops,
183#endif
1eeb4562
JX
184};
185
fc897b24
GN
186/* Hold all supported implementations */
187static uint32_t fletcher_4_supp_impls_cnt = 0;
188static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)];
189
190/* Select fletcher4 implementation */
191#define IMPL_FASTEST (UINT32_MAX)
192#define IMPL_CYCLE (UINT32_MAX - 1)
193#define IMPL_SCALAR (0)
194
195static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST;
196
197#define IMPL_READ(i) (*(volatile uint32_t *) &(i))
1eeb4562
JX
198
199static struct fletcher_4_impl_selector {
fc897b24
GN
200 const char *fis_name;
201 uint32_t fis_sel;
1eeb4562 202} fletcher_4_impl_selectors[] = {
1eeb4562 203#if !defined(_KERNEL)
fc897b24 204 { "cycle", IMPL_CYCLE },
1eeb4562 205#endif
fc897b24
GN
206 { "fastest", IMPL_FASTEST },
207 { "scalar", IMPL_SCALAR }
1eeb4562
JX
208};
209
1eeb4562
JX
210static kstat_t *fletcher_4_kstat;
211
fc897b24
GN
212static struct fletcher_4_kstat {
213 uint64_t native;
214 uint64_t byteswap;
215} fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
216
217/* Indicate that benchmark has been completed */
218static boolean_t fletcher_4_initialized = B_FALSE;
34dc7c2f 219
3c67d83a 220/*ARGSUSED*/
34dc7c2f 221void
3c67d83a
TH
222fletcher_2_native(const void *buf, uint64_t size,
223 const void *ctx_template, zio_cksum_t *zcp)
34dc7c2f
BB
224{
225 const uint64_t *ip = buf;
226 const uint64_t *ipend = ip + (size / sizeof (uint64_t));
227 uint64_t a0, b0, a1, b1;
228
229 for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
230 a0 += ip[0];
231 a1 += ip[1];
232 b0 += a0;
233 b1 += a1;
234 }
235
236 ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
237}
238
3c67d83a 239/*ARGSUSED*/
34dc7c2f 240void
3c67d83a
TH
241fletcher_2_byteswap(const void *buf, uint64_t size,
242 const void *ctx_template, zio_cksum_t *zcp)
34dc7c2f
BB
243{
244 const uint64_t *ip = buf;
245 const uint64_t *ipend = ip + (size / sizeof (uint64_t));
246 uint64_t a0, b0, a1, b1;
247
248 for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
249 a0 += BSWAP_64(ip[0]);
250 a1 += BSWAP_64(ip[1]);
251 b0 += a0;
252 b1 += a1;
253 }
254
255 ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
256}
257
fc897b24 258static void
5bf703b8 259fletcher_4_scalar_init(fletcher_4_ctx_t *ctx)
34dc7c2f 260{
5bf703b8
GN
261 ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0);
262}
263
264static void
265fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
266{
267 memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t));
34dc7c2f
BB
268}
269
1eeb4562 270static void
5bf703b8
GN
271fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf,
272 uint64_t size)
34dc7c2f
BB
273{
274 const uint32_t *ip = buf;
275 const uint32_t *ipend = ip + (size / sizeof (uint32_t));
276 uint64_t a, b, c, d;
277
5bf703b8
GN
278 a = ctx->scalar.zc_word[0];
279 b = ctx->scalar.zc_word[1];
280 c = ctx->scalar.zc_word[2];
281 d = ctx->scalar.zc_word[3];
1eeb4562
JX
282
283 for (; ip < ipend; ip++) {
284 a += ip[0];
34dc7c2f
BB
285 b += a;
286 c += b;
287 d += c;
288 }
289
5bf703b8 290 ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
34dc7c2f
BB
291}
292
1eeb4562 293static void
5bf703b8
GN
294fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
295 uint64_t size)
34dc7c2f
BB
296{
297 const uint32_t *ip = buf;
298 const uint32_t *ipend = ip + (size / sizeof (uint32_t));
299 uint64_t a, b, c, d;
300
5bf703b8
GN
301 a = ctx->scalar.zc_word[0];
302 b = ctx->scalar.zc_word[1];
303 c = ctx->scalar.zc_word[2];
304 d = ctx->scalar.zc_word[3];
34dc7c2f
BB
305
306 for (; ip < ipend; ip++) {
1eeb4562 307 a += BSWAP_32(ip[0]);
34dc7c2f
BB
308 b += a;
309 c += b;
310 d += c;
311 }
312
5bf703b8 313 ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
34dc7c2f
BB
314}
315
1eeb4562
JX
316static boolean_t
317fletcher_4_scalar_valid(void)
318{
319 return (B_TRUE);
320}
321
322int
323fletcher_4_impl_set(const char *val)
324{
fc897b24
GN
325 int err = -EINVAL;
326 uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
327 size_t i, val_len;
1eeb4562
JX
328
329 val_len = strlen(val);
330 while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */
331 val_len--;
332
fc897b24 333 /* check mandatory implementations */
1eeb4562
JX
334 for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) {
335 const char *name = fletcher_4_impl_selectors[i].fis_name;
336
337 if (val_len == strlen(name) &&
338 strncmp(val, name, val_len) == 0) {
fc897b24
GN
339 impl = fletcher_4_impl_selectors[i].fis_sel;
340 err = 0;
1eeb4562
JX
341 break;
342 }
343 }
1eeb4562 344
fc897b24
GN
345 if (err != 0 && fletcher_4_initialized) {
346 /* check all supported implementations */
347 for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
348 const char *name = fletcher_4_supp_impls[i]->name;
1eeb4562 349
fc897b24
GN
350 if (val_len == strlen(name) &&
351 strncmp(val, name, val_len) == 0) {
352 impl = i;
353 err = 0;
354 break;
355 }
356 }
357 }
1eeb4562 358
fc897b24
GN
359 if (err == 0) {
360 atomic_swap_32(&fletcher_4_impl_chosen, impl);
361 membar_producer();
362 }
363
364 return (err);
1eeb4562
JX
365}
366
367static inline const fletcher_4_ops_t *
368fletcher_4_impl_get(void)
369{
fc897b24
GN
370 fletcher_4_ops_t *ops = NULL;
371 const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
372
373 switch (impl) {
374 case IMPL_FASTEST:
375 ASSERT(fletcher_4_initialized);
376 ops = &fletcher_4_fastest_impl;
377 break;
1eeb4562 378#if !defined(_KERNEL)
fc897b24
GN
379 case IMPL_CYCLE: {
380 ASSERT(fletcher_4_initialized);
381 ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
382
383 static uint32_t cycle_count = 0;
384 uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
385 ops = fletcher_4_supp_impls[idx];
1eeb4562 386 }
fc897b24 387 break;
1eeb4562 388#endif
fc897b24
GN
389 default:
390 ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
391 ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
392
393 ops = fletcher_4_supp_impls[impl];
394 break;
395 }
396
397 ASSERT3P(ops, !=, NULL);
398
399 return (ops);
400}
401
fc897b24 402static inline void
5bf703b8 403fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
fc897b24 404{
5bf703b8
GN
405 fletcher_4_ctx_t ctx;
406 const fletcher_4_ops_t *ops = fletcher_4_impl_get();
407
408 ops->init_native(&ctx);
409 ops->compute_native(&ctx, buf, size);
410 ops->fini_native(&ctx, zcp);
1eeb4562
JX
411}
412
3c67d83a 413/*ARGSUSED*/
1eeb4562 414void
3c67d83a
TH
415fletcher_4_native(const void *buf, uint64_t size,
416 const void *ctx_template, zio_cksum_t *zcp)
1eeb4562 417{
5bf703b8 418 const uint64_t p2size = P2ALIGN(size, 64);
0dab2e84 419
fc897b24
GN
420 ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
421
5bf703b8 422 if (size == 0 || p2size == 0) {
fc897b24 423 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
5bf703b8
GN
424
425 if (size > 0)
426 fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
427 buf, size);
fc897b24 428 } else {
5bf703b8 429 fletcher_4_native_impl(buf, p2size, zcp);
1eeb4562 430
fc897b24 431 if (p2size < size)
5bf703b8
GN
432 fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
433 (char *)buf + p2size, size - p2size);
fc897b24
GN
434 }
435}
436
437void
438fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp)
439{
5bf703b8
GN
440 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
441 fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
fc897b24
GN
442}
443
444static inline void
5bf703b8 445fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
fc897b24 446{
5bf703b8
GN
447 fletcher_4_ctx_t ctx;
448 const fletcher_4_ops_t *ops = fletcher_4_impl_get();
449
450 ops->init_byteswap(&ctx);
451 ops->compute_byteswap(&ctx, buf, size);
452 ops->fini_byteswap(&ctx, zcp);
1eeb4562
JX
453}
454
3c67d83a 455/*ARGSUSED*/
1eeb4562 456void
3c67d83a
TH
457fletcher_4_byteswap(const void *buf, uint64_t size,
458 const void *ctx_template, zio_cksum_t *zcp)
1eeb4562 459{
5bf703b8 460 const uint64_t p2size = P2ALIGN(size, 64);
0dab2e84 461
fc897b24
GN
462 ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
463
5bf703b8 464 if (size == 0 || p2size == 0) {
fc897b24 465 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
5bf703b8
GN
466
467 if (size > 0)
468 fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
469 buf, size);
fc897b24 470 } else {
5bf703b8 471 fletcher_4_byteswap_impl(buf, p2size, zcp);
1eeb4562 472
fc897b24 473 if (p2size < size)
5bf703b8
GN
474 fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
475 (char *)buf + p2size, size - p2size);
fc897b24 476 }
1eeb4562
JX
477}
478
37f520db
GN
479/* Incremental Fletcher 4 */
480
5bf703b8
GN
481#define ZFS_FLETCHER_4_INC_MAX_SIZE (8ULL << 20)
482
37f520db
GN
483static inline void
484fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size,
485 const zio_cksum_t *nzcp)
486{
487 const uint64_t c1 = size / sizeof (uint32_t);
488 const uint64_t c2 = c1 * (c1 + 1) / 2;
489 const uint64_t c3 = c2 * (c1 + 2) / 3;
490
5bf703b8
GN
491 /*
492 * Value of 'c3' overflows on buffer sizes close to 16MiB. For that
493 * reason we split incremental fletcher4 computation of large buffers
494 * to steps of (ZFS_FLETCHER_4_INC_MAX_SIZE) size.
495 */
496 ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE);
497
37f520db
GN
498 zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] +
499 c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0];
500 zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] +
501 c2 * zcp->zc_word[0];
502 zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0];
503 zcp->zc_word[0] += nzcp->zc_word[0];
504}
505
506static inline void
507fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size,
508 zio_cksum_t *zcp)
509{
37f520db
GN
510 while (size > 0) {
511 zio_cksum_t nzc;
5bf703b8 512 uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE);
37f520db
GN
513
514 if (native)
515 fletcher_4_native(buf, len, NULL, &nzc);
516 else
517 fletcher_4_byteswap(buf, len, NULL, &nzc);
518
519 fletcher_4_incremental_combine(zcp, len, &nzc);
520
521 size -= len;
522 buf += len;
523 }
524}
525
526void
527fletcher_4_incremental_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
528{
5bf703b8
GN
529 /* Use scalar impl to directly update cksum of small blocks */
530 if (size < SPA_MINBLOCKSIZE)
531 fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
532 else
533 fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
37f520db
GN
534}
535
536void
537fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
538 zio_cksum_t *zcp)
539{
5bf703b8
GN
540 /* Use scalar impl to directly update cksum of small blocks */
541 if (size < SPA_MINBLOCKSIZE)
542 fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size);
543 else
544 fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
37f520db
GN
545}
546
547
548/* Fletcher 4 kstats */
549
fc897b24
GN
/*
 * Raw-kstat header callback: writes the three column titles
 * ("implementation", "native", "byteswap") into buf.  Always returns 0.
 */
static int
fletcher_4_kstat_headers(char *buf, size_t size)
{
	ssize_t pos = 0;

	pos += snprintf(buf + pos, size, "%-17s", "implementation");
	pos += snprintf(buf + pos, size - pos, "%-15s", "native");
	(void) snprintf(buf + pos, size - pos, "%-15s\n", "byteswap");

	return (0);
}
561
fc897b24
GN
562static int
563fletcher_4_kstat_data(char *buf, size_t size, void *data)
34dc7c2f 564{
fc897b24
GN
565 struct fletcher_4_kstat *fastest_stat =
566 &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
567 struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat *) data;
568 ssize_t off = 0;
569
570 if (curr_stat == fastest_stat) {
571 off += snprintf(buf + off, size - off, "%-17s", "fastest");
572 off += snprintf(buf + off, size - off, "%-15s",
573 fletcher_4_supp_impls[fastest_stat->native]->name);
574 off += snprintf(buf + off, size - off, "%-15s\n",
575 fletcher_4_supp_impls[fastest_stat->byteswap]->name);
576 } else {
577 ptrdiff_t id = curr_stat - fletcher_4_stat_data;
578
579 off += snprintf(buf + off, size - off, "%-17s",
580 fletcher_4_supp_impls[id]->name);
581 off += snprintf(buf + off, size - off, "%-15llu",
582 (u_longlong_t) curr_stat->native);
583 off += snprintf(buf + off, size - off, "%-15llu\n",
584 (u_longlong_t) curr_stat->byteswap);
585 }
586
587 return (0);
1eeb4562 588}
34dc7c2f 589
fc897b24
GN
590static void *
591fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
1eeb4562 592{
fc897b24
GN
593 if (n <= fletcher_4_supp_impls_cnt)
594 ksp->ks_private = (void *) (fletcher_4_stat_data + n);
595 else
596 ksp->ks_private = NULL;
597
598 return (ksp->ks_private);
599}
600
/*
 * Copy the init/fini/compute function pointers for one byte order
 * (`type' is either native or byteswap) from the ops table `src' into
 * fletcher_4_fastest_impl.
 *
 * Wrapped in do { } while (0) so the macro expands to a single statement
 * that is safe in any if/else context, and `src' is parenthesized so any
 * pointer expression can be passed.
 */
#define	FLETCHER_4_FASTEST_FN_COPY(type, src)				  \
do {									  \
	fletcher_4_fastest_impl.init_ ## type = (src)->init_ ## type;	  \
	fletcher_4_fastest_impl.fini_ ## type = (src)->fini_ ## type;	  \
	fletcher_4_fastest_impl.compute_ ## type = (src)->compute_ ## type; \
} while (0)

/* Minimum time spent benchmarking each implementation. */
#define	FLETCHER_4_BENCH_NS	(MSEC2NSEC(50))		/* 50ms */
34dc7c2f 609
fc897b24
GN
610static void
611fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
612{
613
614 struct fletcher_4_kstat *fastest_stat =
615 &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
616 hrtime_t start;
617 uint64_t run_bw, run_time_ns, best_run = 0;
618 zio_cksum_t zc;
619 uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen);
620
621 zio_checksum_func_t *fletcher_4_test = native ? fletcher_4_native :
622 fletcher_4_byteswap;
1eeb4562 623
fc897b24
GN
624 for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
625 struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i];
626 uint64_t run_count = 0;
1eeb4562 627
fc897b24
GN
628 /* temporary set an implementation */
629 fletcher_4_impl_chosen = i;
1eeb4562
JX
630
631 kpreempt_disable();
632 start = gethrtime();
1eeb4562 633 do {
fc897b24 634 for (l = 0; l < 32; l++, run_count++)
3c67d83a 635 fletcher_4_test(data, data_size, NULL, &zc);
fc897b24
GN
636
637 run_time_ns = gethrtime() - start;
638 } while (run_time_ns < FLETCHER_4_BENCH_NS);
1eeb4562
JX
639 kpreempt_enable();
640
fc897b24
GN
641 run_bw = data_size * run_count * NANOSEC;
642 run_bw /= run_time_ns; /* B/s */
643
644 if (native)
645 stat->native = run_bw;
646 else
647 stat->byteswap = run_bw;
648
649 if (run_bw > best_run) {
650 best_run = run_bw;
651
652 if (native) {
653 fastest_stat->native = i;
654 FLETCHER_4_FASTEST_FN_COPY(native,
655 fletcher_4_supp_impls[i]);
656 } else {
657 fastest_stat->byteswap = i;
658 FLETCHER_4_FASTEST_FN_COPY(byteswap,
659 fletcher_4_supp_impls[i]);
660 }
1eeb4562 661 }
fc897b24
GN
662 }
663
664 /* restore original selection */
665 atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
666}
1eeb4562 667
fc897b24
GN
668void
669fletcher_4_init(void)
670{
671 static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
672 fletcher_4_ops_t *curr_impl;
673 char *databuf;
674 int i, c;
675
676 /* move supported impl into fletcher_4_supp_impls */
677 for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
678 curr_impl = (fletcher_4_ops_t *) fletcher_4_impls[i];
679
680 if (curr_impl->valid && curr_impl->valid())
681 fletcher_4_supp_impls[c++] = curr_impl;
34dc7c2f 682 }
fc897b24
GN
683 membar_producer(); /* complete fletcher_4_supp_impls[] init */
684 fletcher_4_supp_impls_cnt = c; /* number of supported impl */
34dc7c2f 685
fc897b24
GN
686#if !defined(_KERNEL)
687 /* Skip benchmarking and use last implementation as fastest */
688 memcpy(&fletcher_4_fastest_impl,
689 fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1],
690 sizeof (fletcher_4_fastest_impl));
691 fletcher_4_fastest_impl.name = "fastest";
692 membar_producer();
1eeb4562 693
fc897b24 694 fletcher_4_initialized = B_TRUE;
fc897b24
GN
695 return;
696#endif
697 /* Benchmark all supported implementations */
698 databuf = vmem_alloc(data_size, KM_SLEEP);
699 for (i = 0; i < data_size / sizeof (uint64_t); i++)
700 ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
701
702 fletcher_4_benchmark_impl(B_FALSE, databuf, data_size);
703 fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
704
705 vmem_free(databuf, data_size);
706
707 /* install kstats for all implementations */
708 fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
709 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
1eeb4562 710 if (fletcher_4_kstat != NULL) {
fc897b24
GN
711 fletcher_4_kstat->ks_data = NULL;
712 fletcher_4_kstat->ks_ndata = UINT32_MAX;
713 kstat_set_raw_ops(fletcher_4_kstat,
714 fletcher_4_kstat_headers,
715 fletcher_4_kstat_data,
716 fletcher_4_kstat_addr);
1eeb4562
JX
717 kstat_install(fletcher_4_kstat);
718 }
fc897b24
GN
719
720 /* Finish initialization */
721 fletcher_4_initialized = B_TRUE;
1eeb4562
JX
722}
723
724void
725fletcher_4_fini(void)
726{
1eeb4562
JX
727 if (fletcher_4_kstat != NULL) {
728 kstat_delete(fletcher_4_kstat);
729 fletcher_4_kstat = NULL;
730 }
34dc7c2f 731}
c28b2279
BB
732
733#if defined(_KERNEL) && defined(HAVE_SPL)
9cc1844a 734#include <linux/mod_compat.h>
1eeb4562
JX
735
736static int
9cc1844a 737fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused)
1eeb4562 738{
fc897b24
GN
739 const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
740 char *fmt;
1eeb4562
JX
741 int i, cnt = 0;
742
fc897b24
GN
743 /* list fastest */
744 fmt = (impl == IMPL_FASTEST) ? "[%s] " : "%s ";
745 cnt += sprintf(buffer + cnt, fmt, "fastest");
1eeb4562 746
fc897b24
GN
747 /* list all supported implementations */
748 for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
749 fmt = (i == impl) ? "[%s] " : "%s ";
750 cnt += sprintf(buffer + cnt, fmt,
751 fletcher_4_supp_impls[i]->name);
1eeb4562
JX
752 }
753
754 return (cnt);
755}
756
757static int
9cc1844a 758fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused)
1eeb4562
JX
759{
760 return (fletcher_4_impl_set(val));
761}
762
763/*
764 * Choose a fletcher 4 implementation in ZFS.
fc897b24 765 * Users can choose "cycle" to exercise all implementations, but this is
1eeb4562
JX
766 * for testing purpose therefore it can only be set in user space.
767 */
768module_param_call(zfs_fletcher_4_impl,
769 fletcher_4_param_set, fletcher_4_param_get, NULL, 0644);
fc897b24 770MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation.");
1eeb4562
JX
771
772EXPORT_SYMBOL(fletcher_4_init);
773EXPORT_SYMBOL(fletcher_4_fini);
c28b2279
BB
774EXPORT_SYMBOL(fletcher_2_native);
775EXPORT_SYMBOL(fletcher_2_byteswap);
776EXPORT_SYMBOL(fletcher_4_native);
fc897b24 777EXPORT_SYMBOL(fletcher_4_native_varsize);
c28b2279
BB
778EXPORT_SYMBOL(fletcher_4_byteswap);
779EXPORT_SYMBOL(fletcher_4_incremental_native);
780EXPORT_SYMBOL(fletcher_4_incremental_byteswap);
781#endif